""" Core rendering logic for GreaterTables. Defines the `GreaterTables` class, which formats and renders pandas DataFrames to HTML, plain text, or LaTeX output using a validated configuration model. This is the main entry point for rendering logic. See `gtconfig.py` for configuration schema. """ from collections import namedtuple from decimal import InvalidOperation from io import StringIO from itertools import groupby import logging import os from pathlib import Path import re import tempfile from typing import Optional, Union, Literal import warnings import yaml from bs4 import BeautifulSoup from cachetools import LRUCache import numpy as np import pandas as pd from pandas.errors import IntCastingNaNError from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \ is_float_dtype, is_extension_array_dtype from pydantic import ValidationError from rich import box from IPython.display import display, SVG from . enums import Breakability from . config import Configurator from . hasher import df_short_hash from . etcher import Etcher from . utilities import * # turn off this fuck-fest pd.set_option('future.no_silent_downcasting', True) # pandas complaining about casting columns eg putting object in float column warnings.simplefilter(action='ignore', category=FutureWarning) # GPT recommended approach logger = logging.getLogger(__name__) class GT(object): """ Create a greater_tables formatting object. Provides html and latex output in quarto/Jupyter accessible manner. Wraps AND COPIES the dataframe df. WILL NOT REFLECT CHANGES TO DF. Recommended usage is to subclass GT (or use functools.partial) and set defaults suitable to your particular application. In that way you can maintain a "house-style" Process -------- **Input transformation** * ``namedtuple`` converted to ``DataFrame`` * ``pd.Series`` converted to ``DataFrame`` * ``list`` converted to ``DataFrame``, optionally using row 0 as ``config.header_row`` * A string is assumed to be a pipe-separated markdown table which is converted to a ``DataFrame`` setting aligners per the alignment row * All other input types are an error The input ``df`` must have unique column names. It is then copied into ``self.df`` which will be changed and ``self.raw_df`` for reference. The copy is hashed for the table name. **Mangling** * If show_index, the index is reset and kept, so that all columns are on an config.equal footing * The index change levels are computed to determine LaTeX hrules * ratio year, and raw columns converted to a list (can be input as a single string name) * Columns, except raw columns, are cast to floats * Column types by index determined * default formatter function set (wrapping input, if any) * Aligner column input decoded into aligner values (``grt-left,grt-right,grt-center``); index aligners separated * Formatters decoded, strings mapped to lambda functions as f-string formatters, integers as number of decimals * Tab values expanded into an iterable * Dataframe at this point (index reset, cast) saved to ``df_pre_applying_formatters`` * Determine formatters (``df_formatters`` property, a list of column index formatting functions: * Make the default float formatter if entered (callable, string, number; wrapped in try/except) * Determine each column's format type and add function * Run ``apply_formatters`` to apply all format choices to ``df``. This function handles index columns slightly differently, but results in the formatters being applied to each column. * Sparsify if requested and if multiindex * Result is a dataframe with all object column types and values that reflect the formatting choices. """ def __init__( self, df, *, caption='', label='', aligners: dict[str, callable] | None = None, formatters: dict[str, callable] | None = None, tabs: Optional[Union[list[float], float, int]] | None = None, unbreakable=None, ratio_cols=None, year_cols=None, date_cols=None, raw_cols=None, show_index=True, # config: Configurator | None = None, config_path: Path | None = None, **overrides, ): """ Parameters: :param df: target DataFrame or list of lists or markdown table string :param caption: table caption, optional (GT will look for gt_caption attribute of df and use that) :param label: TeX label (used in \\label{} command). For markdown tables with #tbl:... in the caption it is extracted automatically. :param aligners: None or dict (type or colname) -> left | center | right :param formatters: None or dict (type or colname) or callable -> format function for the column; formatters trump ratio_cols; if callable passed, it is applied to **all columns**. :param tabs: None or list of column widths in characters or a common int or float width. (It is converted into em; one character is about 0.5em on average; digits are exactly 0.5em.) If None, will be calculated. Default None. :param unbreakable: None or list of columns to be considered unbreakable :param ratio_cols: None, or "all" or list of column names treated as ratios. Set defaults in derived class suitable to application. :param year_cols: None, or "all" or list of column names treated as years (no commas, no decimals). Set defaults in derived class suitable to application. :param date_cols: None, or "all" or list of column names treated as dates. Set defaults in derived class suitable to application. :param raw_cols: None, or "all" or list of column names that are NOT cast to floats. Set defaults in derived class suitable to application. :param show_index: if True, show the index columns, default True Available keyword *overrides: :param config.default_integer_str: format f-string for integers, default value '{x:,d}' :param config.default_float_str: format f-string for floats, default value '{x:,.3f}' :param config.default_date_str: format f-string for dates, default '%Y-%m-%d'. NOTE: no braces or x! :param config.default_ratio_str: format f-string for ratios, default '{x:.1%}' :param config.table_float_format: None or format string for floats in the table format function, applied to entire table, default None :param config.table_hrule_width: width of the table top, botton and header hrule, default 1 :param config.table_vrule_width: width of the table vrule, separating the index from the body, default 1 :param config.hrule_widths: None or tuple of three ints for hrule widths (for use with multiindexes) :param config.vrule_widths: None or tuple of three ints for vrule widths (for use when columns have multiindexes) :param config.sparsify: if True, config.sparsify the index columns, you almost always want this to be true! :param config.sparsify_columns: if True, config.sparsify the columns, default True, generally a better look, headings centered in colspans :param config.spacing: 'tight', 'medium', 'wide' to quickly set cell padding. Medium is default (2, 10, 2, 10). :param config.padding_trbl: None or tuple of four ints for padding, in order top, right, bottom, left. :param config.tikz_scale: scale factor applied to tikz LaTeX tables. :param config.font_body: font size for body text, default 0.9. Units in em. :param config.font_head: font size for header text, default 1.0. Units in em. :param config.font_caption: font size for caption text, default 1.1. Units in em. :param config.font_bold_index: if True, make the index columns bold, default False. :param config.pef_precision: precision (digits after period) for pandas engineering format, default 3. :param config.pef_lower: apply engineering format to floats with absolute value < 10**config.pef_lower; default -3. :param config.pef_upper: apply engineering format to floats with absolute value > 10**config.pef_upper; default 6. :param config.cast_to_floats: if True, try to cast all non-integer, non-date columns to floats :param config.header_row: True: use first row as headers; False no headings. Default True :param config.equal: if True, set all column widths config.equal. Default False. Maybe ignored, depending on computed ideal column widths. :param config.caption_align: for the caption :param config.large_ok: signal that you are intentionally applying to a large dataframe. Sub-classes may restrict or apply .head() to df. :param config.max_str_length: maximum displayed length of object types, that are cast to strings. Eg if you have nested DataFrames! :param str_table_fmt: table border format used for string output (markdown), default mixed_grid DEPRECATED?? :param config.table_width_mode: 'explicit': set using config.max_table_width_em 'natural': each cell on one line (can be very wide with long strings) 'breakable': wrap breakable cells (text strings) at word boundaries to fit longest word 'minimum': wrap breakable and ok-to-break (dates) cells :param config.table_width_header_adjust: additional proportion of table width used to balance header columns. :param config.table_width_header_relax: extra spaces allowed per column heading to facilitate better column header wrapping. :param config.max_table_width_em: max table width used for markdown string output, default 200; width is never less than minimum width. Padding (3 chars per row plus 1) consumed out of config.max_table_width_em in string output mode. :param config.debug: if True, add id to caption and use colored lines in table, default False. """ if config and config_path: raise ValueError( "Pass either 'config' or 'config_path', not both.") if config: base_config = config elif config_path: config_path = Path(config_path) try: raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) base_config = Configurator.model_validate(raw) except (ValidationError, OSError) as e: raise ValueError( f"Failed to load config from {config_path}") from e else: base_config = Configurator() # access through config # update and validate; need to merge to avoid repeated args # merged = dict(base_config.model_dump(), **overrides) merged = base_config.model_dump() | overrides self.config = Configurator(**merged) # no validation # self.config = base_config.model_copy(update=overrides) # deal with alternative input modes for df: None, DataFrame, Series, markdown text table if df is None: # don't want None to fail df = pd.DataFrame([]) if isinstance(df, pd.DataFrame): # usual use case pass elif isinstance(df, pd.Series): df = df.to_frame() elif isinstance(df, list): df = pd.DataFrame(df) # override this selection come what may show_index = False if self.config.header_row: # Set first row as column names df.columns = df.iloc[0] # Drop first row and reset index df = df[1:].reset_index(drop=True) elif isinstance(df, str): df = df.strip() if df == '': df = pd.DataFrame([]) else: df, aligners, caption, label = MD2DF.md_to_df(df) show_index = False elif GT._is_namedtuple_instance(df): df = GT._ntdf(df) else: raise ValueError( 'df must be a DataFrame, a list of lists, or a markdown table string') if len(df) > self.config.large_warning and not self.config.large_ok: raise ValueError( f'Large dataframe (>{self.config.large_warning} rows) and config.large_ok not set to true. ' 'Set large_ok=True or increase value of large_warning.') if not df.columns.is_unique: raise ValueError('df column names are not unique') # extract value BEFORE copying, copying does not carry these attributes over if caption != '': self.caption = caption else: # used by querex etc. self.caption = getattr(df, 'gt_caption', '') self.label = label self.df = df.copy(deep=True) # the object being formatted self.raw_df = df.copy(deep=True) # if not column_names: # get rid of column names # self.df.columns.names = [None] * self.df.columns.nlevels self.df_id = df_short_hash(self.df) if self.caption != '' and self.config.debug: self.caption += f' (id: {self.df_id})' # self.max_str_length = max_str_length # before messing self.show_index = show_index self.nindex = self.df.index.nlevels if self.show_index else 0 self.ncolumns = self.df.columns.nlevels self.ncols = self.df.shape[1] self.dt = self.df.dtypes # reset index to put all columns on an config.equal footing, but note number ofindex cols with warnings.catch_warnings(): if self.show_index: warnings.simplefilter( "ignore", category=pd.errors.PerformanceWarning) self.df = self.df.reset_index( drop=False, col_level=self.df.columns.nlevels - 1) # want the new index to be ints - that is not default if old was multiindex self.df.index = np.arange(self.df.shape[0], dtype=int) self.index_change_level = Indexing.changed_column( self.df.iloc[:, :self.nindex]) if self.ncolumns > 1: # will be empty rows above the index headers self.index_change_level = pd.Series( [i[-1] for i in self.index_change_level]) self.column_change_level = Indexing.changed_level(self.raw_df.columns) # determine ratio columns if ratio_cols is not None and not self.df.columns.is_unique: logger.warning( 'Ratio cols specified with non-unique column names: ignoring request.') self.ratio_cols = [] else: if ratio_cols is None: self.ratio_cols = [] elif ratio_cols == 'all': self.ratio_cols = [i for i in self.df.columns] elif ratio_cols is not None and not isinstance(ratio_cols, (tuple, list)): self.ratio_cols = self.cols_from_regex( ratio_cols) # [ratio_cols] else: self.ratio_cols = ratio_cols # determine year columns if year_cols is not None and not self.df.columns.is_unique: logger.warning( 'Year cols specified with non-unique column names: ignoring request.') self.year_cols = [] else: if year_cols is None: self.year_cols = [] elif year_cols is not None and not isinstance(year_cols, (tuple, list)): self.year_cols = self.cols_from_regex(year_cols) # [year_cols] else: self.year_cols = year_cols # determine date columns if date_cols is not None and not self.df.columns.is_unique: logger.warning( 'Year cols specified with non-unique column names: ignoring request.') self.date_cols = [] else: if date_cols is None: self.date_cols = [] elif date_cols is not None and not isinstance(date_cols, (tuple, list)): self.date_cols = self.cols_from_regex(date_cols) # [date_cols] else: self.date_cols = date_cols # determine columns NOT to cast to floats if raw_cols is not None and not self.df.columns.is_unique: logger.warning( 'Year cols specified with non-unique column names: ignoring request.') self.raw_cols = [] else: if raw_cols is None: self.raw_cols = [] elif raw_cols is not None and not isinstance(raw_cols, (tuple, list)): self.raw_cols = self.cols_from_regex(raw_cols) # [raw_cols] else: self.raw_cols = raw_cols # figure the default formatter (used in conjunction with raw columns) if self.config.default_formatter is None: self.default_formatter = self._default_formatter else: assert callable( self.config.default_formatter), 'config.default_formatter must be callable' def wrapped_default_formatter(x): try: return self.config.default_formatter(x) except ValueError: return str(x) self.default_formatter = wrapped_default_formatter # cast as much as possible to floats with warnings.catch_warnings(): warnings.simplefilter( "ignore", category=pd.errors.PerformanceWarning) if self.config.cast_to_floats: for i, c in enumerate(self.df.columns): if c in self.raw_cols or c in self.date_cols: continue old_type = self.df.dtypes[c] if not np.any((is_integer_dtype(self.df.iloc[:, i]), is_datetime64_any_dtype(self.df.iloc[:, i]), # pyarrow classes excluded too is_extension_array_dtype(self.df.iloc[:, i]))): try: self.df.iloc[:, i] = self.df.iloc[: ,i].astype(float) logger.debug( f'coerced column {i}={c} from {old_type} to float success') except (ValueError, TypeError): logger.debug( f'coercing {i}={c} from {old_type} to float FAILED') # massage unbreakable if unbreakable is None: unbreakable = [] elif isinstance(unbreakable, str): unbreakable = [unbreakable] # now can determine types and infer the break penalties (for column sizes) self.float_col_indices = [] self.integer_col_indices = [] self.date_col_indices = [] self.object_col_indices = [] # not actually used, but for neatness self.break_penalties = [] # manage non-unique col names here logger.debug('FIGURING TYPES') for i, cn in enumerate(self.df.columns): # range(self.df.shape[1]): ser = self.df.iloc[:, i] if cn in self.date_cols: logger.debug(f'col {i}/{cn} specified as date col') self.date_col_indices.append(i) self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.DATE) elif is_datetime64_any_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is DATE') self.date_col_indices.append(i) self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.DATE) elif is_integer_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER') self.integer_col_indices.append(i) self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.NEVER) elif is_float_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT') self.float_col_indices.append(i) self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.NEVER) else: logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT') self.object_col_indices.append(i) c = ser.name if c in self.year_cols or c in self.ratio_cols: self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.NEVER) else: self.break_penalties.append( Breakability.NEVER if cn in unbreakable else Breakability.ACCEPTABLE) # figure out column and index alignment if aligners is not None and np.any(self.df.columns.duplicated()): logger.warning( 'aligners specified with non-unique column names: ignoring request.') aligners = None if aligners is None: # not using aligners = [] elif isinstance(aligners, str): # lrc for each column aligners = {c: a for c, a in zip(self.df.columns, aligners)} self.df_aligners = [] lrc = {'l': 'grt-left', 'r': 'grt-right', 'c': 'grt-center'} # TODO: index aligners for i, c in enumerate(self.df.columns): # test aligners BEFORE index! if c in aligners: self.df_aligners.append(lrc.get(aligners[c], 'grt-center')) elif i < self.nindex: # index -> left self.df_aligners.append('grt-left') elif c in self.year_cols: self.df_aligners.append('grt-center') elif c in self.raw_cols: # these are strings self.df_aligners.append('grt-left') elif i in self.date_col_indices: # center dates, why not! self.df_aligners.append('grt-center') elif c in self.ratio_cols or i in self.float_col_indices or i in self.integer_col_indices: # number -> right self.df_aligners.append('grt-right') else: # all else, left self.df_aligners.append('grt-left') self.df_idx_aligners = self.df_aligners[:self.nindex] self.default_formatters = {} if formatters is None: pass elif callable(formatters): # apply to all columns for k in self.df.columns: self.default_formatters[k] = formatters else: for k, v in formatters.items(): if callable(v): self.default_formatters[k] = v elif isinstance(v, str): self.default_formatters[k] = lambda x: v.format(x=x) elif isinstance(v, int): fmt = f'{{x:.{v}f}}' self.default_formatters[k] = lambda x: fmt.format(x=x) else: raise ValueError( 'formatters must be dict of callables or ints or format strings {x:...}') if tabs is None: self.tabs = None elif isinstance(tabs, (int, float)): self.tabs = (tabs,) * (self.nindex + self.ncols) elif isinstance(tabs, (np.ndarray, pd.Series, list, tuple)): if len(tabs) == self.nindex + self.ncols: self.tabs = tabs # Already iterable and right length, self.tabs = as is else: logger.error( f'{self.tabs=} has wrong length. Ignoring.') self.tabs = None else: logger.error( f'{self.tabs=} must be None, a single number, or a list of ' 'numbers of the correct length. Ignoring.') self.tabs = None if self.config.padding_trbl is not None: padding_trbl = self.config.padding_trbl elif self.config.padding_trbl is None: if self.config.spacing == 'tight': padding_trbl = (0, 5, 0, 5) elif self.config.spacing == 'medium': padding_trbl = (2, 10, 2, 10) elif self.config.spacing == 'wide': padding_trbl = (4, 15, 4, 15) else: raise ValueError( 'config.spacing must be tight, medium, or wide or tuple of four ints.') # pydantic will see to it this is OK self.padt, self.padr, self.padb, self.padl = padding_trbl # because of the problem of non-unique indexes use a list and # not a dict to pass the formatters to to_html self.max_table_width_em = self.config.max_table_inch_width * 72 / self.config.table_font_pt_size self._pef = None self._df_formatters = None self.df_style = '' self.df_html = '' self._clean_html = '' self._clean_tex = '' self._rich_table = None self._string = '' self._df_html_text = "" self._df_style_text = "" self._cache = LRUCache(20) self._text_knowledge_df = None self._html_knowledge_df = None self._tex_knowledge_df = None self._knowledge_dfs = None # finally config.sparsify and then apply formaters # this radically alters the df, so keep a copy for now... self.df_pre_applying_formatters = self.df.copy() self.df = self.apply_formatters(self.df) # config.sparsify if self.config.sparsify and self.nindex > 1: self.df = Sparsify.sparsify(self.df, self.df.columns[:self.nindex]) # for c in self.df.columns[:self.nindex]: # # config.sparsify returns some other stuff... # self.df[c], _ = GT.config.sparsify(self.df[c]) # make final tex and html versions if self.config.tex_to_html is not None: # NEED TO WORK ON INDEXES TOO self.df_html = self.df.map(self.config.tex_to_html) else: self.df_html = self.df if self.config.tikz_escape_tex: self.df_tex = Escaping.escape_df_tex(self.df) else: self.df_tex = self.df def __repr__(self): """Basic representation.""" return f"GT(df_id={self.df_id})" def __str__(self): """String representation, for print().""" return self.make_string() def _repr_html_(self): """ Apply format to self.df. ratio cols like in constructor """ return self.html def _repr_latex_(self): """Generate a LaTeX tabular representation.""" # return '' # latex = self.df.to_latex(caption=self.caption, formatters=self._df_formatters) if self._clean_tex == '': self._clean_tex = self.make_tikz() logger.info('CREATED LATEX') return self._clean_tex def cache_get(self, key): """Retrieve item from cache.""" return self._cache.get(key, None) def cache_set(self, key, value): """Add item to cache.""" self._cache[key] = value def cols_from_regex(self, regex): """ Return columns matching a regex. For Index and MultiIndex. Operates on ``self.df`` and includes index (if ``show_index``) and columns of input dataframe. Search applies to any level of the index. Case sensitive. """ pattern = re.compile(regex) matching_cols = [ col for col in self.df.columns if any(pattern.search(str(level)) for level in (col if isinstance(col, tuple) else (col,))) ] return matching_cols # return [col for col in self.df.columns if isinstance(col, str) and re.search(regex, col)] # define the default and easy formatters =================================================== def default_ratio_formatter(self, x): """Ratio formatter.""" try: return self.config.default_ratio_str.format(x=x) except ValueError: return str(x) def default_date_formatter(self, x): """Date formatter that works for strings too.""" if pd.isna(x): return "" try: dt = pd.to_datetime(x, errors='coerce') if pd.isna(dt): return str(x) return dt.strftime(self.config.default_date_str) except Exception: logger.error("date error with %s", x) return str(x) def default_integer_formatter(self, x): """Integer formatter.""" try: return self.config.default_integer_str.format(x=x) except ValueError: return str(x) def default_year_formatter(self, x): """Year formatter.""" try: return f'{int(x):d}' except (TypeError, ValueError): return str(x) def default_raw_formatter(self, x): """Formatter for columns flagged as raw.""" return str(x) @staticmethod def default_float_format(x, neng=3): """ the endless quest for the perfect float formatter... NOT USED AT THE MINUTE. tester:: for x in 1.123123982398324723947 * 10.**np.arange(-23, 23): print(default_float_format(x)) :param x: :return: """ ef = pd.io.formats.format.EngFormatter(neng, True) # noqa try: if x == 0: ans = '0' elif 1e-3 <= abs(x) < 1e6: if abs(x) <= 10: ans = f'{x:.3g}' elif abs(x) < 100: ans = f'{x:,.2f}' elif abs(x) < 1000: ans = f'{x:,.1f}' else: ans = f'{x:,.0f}' else: ans = ef(x) return ans except ValueError as e: logger.debug(f'ValueError {e}') return str(x) except TypeError as e: logger.debug(f'TypeError {e}') return str(x) except AttributeError as e: logger.debug(f'AttributeError {e}') return str(x) def _default_formatter(self, x): """Default universal formatter for other types.""" try: f = float(x) except (TypeError, ValueError): s = str(x) return s if self.config.max_str_length < 0 else s[:self.config.max_str_length] if self.default_float_formatter: return self.default_float_formatter(f) if np.isinf(f) or np.isnan(f): # clearer handling of weird float cases return str(x) if f.is_integer(): return self.config.default_integer_str.format(x=int(f)) else: return self.config.default_float_str.format(x=f) def pef(self, x): """Pandas engineering format.""" if self._pef is None: self._pef = pd.io.formats.format.EngFormatter(accuracy=self.config.pef_precision, use_eng_prefix=True) # noqa return self._pef(x) def make_float_formatter(self, ser): """ Make a float formatter suitable for the Series ser. Obeys these rules: * All elements in the column are formatted consistently * ... TODO flesh out... at some point shd use pef?! """ amean = ser.abs().mean() # mean = ser.mean() amn = ser.abs().min() amx = ser.abs().max() # smallest = ser.abs().min() # sd = ser.sd() # p10, p50, p90 = np.quantile(ser, [0.1, .5, 0.9], method='inverted_cdf') # pl = 10. ** self.config.pef_lower # pu = 10. ** self.config.pef_upper pl, pu = 10. ** self.config.pef_lower, 10. ** self.config.pef_upper if amean < 1: precision = 5 elif amean < 10: precision = 3 elif amean < 20000: precision = 2 else: precision = 0 fmt = f'{{x:,.{precision}f}}' logger.debug(f'{ser.name=}, {amean=}, {fmt=}') if amean < pl or amean > pu or amx / max(1, amn) > pu: # go with eng def ff(x): try: return self.pef(x) except (ValueError, TypeError, InvalidOperation): return str(x) else: def ff(x): try: return fmt.format(x=x) # well and good but results in ugly differences # by entries in a column # if x == int(x) and np.abs(x) < pu: # return f'{x:,.0f}.' # else: # return fmt.format(x=x) except (ValueError, TypeError): return str(x) return ff @ property def df_formatters(self): """ Make and return the list of formatters. Created one per column. Int, date, objects use defaults, but for float cols the formatter is created custom to the details of each column. """ if self._df_formatters is None: # because of non-unique indexes, index by position not name if self.config.table_float_format is not None: if callable(self.config.table_float_format): # wrap in error protections def ff(x): try: return self.config.table_float_format(x=x) except ValueError: return str(x) except Exception as e: logger.error(f'Custom float function raised {e=}') self.default_float_formatter = ff else: if type(self.config.table_float_format) != str: raise ValueError( 'config.table_float_format must be a string or a function') fmt = self.config.table_float_format def ff(x): try: return fmt.format(x=x) except ValueError: return str(x) except Exception as e: logger.error( f'Custom float format string raised {e=}') self.default_float_formatter = ff else: self.default_float_formatter = False self._df_formatters = [] for i, c in enumerate(self.df.columns): # set a default, note here can have # non-unique index so work with position i if c in self.default_formatters: self._df_formatters.append(self.default_formatters[c]) elif c in self.ratio_cols: # print(f'{i} ratio') self._df_formatters.append(self.default_ratio_formatter) elif c in self.year_cols: self._df_formatters.append(self.default_year_formatter) elif c in self.raw_cols: self._df_formatters.append(self.default_raw_formatter) elif i in self.date_col_indices: self._df_formatters.append(self.default_date_formatter) elif i in self.integer_col_indices: # print(f'{i} int') self._df_formatters.append(self.default_integer_formatter) elif i in self.float_col_indices: # trickier approach... self._df_formatters.append( self.default_float_formatter or self.make_float_formatter(self.df.iloc[:, i])) else: # print(f'{i} default') self._df_formatters.append(self.default_formatter) # self._df_formatters is now a list of length config.equal to cols in df if len(self._df_formatters) != self.df.shape[1]: raise ValueError( f'Something wrong: {len(self._df_formatters)=} != {self.df.shape=}') return self._df_formatters @staticmethod def apply_formatters_work(df, formatters): """Apply formatters to a DataFrame.""" try: # very surprising bug: if df is empty new_df will have type float!! new_df = pd.DataFrame({i: map(f, df.iloc[:, i]) for i, f in enumerate(formatters)}, dtype=object) except TypeError: print('NASTY TYPE ERROR') raise new_df.columns = df.columns return new_df def apply_formatters(self, df, mode='adjusted'): """ Replace df (the raw df) with formatted df, including the index. If mode is 'adjusted' operates on columns only, does not touch the index. Otherwise, called from tikz and operating on raw_df """ if mode == 'adjusted': # apply to df where the index has been reset # number of columns = len(self.df_formatters) return GT.apply_formatters_work(df, self.df_formatters) elif mode == 'raw': # work on raw_df where the index has not been reset # because of non-unique indexes, index by position not name # create the df and the index separately data_formatters = self.df_formatters[self.nindex:] new_body = GT.apply_formatters_work(df, data_formatters) if not self.show_index: return new_body # else have to handle the index index_formatters = self.df_formatters[:self.nindex] df_index = df.reset_index( drop=False, col_level=self.df.columns.nlevels - 1).iloc[:, :self.nindex] new_index = GT.apply_formatters_work(df_index, index_formatters) # put them back together new_df = pd.concat([new_index, new_body], axis=1) new_df = new_df.set_index(list(df_index.columns)) new_df.index.names = df.index.names return new_df else: raise ValueError(f'unknown mode {mode}') @property def text_knowledge_df(self): """Uber source of information for text formatting.""" if self._text_knowledge_df is None: self._text_knowledge_df = self.estimate_column_widths_by_mode('text') return self._text_knowledge_df @property def html_knowledge_df(self): """Uber source of information for html formatting.""" if self._html_knowledge_df is None: self._html_knowledge_df = self.estimate_column_widths_by_mode('html') return self._html_knowledge_df @property def tex_knowledge_df(self): """Uber source of information for tex formatting.""" if self._tex_knowledge_df is None: # seems this is unlikely to be a good idea! # if (all(self.df_tex.index == self.df_html.index) # and all(self.df_tex.columns == self.df_html.columns) # and all(self.df_tex == self.df_html)): # self._tex_knowledge_df = self.html_knowledge_df # else: if not self.config.tikz: # just repeat html so you have something self._tex_knowledge_df = self.html_knowledge_df else: self._tex_knowledge_df = self.estimate_column_widths_by_mode('tex') return self._tex_knowledge_df @property def knowledge_dfs(self): if self._knowledge_dfs is None: self._knowledge_dfs = pd.concat((self.text_knowledge_df.T, self.html_knowledge_df.T, self.tex_knowledge_df.T), keys=['text','html', 'tex'], names=['mode', 'measure']) self._knowledge_dfs['Total'] = self._knowledge_dfs.fillna(0.).apply( lambda row: sum(x for x in row if pd.api.types.is_number(x)), axis=1) idx = self._knowledge_dfs.query('Total == 0').index self._knowledge_dfs.loc[idx, 'Total'] = '' self._knowledge_dfs = self._knowledge_dfs.fillna('') return self._knowledge_dfs def width_report(self): """Return a report summarizing the width information.""" natural = self.text_knowledge_df.natural_width.sum() minimum = self.text_knowledge_df.minimum_width.sum() text = self.text_knowledge_df.recommended.sum() h = self.html_knowledge_df.recommended.sum() tex = self.tex_knowledge_df.recommended.sum() tikz = self.tex_knowledge_df.tikz_colw.sum() mtw = self.max_table_width_em mtiw = self.config.max_table_inch_width pts = self.config.table_font_pt_size bit = pd.DataFrame({ 'text natural': self.text_knowledge_df.natural_width, 'text minimum': self.text_knowledge_df.minimum_width, 'text recommended': self.text_knowledge_df.recommended, 'html recommended': self.html_knowledge_df.recommended, 'tex recommended': self.tex_knowledge_df.recommended, 'tikz recommended': self.tex_knowledge_df.tikz_colw, }).fillna(0) ser = pd.Series({ 'text natural': natural, 'text minimum': minimum, 'text recommended': text, 'html recommended': h, 'tex recommended': tex, 'tikz recommended': tikz, }) bit.loc['total', :] = ser print(f"requested width = {mtw} em\n" f"max tbl inch w = {mtiw} inches\n" f"font pts = {pts} pts\n" f"width in em chk = {mtiw * 72 / pts} em\n" f"width mode = {self.config.table_width_mode}\n" f"header relax = {self.config.table_width_header_adjust}\n" f"header chars = {self.config.table_width_header_relax}") return bit def estimate_column_widths_by_mode(self, mode): r""" Return dataframe of width information: three modes for text, html, and tex. Mode adjusts which df is used and how widths are estimated * text -> self.df and len = str.len * html -> self.df_html and len = * tex -> self.df_tex and len = Returned dataframe has columns named mode_xxx, where xxx can be * natural: max len by col * minimum width = max length given breaks * acceptable = allowing for break type by column * head_natural, head_min, head_acceptable for the heading * raw_recommended * header_adjustment * recommended pat and iso_date_split regex explanation: # re.split(r'(?<=[\s.,:;!?()\[\]{}\-\\/|])\s*', text) # (?<=...) is a lookbehind to preserve the break character with the left-hand fragment. # [\s.,:;!?()\[\]{}\-\\/|] matches common punctuation and separators: # \s = whitespace # . , : ; ! ? = terminal punctuation # () [] {} = brackets # \- = dash # \\/| = slash, backslash, pipe """ assert mode in ('text', 'html', 'tex'), 'Only html, text and tex modes valid.' if mode == 'text': df = self.df len_function = len # no bold in text mode bold_adjustment = 1.0 elif mode == 'html': df = self.df_html len_function = TextLength.text_display_len bold_adjustment = 1.1 else: # mode == 'tex': df = self.df_tex len_function = TextLength.text_display_len bold_adjustment = 1.1 n_row, n_col = df.shape # The width if content didn't wrap (single line) # Series=dict colname->max width of cells in column natural_width = df.map(lambda x: len_function(x.strip())).max(axis=0).to_dict() # in text mode: figure out where you can break; pat breaks after punctuation or at - pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--*\s+|\s+' iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})' pat = f'{pat}|{iso_date_split}' # Calculate ideal (no wrap) and minimum possible widths for all columns # The absolute minimum width each column can take (e.g., longest word for text) minimum_width = {} header_natural = {} header_minimum = {} for col_name in df.columns: minimum_width[col_name] = ( df[col_name].str .split(pat=pat, regex=True, expand=True) .fillna('') .map(len_function) .max(axis=1) .max() ) # ensure is a tuple ctuple = col_name if isinstance(col_name, tuple) else (col_name, ) header_natural[col_name] = bold_adjustment * max(map(len_function, ctuple)) header_minimum[col_name] = bold_adjustment * min(len_function(part) for i in ctuple for part in re.split(pat, str(i))) # begin to assemble the parts # ans will be the col_width_df; break_penalties needed by all methods ans = pd.DataFrame({ 'alignment': [i[4:] for i in self.df_aligners], 'break_penalties': self.break_penalties, 'breakability': [x.name for x in self.break_penalties], 'natural_width': natural_width.values(), 'minimum_width': minimum_width.values(), }, index=df.columns) ans['acceptable_width'] = np.where( ans.break_penalties == Breakability.ACCEPTABLE, ans.minimum_width, ans.natural_width) ans['header_natural'] = header_natural ans['header_minimum'] = header_minimum if mode in ('html', 'tex'): # put in some padding TODO KLUDGE ans['natural_width'] += 1 ans['minimum_width'] += 1 ans['header_natural'] += 1 ans['header_minimum'] += 1 # adjustments and recommendations - these are keyed to text output with padding natural, acceptable, minimum = ans.iloc[:, 3:6].sum() head_natural, head_minimum = ans.iloc[:, 6:8].sum() if mode == 'text': # +1 for the pipe | symbol PADDING = 2 # per column TODO enhance pad_adjustment = (PADDING + 1) * n_col - 1 else: PADDING = 1 # per column TODO enhance pad_adjustment = PADDING * n_col if self.config.table_width_mode == 'explicit': # target width INCLUDES padding and column marks | target_width = self.max_table_width_em - pad_adjustment elif self.config.table_width_mode == 'natural': target_width = natural + pad_adjustment elif self.config.table_width_mode == 'breakable': target_width = acceptable + pad_adjustment elif self.config.table_width_mode == 'minimum': target_width = minimum + pad_adjustment logger.info('table_width_mode = %s', self.config.table_width_mode) logger.info('config self.max_table_width_em %s', self.max_table_width_em) logger.info('target width after column spacer adjustment %s', target_width) # extra space for the headers to relax, if useful if self.config.table_width_header_adjust > 0: max_extra = int(self.config.table_width_header_adjust * target_width) else: max_extra = 0 if target_width > natural: # everything gets its natural width ans['recommended'] = ans['natural_width'] space = target_width - natural logger.info('Space for NATURAL! Spare space = %s', space) elif target_width > acceptable: # strings wrap ans['recommended'] = ans['acceptable_width'] # use up extra on the ACCEPTABLE cols space = target_width - acceptable logger.info( 'Using "breaks acceptable" (dates not wrapped), spare space = %s', space) elif target_width > minimum: # strings and dates wrap ans['recommended'] = ans['minimum_width'] # use up extra on dates first, then strings space = target_width - minimum logger.info( 'Using "minimum" (all breakable incl dates), spare space = %s', space) else: # OK severely too small ans['recommended'] = ans['minimum_width'] space = target_width - minimum # hard to shut this up... logger.info( 'Mode %s, desired width too small for pleasant formatting, table will be too wide by spare space %s em < 0.', mode, space) logger.info(f'{mode=} {target_width=}, {natural=}, {acceptable=}, {minimum=}, {max_extra=}, {space=}') # this section tweaks the widths for column headers -> text output only. # trust tex and html output to naturally make better decisions about line breaks in the heading. if mode == "text" and space > 0 and df.columns.nlevels == 1: # text mode only: see if some header tweaks are in order (Index only for now, TODO) # Step 1: baseline comes in from code above ans['raw_recommended'] = ans['recommended'] # Step 2: optimize to get rid of intra-line breaks if max_extra > 0: adj = Width.header_adjustment(df, ans['recommended'], space, max_extra) # create new col and populate per GPT ans['header_tweak'] = pd.Series(adj) else: ans['header_tweak'] = 0 ans['recommended'] = ans['recommended'] + ans['header_tweak'] # in this case zero out impact of header_natural and header_minimum cos don't want to use them below ans['header_natural'] = ans['recommended'] ans['header_minimum'] = ans['recommended'] # Step 3 (all modes): distribute remaining shortfall proportionally # account for # obvs remaining == space if mode is not text remaining = target_width - ans['recommended'].sum() ans['pre_shortfall_recommended'] = ans['recommended'] if remaining > 0: shortfall = ans[['natural_width', 'header_natural']].max(axis=1) - ans['recommended'] total_shortfall = shortfall.clip(lower=0).sum() if total_shortfall > 0: logger.info('total shortfall to allocate after header adjustments = %s', total_shortfall) fractions = shortfall.clip(lower=0) / total_shortfall ans['proto_recommended'] = ans['recommended'] + np.floor(fractions * remaining).astype(int) ans['recommended'] = np.minimum(ans[['natural_width', 'header_natural']].max(axis=1), ans['proto_recommended']) else: logger.info('no shortfall to allocate after header adjustments') if mode == 'tex': # tex mode only need tikz raw size for tex code layout tikz_colw = dict.fromkeys(df.columns, 0) tikz_headw = dict.fromkeys(df.columns, 0) for i, c in enumerate(df.columns): # figure width of the column labels c0 = c # before we mess around with it, for setting dict values if not isinstance(c, tuple): # make it one: now index and multi index on same footing c = (c,) # convert to strings c = [str(i) for i in c] tikz_headw[c0] = max(map(len, c)) # now figure the width of the elements in the column tikz_colw[c0] = df.iloc[:, i].map(lambda x: len(str(x))).max() # needed tikz width is greater of two for c in df.columns: tikz_colw[c] = max(tikz_colw[c], tikz_headw[c]) # distribute any overage using the measures already done ans['tikz_colw'] = tikz_colw ans['tikz_colw'] += 2 # for \I # in all cases...assemble the answer with relevant information return_columns = [ 'alignment', 'break_penalties', 'breakability', 'natural_width', 'acceptable_width', 'minimum_width', 'header_natural', 'header_minimum', 'raw_recommended', 'header_tweak', 'pre_space_share_recommended', 'proto_recommended', 'recommended', 'tikz_colw', ] ans = ans[[i for i in return_columns if i in ans.columns]] # need recommended to be > 0 ans['recommended'] = np.maximum(ans['recommended'], 1) return ans def make_style(self, tabs): """Write out custom CSS for the table.""" if self.config.debug: head_tb = '#0ff' body_b = '#f0f' h0 = '#f00' h1 = '#b00' h2 = '#900' bh0 = '#f00' bh1 = '#b00' v0 = '#0f0' v1 = '#0a0' v2 = '#090' else: head_tb = '#000' body_b = '#000' h0 = '#000' h1 = '#000' h2 = '#000' bh0 = '#000' bh1 = '#000' v0 = '#000' v1 = '#000' v2 = '#000' table_hrule = self.config.table_hrule_width table_vrule = self.config.table_vrule_width # for local use padt, padr, padb, padl = self.padt, self.padr, self.padb, self.padl style = [f''' ') logger.info('CREATED CSS') return '\n'.join(style) def make_html(self): """Convert a pandas DataFrame to an HTML table.""" index_name_to_level = dict( zip(self.raw_df.index.names, range(self.nindex))) index_change_level = self.index_change_level.map(index_name_to_level) # this is easier and computed in the init column_change_level = self.column_change_level # Start table html = [f''] if self.label != "": pass # TODO put in achor tag somehow!! if self.caption != '': html.append(f'') # Process header: allow_duplicates=True means can create cols with the same name bit = self.df_html.T.reset_index(drop=False, allow_duplicates=True) idx_header = bit.iloc[:self.nindex, :self.ncolumns] columns = bit.iloc[self.nindex:, :self.ncolumns] # figure appropriate widths tabs = self.html_knowledge_df['recommended'].map(lambda x: np.round(x, 3)) # set column widths; tabs returns lengths of strings in each column tabs = np.array(tabs) + (self.padl + self.padr) / 12 # this gets stripped out by quarto, so make part of style html.append('') for w in tabs: html.append(f'') html.append('') # TODO Add header aligners # this is TRANSPOSED!! if self.config.sparsify_columns: html.append("") for i in range(self.ncolumns): # one per row of columns m index, usually only 1 html.append("") if self.show_index: for j, r in enumerate(idx_header.iloc[:, i]): # columns one per level of index html.append(f'') # if not for col span issue you could just to this: # for j in range(self.ncols): # hrule = f'grt-bhrule-{i}' if i < self.ncolumns - 1 else '' # if j == 0: # # start with the first column come what may # vrule = f'grt-vrule-index' # elif j >= self.column_change_level[i]: # vrule = f'grt-vrule-{column_change_level[cum_col]}' # else: # vrule = '' # html.append(f'') # here, the groupby needs to consider all levels at and above i # this concats all the levels # need :i+1 to get down to the ith level cum_col = 0 # keep track of where we are up to for j, (nm, g) in enumerate(groupby(columns.iloc[:, :i + 1]. apply(lambda x: ':::'.join(str(i) for i in x), axis=1))): # ::: needs to be something that does not appear in the col names # need to combine for groupby but be able to split off the last level # picks off the name of the bottom level nm = nm.split(':::')[-1] hrule = f'grt-bhrule-{i}' if i < self.ncolumns - 1 else '' colspan = sum(1 for _ in g) if 0 < j: vrule = f'grt-vrule-{column_change_level[cum_col]}' elif j == 0 and self.show_index: # start with the first column if showing index vrule = f'grt-vrule-index' else: vrule = '' if j == 0 and not self.show_index: # first column, no index, left align label html.append( f'') else: html.append( f'') cum_col += colspan html.append("") html.append("") else: html.append("") for i in range(self.ncolumns): # one per row of columns m index, usually only 1 html.append("") if self.show_index: for j, r in enumerate(idx_header.iloc[:, i]): # columns one per level of index html.append(f'') for j, r in enumerate(columns.iloc[:, i]): # one per column of dataframe # figure how high up mindex the vrules go # all headings get hrules, it's the vrules that are tricky hrule = f'grt-bhrule-{i}' if i < self.ncolumns - 1 else '' if 0 < j < self.ncols and i >= column_change_level[j]: vrule = f'grt-vrule-{column_change_level[j]}' elif j == 0 and self.show_index: # start with the first column come what may vrule = f'grt-vrule-index' else: vrule = '' html.append( f'') html.append("") html.append("") bold_idx = 'grt-bold' if self.config.font_bold_index else '' html.append("") for i, (n, r) in enumerate(self.df_html.iterrows()): # one per row of dataframe html.append("") hrule = '' if self.show_index: for j, c in enumerate(r.iloc[:self.nindex]): # dx = data in index # if this is the level that changes for this row # will use a top rule hence omit i = 0 which already has an hrule # appears in the index change level. But if it DOES NOT appear then # it isn't a change level so no rule required if i > 0 and hrule == '' and i in index_change_level and j == index_change_level[i]: hrule = f'grt-hrule-{j}' # html.append(f'') col_id = f'grt-c-{j}' html.append( f'') for j, c in enumerate(r.iloc[self.nindex:]): # first col left handled by index/body divider if 0 < j < self.ncols: vrule = f'grt-vrule-{column_change_level[j]}' elif j == 0 and self.show_index: # start with the first column come what may vrule = f'grt-vrule-index' else: vrule = '' # html.append(f'') col_id = f'grt-c-{j+self.nindex}' html.append( f'') html.append("") html.append("") text = '\n'.join(html) self._df_html_text = Escaping.clean_html_tex(text) logger.info('CREATED HTML') self._df_style_text = self.make_style(tabs) def clean_style(self, soup): """Minify CSS inside

Rendered Table

{table_html} ''' p = Path(fn) p.parent.mkdir(parents=True, exist_ok=True) p = p.with_suffix('.html') print(p) html = html_boiler_plate.replace('{table_html}', self.html) soup = BeautifulSoup(html, 'html.parser') p.write_text(soup.prettify(), encoding='utf-8') logger.info(f'Saved to {p}') def show_svg(self): """Display svg in Jupyter.""" svg = self.make_svg() if svg != 'no svg output': display(SVG(svg)) else: print('No SVG file available (TeX compile error).') def show_html(self, fn=''): if fn == '': with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp: tmp_path = Path(tmp.name) else: tmp_path = Path(fn) self.save_html(fn=tmp_path) os.startfile(tmp_path) # native Windows way to open in default browser return tmp_path @staticmethod def uber_test(df, show_html=False, **kwargs): """ Print various diagnostics and all the formats. show_html -> run show_html to display in new browser tab. """ f = GT(df, **kwargs) display(f) if show_html: f.show_html() print(f) f.show_svg() display(df) display(f.width_report()) print(f.make_tikz()) return f @staticmethod def _is_namedtuple_instance(x) -> bool: """Heuristic: namedtuple instances are tuples whose class defines _fields.""" return isinstance(x, tuple) and isinstance(getattr(type(x), "_fields", None), tuple) @staticmethod def _ntdf(t): """Convert named tuple to pandas dataframe to display.""" return pd.Series(t, index=pd.Index(t._fields, name="Item")).to_frame('Value')
{self.caption}
{r}{nm}{nm}{nm}
{r}{r}
{c}{c}{c}{c}