diff --git a/docs/versions.rst b/docs/versions.rst index 136b719..2a54d67 100644 --- a/docs/versions.rst +++ b/docs/versions.rst @@ -1,6 +1,22 @@ Versions ========== +5.0.0 +------- +* Introduced ``knowledge_df`` as source of all formatting instructions +* ``df_tex`` and ``df_html`` computed before ``knowledge`` applying TeX and HTML specific transformations which are reflected in the estimated widths +* Introduced gtutilities to pull out text width estimation, cleaning and escaping, etc. +* Delete rich table output format? +* Moved logging setup to separate file, called from cli but optional for use in other situation. GPT recommended approach. + +* test cases + * tex and or html in data, index, columns, escaped/unescaped + * long cols + * text with multiindex columns + +* TODO + * self.padl and r / 12 in make html width adj s/b elsewhere + 4.0.0 ------- * Moved constants out of magic strings and into config diff --git a/greater_tables/__init__.py b/greater_tables/__init__.py index 51d6041..04a00e7 100644 --- a/greater_tables/__init__.py +++ b/greater_tables/__init__.py @@ -1,4 +1,4 @@ -__version__ = '4.0.0' +__version__ = '5.0.0' __project__ = 'greater_tables' __author__ = 'Stephen J Mildenhall' diff --git a/greater_tables/cli.py b/greater_tables/cli.py index a0052a6..34b1bc1 100644 --- a/greater_tables/cli.py +++ b/greater_tables/cli.py @@ -1,6 +1,15 @@ +""" +Command line interface: convert files to HTML, text, svg or pdf. +""" + import click import pandas as pd from pathlib import Path + +from . gtlogging import setup_logging + +setup_logging() # <-- must come before using your package + from .gtconfig import Configurator, write_template from .gtcore import GT diff --git a/greater_tables/gtconfig.py b/greater_tables/gtconfig.py index 7cd9459..29647b9 100644 --- a/greater_tables/gtconfig.py +++ b/greater_tables/gtconfig.py @@ -85,10 +85,9 @@ class Configurator(BaseModel): padding_trbl: Optional[tuple[int, int, int, int]] = Field( None, description="Manual padding in the order (top, right, bottom, left)" ) + tex_to_html: Optional[Callable[[str], str]] = Field( + default=None, description="Function to map non-math TeX to HTML, eg remap \\textbf{...}") - tikz_scale: float = Field( - 1.0, description="Scaling factor applied to LaTeX TikZ tables" - ) font_body: float = Field( 0.9, description="Font size for body text (in em units)" ) @@ -132,6 +131,9 @@ class Configurator(BaseModel): -1, description="Maximum length for stringified objects (e.g. nested DataFrames); -1 = unlimited" ) + header_alignment: Literal["few", "center"] = Field( + default='few', description="NYI!! TOOD Alignment of header cells, few=follow column, center=center." ) + max_table_width: int = Field( 200, description="Maximum table width for markdown/text output mode" ) @@ -153,10 +155,13 @@ class Configurator(BaseModel): ) # tikz specific options + tikz_scale: float = Field( + 1.0, description="Scaling factor applied to LaTeX TikZ tables" + ) tikz_column_sep: float = Field( - 0.5, description="Separation between columns") + 1, description="Separation between columns") tikz_row_sep: float = Field( - 0.125, description="Separation between rows") + 0.25, description="Separation between rows") tikz_container_env: Literal["table", "figure", "sidewaysfigure"] = Field( default="table", description="Type of element: 'table', 'figure', or 'sidewaysfigure'" @@ -175,7 +180,8 @@ class Configurator(BaseModel): '', description="non-line commands put at bottom of table") tikz_latex: Optional[str] = Field( None, description="arguments at top of table \\begin{table}[tikz_latex]") - + tikz_escape_tex: Optional[bool] = Field( + True, description="If true, escape non-math TeX like %, _ and \\. Default no escaping, allowing \\textbf{...} to work.") # meta debug: bool = Field( False, description="Run in debug mode with more reporting, include internal ID in caption and use colored output lines") diff --git a/greater_tables/gtcore.py b/greater_tables/gtcore.py index 8315edf..723c563 100644 --- a/greater_tables/gtcore.py +++ b/greater_tables/gtcore.py @@ -7,15 +7,14 @@ to HTML, plain text, or LaTeX output using a validated configuration model. This is the main entry point for rendering logic. See `gtconfig.py` for configuration schema. """ -from collections import namedtuple from decimal import InvalidOperation from io import StringIO from itertools import groupby import logging +import os from pathlib import Path import re -import sys -from textwrap import wrap +import tempfile from typing import Optional, Union, Literal import warnings import yaml @@ -29,16 +28,15 @@ from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \ is_float_dtype # , is_numeric_dtype from pydantic import ValidationError from rich import box -from rich.table import Table from IPython.display import display, SVG -from . gtenums import Breakability, Alignment -from . gtformats import GT_Format, TableFormat, Line, DataRow +from . gtenums import Breakability from . gtconfig import Configurator from . gthasher import df_short_hash from . gtetcher import Etcher +from . gtutilities import * -# turn this fuck-fest off +# turn off this fuck-fest pd.set_option('future.no_silent_downcasting', True) # pandas complaining about casting columns eg putting object in float column warnings.simplefilter(action='ignore', category=FutureWarning) @@ -46,22 +44,6 @@ warnings.simplefilter(action='ignore', category=FutureWarning) # GPT recommended approach logger = logging.getLogger(__name__) -# Disable log propagation to prevent duplicates -logger.propagate = False -if logger.hasHandlers(): - # Clear existing handlers - logger.handlers.clear() -# SET DEGBUGGER LEVEL -LEVEL = logging.WARNING # DEBUG or INFO, WARNING, ERROR, CRITICAL -logger.setLevel(LEVEL) -handler = logging.StreamHandler(sys.stderr) -handler.setLevel(LEVEL) -formatter = logging.Formatter( - '%(asctime)s | %(levelname)s | %(funcName)-15s | %(message)s') -handler.setFormatter(formatter) -logger.addHandler(handler) -logger.info(f'Logger Setup; {__name__} module recompiled.') - class GT(object): """ @@ -215,17 +197,6 @@ class GT(object): default False. """ - # TeX control sequence display widths (heuristic) - TEX_SIMPLE_GLYPHS = { - 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', - 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', - 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega', 'infty', - 'sum', 'prod', 'int', 'cup', 'cap', 'vee', 'wedge', 'forall', 'exists', - 'neg', 'leq', 'geq', 'neq', 'approx', 'to', 'leftarrow', 'rightarrow' - } - TEX_WIDE = {'frac', 'sqrt', 'sum', 'int', 'prod'} - TEX_SPACING = {'quad', 'qquad', ',', ';', ' ', '!'} - def __init__( self, df, @@ -293,7 +264,7 @@ class GT(object): if df == '': df = pd.DataFrame([]) else: - df, aligners, caption, label = GT.md_to_df(df) + df, aligners, caption, label = MD2DF.md_to_df(df) show_index = False else: raise ValueError( @@ -349,14 +320,14 @@ class GT(object): drop=False, col_level=self.df.columns.nlevels - 1) # want the new index to be ints - that is not default if old was multiindex self.df.index = np.arange(self.df.shape[0], dtype=int) - self.index_change_level = GT.changed_column( + self.index_change_level = Indexing.changed_column( self.df.iloc[:, :self.nindex]) if self.ncolumns > 1: # will be empty rows above the index headers self.index_change_level = pd.Series( [i[-1] for i in self.index_change_level]) - self.column_change_level = GT.changed_level(self.raw_df.columns) + self.column_change_level = Indexing.changed_level(self.raw_df.columns) # determine ratio columns if ratio_cols is not None and not self.df.columns.is_unique: @@ -618,19 +589,33 @@ class GT(object): self._clean_tex = '' self._rich_table = None self._string = '' - self._column_width_df = None + self._df_html_text = "" + self._df_style_text = "" + self._cache = LRUCache(20) + self._text_knowledge_df = None + self._html_knowledge_df = None + self._tex_knowledge_df = None + # finally config.sparsify and then apply formaters # this radically alters the df, so keep a copy for now... self.df_pre_applying_formatters = self.df.copy() self.df = self.apply_formatters(self.df) - # cache for various things... - self._cache = LRUCache(20) # config.sparsify if self.config.sparsify and self.nindex > 1: - self.df = GT.sparsify(self.df, self.df.columns[:self.nindex]) + self.df = Sparsify.sparsify(self.df, self.df.columns[:self.nindex]) # for c in self.df.columns[:self.nindex]: # # config.sparsify returns some other stuff... # self.df[c], _ = GT.config.sparsify(self.df[c]) + # make final tex and html versions + if self.config.tex_to_html is not None: + # NEED TO WORK ON INDEXES TOO + self.df_html = self.df.map(self.config.tex_to_html) + else: + self.df_html = self.df + if self.config.tikz_escape_tex: + self.df_tex = Escaping.escape_df_tex(self.df) + else: + self.df_tex def __repr__(self): """Basic representation.""" @@ -657,6 +642,14 @@ class GT(object): logger.info('CREATED LATEX') return self._clean_tex + def cache_get(self, key): + """Retrieve item from cache.""" + return self._cache.get(key, None) + + def cache_set(self, key, value): + """Add item to cache.""" + self._cache[key] = value + def cols_from_regex(self, regex): """ Return columns matching a regex. @@ -674,14 +667,6 @@ class GT(object): return matching_cols # return [col for col in self.df.columns if isinstance(col, str) and re.search(regex, col)] - def cache_get(self, key): - """Retrieve item from cache.""" - return self._cache.get(key, None) - - def cache_set(self, key, value): - """Add item to cache.""" - self._cache[key] = value - # define the default and easy formatters =================================================== def default_ratio_formatter(self, x): """Ratio formatter.""" @@ -953,119 +938,140 @@ class GT(object): else: raise ValueError(f'unknown mode {mode}') - @staticmethod - def changed_column(bit): - """Return the column that changes with each row.""" - tf = bit.ne(bit.shift()) - tf = tf.loc[tf.any(axis=1)] - return tf.idxmax(axis=1) - - @staticmethod - def changed_level(idx): - """ - Return the level of index that changes with each row. - - Very ingenious GTP code with some SM enhancements. - """ - # otherwise you alter the actual index - idx = idx.copy() - idx.names = [i for i in range(idx.nlevels)] - # Determine at which level the index changes - # Convert MultiIndex to a DataFrame - index_df = idx.to_frame(index=False) - # true / false match last row - tf = index_df.ne(index_df.shift()) - # changes need at least one true - tf = tf.loc[tf.any(axis=1)] - level_changes = tf.idxmax(axis=1) - return level_changes + @property + def text_knowledge_df(self): + """Uber source of information for text formatting.""" + if self._text_knowledge_df is None: + self._text_knowledge_df = self.estimate_column_widths_by_mode('text') + return self._text_knowledge_df @property - def column_width_df(self): + def html_knowledge_df(self): + """Uber source of information for html formatting.""" + if self._html_knowledge_df is None: + self._html_knowledge_df = self.estimate_column_widths_by_mode('html') + return self._html_knowledge_df + + @property + def tex_knowledge_df(self): + """Uber source of information for tex formatting.""" + if self._tex_knowledge_df is None: + self._tex_knowledge_df = self.estimate_column_widths_by_mode('tex') + return self._tex_knowledge_df + + def width_report(self): + """Return a report summarizing the width information.""" + natural = self.text_knowledge_df.natural_width.sum() + minimum = self.text_knowledge_df.minimum_width.sum() + ht = self.text_knowledge_df.header_tweak.sum() + text = self.text_knowledge_df.recommended.sum() + h = self.html_knowledge_df.recommended.sum() + tikz = self.tex_knowledge_df['tikz_colw'].sum() + tex = self.tex_knowledge_df['scaled_tabs'].sum() + mtw = self.config.max_table_width + cols = self.df.shape[1] + df = pd.Series({ + 'text natural': natural, + 'text minimum': minimum, + 'text header tweak': ht, + 'text recommended': text, + 'html recommended': h, + 'tex recommended': tex, + 'tikz recommended': tikz, + 'requested': mtw, + 'width mode' : self.config.table_width_mode, + 'header relax': self.config.table_width_header_adjust, + 'header chars': self.config.table_width_header_relax, + }).to_frame('value') + df.index.name = 'metric' + return df + + def estimate_column_widths_by_mode(self, mode): """ - The single source of truth for all info about column widths. + Return dataframe of width information: three modes for text, html, and tex. + + Mode adjusts which df is used and how widths are estimated + + * text -> self.df and len = str.len + * html -> self.df_html and len = + * tex -> self.df_tex and len = + + Returned dataframe has columns named mode_xxx, where xxx can be + + * natural: max len by col + * minimum width = max length given breaks + * acceptable = allowing for break type by column + + * head_natural, head_min, head_acceptable for the heading + + * raw_recommended + * header_adjustment + * recommended + + pat and iso_date_split regex explanation: + + # re.split(r'(?<=[\s.,:;!?()\[\]{}\-\\/|])\s*', text) + # (?<=...) is a lookbehind to preserve the break character with the left-hand fragment. + # [\s.,:;!?()\[\]{}\-\\/|] matches common punctuation and separators: + # \s = whitespace + # . , : ; ! ? = terminal punctuation + # () [] {} = brackets + # \- = dash + # \\/| = slash, backslash, pipe - Adds `estimate_column_widths` columns to `make_column_width_df`. """ - if self._column_width_df is None: - self._column_width_df = self.make_column_width_df() - tikz_colw, tabs, scaled_tabs = self.estimate_column_widths() - self._column_width_df['tikz_colw'] = tikz_colw - self._column_width_df['tikz_colw'] += 2 # for \I spacer! - self._column_width_df['estimated_tabs'] = tabs - self._column_width_df['estimated_scaled_tabs'] = scaled_tabs - if self.tabs is not None: - self._column_width_df['input_tabs'] = self.tabs - else: - self._column_width_df['input_tabs'] = -1 - # this column should be used in place of tabs from estimate_column_widths - # in make html and make tikz - self._column_width_df['tabs'] = np.maximum(self._column_width_df['input_tabs'], - self._column_width_df['estimated_tabs']) - self._column_width_df['scaled_tabs'] = np.maximum(self._column_width_df['input_tabs'], - self._column_width_df['estimated_scaled_tabs']) + assert mode in ('text', 'html', 'tex'), 'Only html, text and tex modes valid.' + if mode == 'text': + df = self.df + len_function = len + elif mode == 'html': + df = self.df_html + len_function = TextLength.text_display_len - return self._column_width_df + else: # mode == 'tex': + df = self.df_tex + len_function = TextLength.text_display_len - def make_column_width_df(self): - """ - Return dataframe of width information. - - Returned dataframe has columns for - - * natural width, all on one line = max len by col - * min width = max length given breaks - * break type of column - * alignment of column - * index natural width - * index min width - """ - df = self.df n_row, n_col = df.shape # The width if content didn't wrap (single line) # Series=dict colname->max width of cells in column - natural_width = df.map(lambda x: len(x.strip())).max(axis=0).to_dict() + natural_width = df.map(lambda x: len_function(x.strip())).max(axis=0).to_dict() - # re.split(r'(?<=[\s.,:;!?()\[\]{}\-\\/|])\s*', text) - # (?<=...) is a lookbehind to preserve the break character with the left-hand fragment. - # [\s.,:;!?()\[\]{}\-\\/|] matches common punctuation and separators: - # \s = whitespace - # . , : ; ! ? = terminal punctuation - # () [] {} = brackets - # \- = dash - # \\/| = slash, backslash, pipe + # in text mode: figure out where you can break pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--+\s+|\s+' iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})' pat = f'{pat}|{iso_date_split}' # Calculate ideal (no wrap) and minimum possible widths for all columns # The absolute minimum width each column can take (e.g., longest word for text) - min_acceptable_width = {} + minimum_width = {} for col_name in df.columns: - min_acceptable_width[col_name] = ( + minimum_width[col_name] = ( df[col_name].str .split(pat=pat, regex=True, expand=True) .fillna('') - .map(len) + .map(len_function) .max(axis=1) .max() ) - # ans will be the col_width_df + + # begin to assemble the parts + # ans will be the col_width_df; break_penalties needed by all methods + ans = pd.DataFrame({ 'alignment': [i[4:] for i in self.df_aligners], 'break_penalties': self.break_penalties, 'breakability': [x.name for x in self.break_penalties], 'natural_width': natural_width.values(), - 'min_acceptable_width': min_acceptable_width.values(), - }, index=df.columns) - ans['break_acceptable'] = np.where( - ans.break_penalties == Breakability.ACCEPTABLE, ans.min_acceptable_width, ans.natural_width) - # DUH - this is min_acceptable_width - # ans['break_dates'] = np.where(ans.break_penalties==Breakability.DATE, ans.min_acceptable_width, ans.break_acceptable) + 'minimum_width': minimum_width.values(), + }, index=df.columns) + ans['acceptable_width'] = np.where( + ans.break_penalties == Breakability.ACCEPTABLE, ans.minimum_width, ans.natural_width) - natural, acceptable, min_acceptable = ans.iloc[:, 3:].sum() - PADDING = 2 # per column + # adjustments and recommendations + natural, acceptable, minimum = ans.iloc[:, 3:].sum() + PADDING = 2 # per column TODO enhance if self.config.table_width_mode == 'explicit': # target width INCLUDES padding and column marks | target_width = self.config.max_table_width - \ @@ -1077,7 +1083,7 @@ class GT(object): elif self.config.table_width_mode == 'breakable': target_width = acceptable + (PADDING + 1) * n_col + 1 elif self.config.table_width_mode == 'minimum': - target_width = min_acceptable + (PADDING + 1) * n_col + 1 + target_width = minimum + (PADDING + 1) * n_col + 1 # extra space for the headers to relax, if useful if self.config.table_width_header_adjust > 0: @@ -1086,6 +1092,8 @@ class GT(object): else: max_extra = 0 + logger.info(f'{mode=} {target_width=}, {natural=}, {acceptable=}, {minimum=}') + if target_width > natural: # everything gets its natural width ans['recommended'] = ans['natural_width'] @@ -1093,614 +1101,234 @@ class GT(object): logger.info('Space for NATURAL! Spare space = %s', space) elif target_width > acceptable: # strings wrap - ans['recommended'] = ans['break_acceptable'] + ans['recommended'] = ans['acceptable_width'] # use up extra on the ACCEPTABLE cols space = target_width - acceptable logger.info( 'Using breaks acceptable (dates not wrapped), spare space = %s', space) - elif target_width > min_acceptable: + elif target_width > minimum: # strings and dates wrap - ans['recommended'] = ans['min_acceptable_width'] + ans['recommended'] = ans['minimum_width'] # use up extra on dates first, then strings - space = target_width - min_acceptable + space = target_width - minimum logger.info( 'Breaking all breakable (incl dates), spare space = %s', space) else: # OK severely too small - ans['recommended'] = ans['min_acceptable_width'] + ans['recommended'] = ans['minimum_width'] logger.info( 'Desired width too small for pleasant formatting, table will be too wide.') - space = target_width - min_acceptable + space = target_width - minimum - input_df = None - if space >= 0: - # Allocate the excess ------------------------------ - # Fancy col headings currently only for 1-d index - # TODO NOTE: use config.sparsify logic you have for index applied to df.T - # to sort the columns!! - if df.columns.nlevels == 1: - # Step 1: baseline comes in from code above - ans['raw_rec'] = ans['recommended'] + # this section adjusts for column headers. text has a + # more rigorous adjustment than the other two methds (which + # will naturally make better decisions about line breaks in the heading). + if mode == "text": + input_df = None + if space >= 0: + # Allocate the excess ------------------------------ + # Fancy col headings currently only for 1-d index + # TODO NOTE: use config.sparsify logic you have for index applied to df.T + # to sort the columns!! + if df.columns.nlevels == 1: + # Step 1: baseline comes in from code above + ans['raw_recommended'] = ans['recommended'] - # Step 2: get rid of intra-line breaks - if max_extra > 0: - adj, input_df = self.header_adjustment( - df, ans['recommended'], space, max_extra) - # create new col and populate per GPT - ans['header_tweak'] = pd.Series(adj) + # Step 2: get rid of intra-line breaks + if max_extra > 0: + adj, input_df = Width.header_adjustment( + df, ans['recommended'], space, max_extra) + # create new col and populate per GPT + ans['header_tweak'] = pd.Series(adj) + else: + ans['header_tweak'] = 0 + ans['recommended'] = ans['recommended'] + ans['header_tweak'] + ans['natural_w_header'] = ans['recommended'] else: - ans['header_tweak'] = 0 - ans['recommended'] = ans['recommended'] + ans['header_tweak'] - ans['natural_w_header'] = ans['recommended'] - else: - # avoid a failure blow - ans['raw_rec'] = np.nan - ans['header_tweak'] = np.nan - ans['natural_w_header'] = np.nan - # Step 3: distribute remaining slack proportionally - remaining = target_width - ans['recommended'].sum() - if remaining > 0: - slack = ans['natural_width'] - ans['recommended'] - total_slack = slack.clip(lower=0).sum() - if total_slack > 0: - fractions = slack.clip(lower=0) / total_slack - ans['recommended'] += np.floor(fractions * - remaining).astype(int) - ans['recommended'] = np.maximum( - ans['recommended'], ans['natural_w_header']) + # avoid a failure blow + ans['raw_recommended'] = np.nan + ans['header_tweak'] = np.nan + ans['natural_w_header'] = np.nan + # Step 3: distribute remaining slack proportionally + remaining = target_width - ans['recommended'].sum() + if remaining > 0: + slack = ans['natural_width'] - ans['recommended'] + total_slack = slack.clip(lower=0).sum() + if total_slack > 0: + fractions = slack.clip(lower=0) / total_slack + ans['recommended'] += np.floor(fractions * + remaining).astype(int) + ans['recommended'] = np.maximum( + ans['recommended'], ans['natural_w_header']) - # Ensure final constraint - try: - ans['recommended'] = ans['recommended'].astype(int) - except IntCastingNaNError: - print('getting error') - print(ans['recommended']) - ans['recommended'] = pd.to_numeric( - ans['recommended'], errors='coerce').fillna(0).astype(int) + # Ensure final constraint + try: + ans['recommended'] = ans['recommended'].astype(int) + except IntCastingNaNError: + print('getting error') + print(ans['recommended']) + ans['recommended'] = pd.to_numeric( + ans['recommended'], errors='coerce').fillna(0).astype(int) - logger.info("Raw rec: %s\tTweaks: %s\tActual: %s\tTarget: %s\tOver/(U): %s", - ans['raw_rec'].sum(), - ans['header_tweak'].sum(), - ans['recommended'].sum(), - target_width, - ans['recommended'].sum() - target_width - ) - ans = ans[[ - 'alignment', - 'break_penalties', - 'breakability', - 'natural_width', - 'break_acceptable', - 'min_acceptable_width', - 'raw_rec', - 'header_tweak', - 'natural_w_header', - 'recommended', - ]] - # in all cases... - # need recommended to be > 0 - ans['recommended'] = np.maximum(ans['recommended'], 1) - self.cache_set('column_width_df', ans) - # info about the header adjustment - self.cache_set('input_df', input_df) - - return ans - - @staticmethod - def header_adjustment(df, min_widths, space, max_extra): - """ - Fine-adjust heading for optimal config.spacing. - - Return a dict with per-column recommended width adjustments to avoid - intra-word breaks and reduce overall header height. - - Parameters: - df: DataFrame with 1-level string column names - min_widths: dict of column name -> minimal acceptable width - space: amount of space available to be allocated - max_extra: max extra characters to consider allocating per column - - Returns: - dict: column -> additional width to allocate - """ - colnames = list(df.columns) - adjustments = {col: 0 for col in colnames} - num_lines = 0 - - def has_intra_word_break(text: str, width: int) -> bool: - """ - Determine if textwrap.wrap breaks any words in the given text. - - Gemini - GPT code did not work, even after seveal iterations. - This is a nice approach to the problem. - - Args: - text: The input string. - width: The maximum width for wrapping. - - Returns: - True if any word is broken across lines, False otherwise. - """ - nonlocal num_lines - wrapped_lines = wrap(text, width=width) - num_lines = len(wrapped_lines) - original_words = text.split() - - reconstructed_text_from_wrapped = " ".join(wrapped_lines) - reconstructed_words = reconstructed_text_from_wrapped.split() - - # If the number of words differs, it means some words were split. - # This catches cases where a word might be split and then later re-joined - # due to subsequent wrapping logic, leading to a different number of words. - if len(original_words) != len(reconstructed_words): - return True - - # Compare word by word. If any word from the original doesn't exactly match - # a word from the reconstructed list, it implies a split. - for i in range(len(original_words)): - if original_words[i] != reconstructed_words[i]: - return True - - return False - - # First pass: avoid ugly intraword breaks - # make dict of col -> longest word length - min_acceptable = {c: v for c, v in - zip(colnames, map(lambda x: max(len(i) for i in re.split(r'[ \-/]', x)), colnames))} - options = [] - for col in colnames: - if not isinstance(col, str): - continue - base_width = min_widths[col] - if not has_intra_word_break(col, base_width): - options.append([col, 0, num_lines]) - # nothing to be gained, move to next col - continue - extra0 = max(0, min_acceptable[col] - base_width) - if extra0 > max_extra: - # ok, can't flatten word because it is too long - extra0 = 0 - elif extra0 == max_extra: - # go with that - adjustments[col] = max_extra - continue - # see if col can be flattened within max_extra chars, starting - # at extra0, which is enough to avoid intraword breaks - for extra in range(extra0, max_extra + 1): - if not has_intra_word_break(col, base_width + extra): - options.append([col, extra, num_lines]) - if adjustments[col] == 0: - # take first, but compute rest... - adjustments[col] = extra - # temporary diagnostic DEBUG information - comment in prod - # from IPython.display import display - # config.debug = pd.Series([col, min_acceptable[col], base_width, has_intra_word_break(col, base_width), extra0, max_extra, - # wrap(col, base_width), extra], - # index=['col name', 'min acceptable', 'base_width (from data)', 'intra word break', 'extra0', 'max_extra', 'split', 'selected extra']).to_frame('Value') - # display(config.debug) - # make df[col name, amount of extra space for col, resulting number of lines] - # this is needed as input for the optimal heading function (next) - input_df = pd.DataFrame(options, columns=['col', 'extra', 'num_lines']) - # min amount to avoid intra work breaks - avoid_intra = input_df.groupby('col').min().extra.sum() - if avoid_intra >= space: - # that's all we can do - print("NO FURTHER IMPROVEMENTS") + logger.info("Raw rec: %s\tTweaks: %s\tActual: %s\tTarget: %s\tOver/(U): %s", + ans['raw_recommended'].sum(), + ans['header_tweak'].sum(), + ans['recommended'].sum(), + target_width, + ans['recommended'].sum() - target_width + ) else: - # can try for a better solution - sol = GT.optimal_heading(input_df, space) - adjustments.update(sol[1]) - logger.info('best solution: %s', sol) - # global temp - # temp = input_df - return adjustments, input_df + # for html and tex modes: adapts from old estimate_column_widths + target_width = self.config.max_table_width + nc_index = self.nindex - @staticmethod - def optimal_heading(input_df: pd.DataFrame, total_es_budget: int) -> tuple[int, dict[str, int]]: - """ - Optimize extra config.spacing for best heading. - - Finds the best way to allocate extra space to minimize max_lines in heading. - - Gemini solution. - - Args: - input_df: DataFrame with 'col', 'extra', 'num_lines'. - total_es_budget: The total extra space to allocate. - - Returns: - A tuple: (min_max_lines, optimal_extra_allocation_per_column). - - .. _table_layout_optimization: - - Table Layout Optimization - ========================= - - This document describes the algorithm implemented in the :py:func:`find_best_layout` function, which aims to optimize the allocation of a fixed amount of extra space (`ES`) among table columns to minimize the overall table height (i.e., the maximum number of lines used by any single column). - - Problem Statement - ----------------- - - Given a set of table columns, each with a known relationship between allocated "extra space" and the resulting "number of lines" it occupies when wrapped, and a total budget of extra space, the goal is to find an allocation of this extra space to each column such that the maximum number of lines among all columns is minimized. - - For example, a column named "location category (float)" might take 3 lines with 0 extra space, but perhaps only 2 lines with 2 extra space, and 1 line with 5 extra space. The relationship is provided in a Pandas DataFrame with columns `col`, `extra`, and `num_lines`. - - Algorithm: Binary Search on the Answer - ---------------------------------------- - - The problem exhibits a monotonic property: if a table layout can be achieved with a maximum height of `X` lines, it can also be achieved with any maximum height `Y > X` lines (by simply using the same or more `extra` space). This property makes binary search on the *minimum possible maximum lines* an efficient solution. - - The algorithm proceeds as follows: - - 1. **Preprocessing the Input Data:** - The input `pandas.DataFrame` is processed to create a convenient lookup structure. For each unique column, a sorted list of `(extra_space, num_lines)` tuples is created. This allows for quick identification of the minimum `extra` space required for a given `column` to fit within a `target_max_lines`. - - .. code-block:: python - - unique_cols = input_df['col'].unique().tolist() - col_extra_num_lines_options = {} - for col_name in unique_cols: - col_data = input_df[input_df['col'] == col_name].sort_values(by='extra') - col_extra_num_lines_options[col_name] = list(zip(col_data['extra'], col_data['num_lines'])) - - 2. **Defining the Search Space (Bounds for `max_lines`):** - The binary search operates on the possible values for the `optimal_max_lines`. - * **Lower Bound (`L`):** The absolute minimum number of lines observed across all columns and all `extra` space options in the input data. This represents the theoretical minimum height a column could ever achieve. - * **Upper Bound (`R`):** The absolute maximum number of lines observed across all columns and all `extra` space options in the input data. This represents the worst-case height, which is always achievable. - - .. code-block:: python - - all_num_lines = input_df['num_lines'].unique() - if len(all_num_lines) == 0: - return 0, {} # Handle empty DataFrame case - L = all_num_lines.min() - R = all_num_lines.max() - - 3. **The `check(target_max_lines)` Function:** - This is the core helper function for the binary search. Given a `target_max_lines` (a candidate for the overall maximum height), it determines if it's *possible* to achieve this height for *all* columns simultaneously, without exceeding the `total_es_budget`. - - For each column: - * It iterates through its `(extra_space, num_lines)` options (which are sorted by `extra_space`). - * It finds the *smallest* `extra_space` value for which the corresponding `num_lines` is less than or config.equal to `target_max_lines`. - * If no such `extra_space` is found for a column (meaning even with the maximum available `extra` for that column, it still exceeds `target_max_lines`), then `target_max_lines` is not achievable, and the function returns `False`. - * Otherwise, it sums up these minimum required `extra_space` values across all columns. - * If the total `extra_space` required is less than or config.equal to `total_es_budget`, the function returns `True` (meaning `target_max_lines` is achievable). Otherwise, it returns `False`. - - .. code-block:: python - - def check(target_max_lines: int) -> bool: - current_extra_needed = 0 - for col_name in unique_cols: - min_extra_for_col = float('inf') - found_suitable_extra = False - for extra_val, num_lines_val in col_extra_num_lines_options[col_name]: - if num_lines_val <= target_max_lines: - min_extra_for_col = extra_val - found_suitable_extra = True - break # Found the minimum extra for this column - - if not found_suitable_extra: - return False # This target_max_lines is too low for this column - - current_extra_needed += min_extra_for_col - - return current_extra_needed <= total_es_budget - - 4. **Binary Search Loop:** - The main binary search loop iteratively narrows down the range `[L, R]`. - * In each iteration, it calculates the `mid_max_lines = L + (R - L) // 2`. - * It then calls the `check(mid_max_lines)` function. - * If `check(mid_max_lines)` returns `True` (meaning `mid_max_lines` is achievable): - * `mid_max_lines` becomes a candidate for the `optimal_max_lines`. We record the current allocation that achieved it. - * We try to achieve an even smaller `max_lines` by setting `R = mid_max_lines - 1`. - * If `check(mid_max_lines)` returns `False` (meaning `mid_max_lines` is not achievable): - * We need to allow for more lines, so we set `L = mid_max_lines + 1`. - - The loop continues until `L > R`, at which point `optimal_max_lines` will hold the smallest possible maximum height, and `best_allocation` will store the corresponding `extra_space` allocation for each column. - - .. code-block:: python - - optimal_max_lines = R - best_allocation = {} - - while L <= R: - mid_max_lines = L + (R - L) // 2 - - # Recalculate allocation within the loop to store the specific 'extra' values - temp_current_extra_needed = 0 - temp_current_allocation = {} - possible = True - for col_name in unique_cols: - min_extra_for_col = float('inf') - found_suitable_extra = False - for extra_val, num_lines_val in col_extra_num_lines_options[col_name]: - if num_lines_val <= mid_max_lines: - min_extra_for_col = extra_val - found_suitable_extra = True - break - - if not found_suitable_extra: - possible = False - break - - temp_current_extra_needed += min_extra_for_col - temp_current_allocation[col_name] = min_extra_for_col - - if possible and temp_current_extra_needed <= total_es_budget: - optimal_max_lines = mid_max_lines - best_allocation = temp_current_allocation.copy() - R = mid_max_lines - 1 + # without tex adjustment + tikz_colw = dict.fromkeys(df.columns, 0) + # with tex adjustment + tex_colw = dict.fromkeys(df.columns, 0) + headw = dict.fromkeys(df.columns, 0) + tikz_headw = dict.fromkeys(df.columns, 0) + tabs = [] + mxmn = {} + for i, c in enumerate(df.columns): + # figure width of the column labels; if index c= str, if MI then c = tuple + # cw is the width of the column header/title + # tzcw is for tikz - no wrapping and no tex adjustment + if type(c) == str: + if i < nc_index: + cw = len_function(c) + tzcw = len(c) else: - L = mid_max_lines + 1 - - The function returns the `optimal_max_lines` and the `best_allocation` dictionary, mapping each column name to the minimal `extra_space` it needs to achieve that optimal height. - - Why this approach is effective: - --------------------------------- - - * **Optimal Solution:** The binary search guarantees finding the absolute minimum possible `max_lines` because it systematically explores the entire solution space. - * **Efficiency:** The `check` function runs in time proportional to the number of columns times the average number of `extra` options per column. The binary search itself performs `log(range_of_num_lines)` iterations. This makes the overall complexity efficient for typical table sizes. - * **Flexibility:** It does not assume any particular mathematical function relating `extra` space to `num_lines`. It works with arbitrary discrete relationships provided in the input DataFrame, as long as `num_lines` is non-increasing as `extra` increases (which is the natural expectation for this problem). - - """ - # Pre-processing - unique_cols = input_df['col'].unique().tolist() - - col_extra_num_lines_options = {} - for col_name in unique_cols: - col_data = input_df[input_df['col'] == - col_name].sort_values(by='extra') - col_extra_num_lines_options[col_name] = list( - zip(col_data['extra'], col_data['num_lines'])) - - def check(target_max_lines: int) -> bool: - current_extra_needed = 0 - for col_name in unique_cols: - min_extra_for_col = float('inf') - found_suitable_extra = False - for extra_val, num_lines_val in col_extra_num_lines_options[col_name]: - if num_lines_val <= target_max_lines: - min_extra_for_col = extra_val - found_suitable_extra = True - break - - if not found_suitable_extra: - return False - - current_extra_needed += min_extra_for_col - - return current_extra_needed <= total_es_budget - - all_num_lines = input_df['num_lines'].unique() - - # Corrected line: Check length of the numpy array - if len(all_num_lines) == 0: - return 0, {} - - L = all_num_lines.min() - R = all_num_lines.max() - - optimal_max_lines = R - best_allocation = {} - - while L <= R: - mid_max_lines = L + (R - L) // 2 - - temp_current_extra_needed = 0 - temp_current_allocation = {} - possible = True - for col_name in unique_cols: - min_extra_for_col = float('inf') - found_suitable_extra = False - for extra_val, num_lines_val in col_extra_num_lines_options[col_name]: - if num_lines_val <= mid_max_lines: - min_extra_for_col = extra_val - found_suitable_extra = True - break - - if not found_suitable_extra: - possible = False - break - - temp_current_extra_needed += min_extra_for_col - temp_current_allocation[col_name] = min_extra_for_col - - if possible and temp_current_extra_needed <= total_es_budget: - optimal_max_lines = mid_max_lines - best_allocation = temp_current_allocation.copy() - R = mid_max_lines - 1 - else: - L = mid_max_lines + 1 - - return optimal_max_lines, best_allocation - - def estimate_column_widths(self): - """ - Estimate sensible column widths for the dataframe in character units. - - Used by HTML and TeX output. returns tikz_colw used by TeX output to print - the tikz (no impact on output, just makes the produced TeX align nicely), - tabs and scaled_tabs (reflecting scale). These three columns are added - to the column_width_df. - - Internal variables: - mxmn affects alignment: are all columns the same width? - - TODO: de-TeX-ification will mess up how the tex table is printed... - but one rarely looks at that. - - :param df: - :param nc_index: number of columns in the index...these are not counted as "data columns" - :param config.equal: if True, try to make all data columns the same width (hint can be rejected) - :return: - tikz_colw affects how the tex is printed to ensure it "looks neat" (actual width of data elements) - tabs affects the actual output - """ - # local variables (conversion from global method) - df = self.df - target_width = self.config.max_table_width - nc_index = self.nindex - scale = self.config.tikz_scale - equal = self.config.equal - - # tabs from _tabs, an estimate column widths, determines the size of the table columns as displayed - # print(f'{nc_index=}, {scale=}, {config.equal=}') - # without tex adjustment - tikz_colw = dict.fromkeys(df.columns, 0) - # with tex adjustment - tex_colw = dict.fromkeys(df.columns, 0) - headw = dict.fromkeys(df.columns, 0) - tikz_headw = dict.fromkeys(df.columns, 0) - tabs = [] - scaled_tabs = [] - mxmn = {} - if df.empty: - return tikz_colw, tabs, scaled_tabs - nl = nc_index - for i, c in enumerate(df.columns): - # figure width of the column labels; if index c= str, if MI then c = tuple - # cw is the width of the column header/title - # tzcw is for tikz - no wrapping and no tex adjustment - if type(c) == str: - if i < nl: - cw = GT.text_display_len(c) - tzcw = len(c) + # for data columns look at words rather than whole phrase + cw = max(map(len_function, c.split(' '))) + tzcw = len(c) + # logger.info(f'leng col = {len(c)}, longest word = {cw}') else: - # for data columns look at words rather than whole phrase - cw = max(map(GT.text_display_len, c.split(' '))) - tzcw = len(c) - # logger.info(f'leng col = {len(c)}, longest word = {cw}') - else: - # column name could be float etc. or if multi index a tuple - try: - if isinstance(c, tuple): - # multiindex: join and split into words and take length of each word - words = ' '.join(c).split(' ') - cw = max( - map(lambda x: GT.text_display_len(str(x)), words)) - tzcw = max(map(len, words)) - else: - cw = max(map(lambda x: GT.text_display_len(str(x)), c)) - tzcw = max(map(len, c)) - # print(f'{c}: {cw=} no error') - except TypeError: - # not a MI, float or something - cw = GT.text_display_len(str(c)) - tzcw = len(str(c)) - # print(f'{c}: {cw=} WITH error') - headw[c] = cw - tikz_headw[c] = tzcw - # now figure the width of the elements in the column - # mxmn is used to determine whether to center the column (if all the same size) - if df.dtypes.iloc[i] == object: - # weirdness here were some objects actually contain floats, str evaluates to NaN - # and picks up width zero - try: - lens = df.iloc[:, i].map( - lambda x: GT.text_display_len(str(x))) + # column name could be float etc. or if multi index a tuple + try: + if isinstance(c, tuple): + # multiindex: join and split into words and take length of each word + words = ' '.join(c).split(' ') + cw = max( + map(lambda x: len_function(str(x)), words)) + tzcw = max(map(len, words)) + else: + cw = max(map(lambda x: len_function(str(x)), c)) + tzcw = max(map(len, c)) + # print(f'{c}: {cw=} no error') + except TypeError: + # not a MI, float or something + cw = len_function(str(c)) + tzcw = len(str(c)) + # print(f'{c}: {cw=} WITH error') + headw[c] = cw + tikz_headw[c] = tzcw + # now figure the width of the elements in the column + # mxmn is used to determine whether to center the column (if all the same size) + if df.dtypes.iloc[i] == object: + # weirdness here were some objects actually contain floats, str evaluates to NaN + # and picks up width zero + try: + lens = df.iloc[:, i].map( + lambda x: len_function(str(x))) + tex_colw[c] = lens.max() + mxmn[c] = (lens.max(), lens.min()) + raw_lens = df.iloc[:, i].map(len) + tikz_colw[c] = raw_lens.max() + except Exception as e: + raise + # logger.error( + # f'{c} error {e} DO SOMETHING ABOUT THIS...if it never occurs dont need the if') + # tikz_colw[c] = df[c].str.len().max() + # mxmn[c] = (df[c].str.len().max(), df[c].str.len().min()) + else: + lens = df.iloc[:, i].map(lambda x: len_function(str(x))) tex_colw[c] = lens.max() mxmn[c] = (lens.max(), lens.min()) raw_lens = df.iloc[:, i].map(len) tikz_colw[c] = raw_lens.max() - except Exception as e: - raise - # logger.error( - # f'{c} error {e} DO SOMETHING ABOUT THIS...if it never occurs dont need the if') - # tikz_colw[c] = df[c].str.len().max() - # mxmn[c] = (df[c].str.len().max(), df[c].str.len().min()) - else: - lens = df.iloc[:, i].map(lambda x: GT.text_display_len(str(x))) - tex_colw[c] = lens.max() - mxmn[c] = (lens.max(), lens.min()) - raw_lens = df.iloc[:, i].map(len) - tikz_colw[c] = raw_lens.max() - # pick up long headers too - for c in df.columns: - tikz_colw[c] = max(tikz_colw[c], tikz_headw[c]) - # print(tikz_colw) - # now know all column widths...decide what to do - # are all the data columns about the same width? - data_cols = np.array([tex_colw[k] for k in df.columns[nl:]]) - same_size = (data_cols.std() <= 0.1 * data_cols.mean()) - # print(f'same size test requires {data_cols.std()} <= {0.1 * data_cols.mean()}') - common_size = 0 - if same_size: - common_size = int(data_cols.mean() + data_cols.std()) - logger.info(f'data cols appear same size = {common_size}') - # print(f'data cols appear same size = {common_size}') - for i, c in enumerate(df.columns): - if i < nl or not same_size: - # index columns - tabs.append(int(max(tex_colw[c], headw[c]))) - else: - # data all seems about the same width - tabs.append(common_size) - logger.info(f'Determined tab config.spacing: {tabs}') - if equal: - # see if config.equal widths makes sense - dt = tabs[nl:] - if max(dt) / sum(dt) < 4 / 3: - tabs = tabs[:nl] + [max(dt)] * (len(tabs) - nl) - logger.info(f'Taking config.equal width hint: {tabs}') - # print(f'Taking config.equal width hint: {tabs}') - else: - logger.info(f'Rejecting config.equal width hint') - # print(f'Rejecting config.equal width hint') - # look to rescale, shoot for width of 150 on 100 scale basis - data_width = sum(tabs[nl:]) - index_width = sum(tabs[:nl]) - target_width = target_width * scale - index_width - if data_width and data_width / target_width < 0.9: - # don't rescale above 1:1 - don't want too large - rescale = min(1 / scale, target_width / data_width) - scaled_tabs = [w if i < nl else - int(w * rescale) for i, w in enumerate(tabs)] - logger.info(f'Rescale {rescale} applied; tabs = {tabs}') - else: - scaled_tabs = tabs - # print(f'Rescale {rescale} applied; tabs = {tabs}') - # print(f'{tikz_colw.values()=}\n{tabs=}') - return tikz_colw, tabs, scaled_tabs - - @staticmethod - def text_display_len(s: str) -> int: - """Estimate text display length of a string allowing for TeX constructs.""" - # note you DO WANT SPACES! So, no strip applied ever. - if s.find('$') < 0: - return len(s) - parts = re.split(r'(\$\$.*?\$\$)|(\$.*?\$)', s) - total = 0 - for part in parts: - if part is None: - continue - if part.startswith('$$') and part.endswith('$$'): - total += GT._estimate_math_width(part[2:-2]) - elif part.startswith('$') and part.endswith('$'): - total += GT._estimate_math_width(part[1:-1]) - else: - total += len(part) - return total - - @staticmethod - def _estimate_math_width(tex: str) -> int: - tokens = re.findall(r'\\[a-zA-Z]+|[a-zA-Z0-9]|.', tex) - width = 0 - for tok in tokens: - if tok.startswith('\\'): - name = tok[1:] - if name in GT.TEX_SIMPLE_GLYPHS: - width += 1 - elif name in GT.TEX_WIDE: - width += 3 - elif name in GT.TEX_SPACING: - width += 1 + # pick up long headers too + for c in df.columns: + tikz_colw[c] = max(tikz_colw[c], tikz_headw[c]) + # print(tikz_colw) + # now know all column widths...decide what to do + # are all the data columns about the same width? + data_cols = np.array([tex_colw[k] for k in df.columns[nc_index:]]) + same_size = (data_cols.std() <= 0.1 * data_cols.mean()) + # print(f'same size test requires {data_cols.std()} <= {0.1 * data_cols.mean()}') + common_size = 0 + if same_size: + common_size = int(data_cols.mean() + data_cols.std()) + logger.info(f'data cols appear same size = {common_size}') + # print(f'data cols appear same size = {common_size}') + for i, c in enumerate(df.columns): + if i < nc_index or not same_size: + # index columns + tabs.append(int(max(tex_colw[c], headw[c]))) else: - width += 2 # unknown control sequences - elif tok in '{}^_': - continue # grouping, sub/superscripts: ignore + # data all seems about the same width + tabs.append(common_size) + logger.info(f'Determined tabs spacing: {tabs}') + if self.config.equal: + # see if config.equal widths makes sense + dt = tabs[nc_index:] + if max(dt) / sum(dt) < 4 / 3: + tabs = tabs[:nc_index] + [max(dt)] * (len(tabs) - nc_index) + logger.info(f'Taking config.equal width hint: {tabs}') + # print(f'Taking config.equal width hint: {tabs}') + else: + logger.info(f'Rejecting config.equal width hint') + # print(f'Rejecting config.equal width hint') + # look to rescale, shoot for width of 150 on 100 scale basis + data_width = sum(tabs[nc_index:]) + index_width = sum(tabs[:nc_index]) + target_width = target_width * self.config.tikz_scale - index_width + if data_width and data_width / target_width < 0.9: + # don't rescale above 1:1 - don't want too large + rescale = min(1 / self.config.tikz_scale, target_width / data_width) + scaled_tabs = [w if i < nc_index else + int(w * rescale) for i, w in enumerate(tabs)] + logger.info(f'Rescale {rescale} applied; tabs = {tabs}') else: - width += 1 - return width + scaled_tabs = tabs + # add to the answer + ans['tikz_colw'] = tikz_colw + ans['tikz_colw'] += 2 # padding \I spacer + ans['estimated_tabs'] = tabs + ans['estimated_scaled_tabs'] = scaled_tabs + if self.tabs is not None: + ans['input_tabs'] = self.tabs + else: + ans['input_tabs'] = -1 + # this column should be used in place of tabs from estimate_column_widths + # in make html and make tikz + ans['tabs'] = np.maximum(ans['input_tabs'], + ans['estimated_tabs']) + ans['scaled_tabs'] = np.maximum(ans['input_tabs'], + ans['estimated_scaled_tabs']) + + # in all cases...assemble the answer with relevant information + return_columns = [ + 'alignment', + 'break_penalties', + 'breakability', + 'natural_width', + 'acceptable_width', + 'minimum_width', + 'raw_recommended', + 'header_tweak', + 'recommended', + 'tikz_colw', + 'estimated_tabs', + 'estimated_scaled_tabs', + 'input_tabs', + 'tabs', + 'scaled_tabs', + ] + ans = ans[[i for i in return_columns if i in ans.columns]] + # need recommended to be > 0 + ans['recommended'] = np.maximum(ans['recommended'], 1) + return ans def make_style(self, tabs): """Write out custom CSS for the table.""" @@ -1837,8 +1465,8 @@ class GT(object): font-weight: bold; }} '''] - for i, w in enumerate(tabs): - style.append(f' #{self.df_id} .grt-c-{i} {{ width: {w}em; }}') + # for i, w in enumerate(tabs): + # style.append(f' #{self.df_id} .grt-c-{i} {{ width: {w}em; }}') style.append('') logger.info('CREATED CSS') return '\n'.join(style) @@ -1860,30 +1488,19 @@ class GT(object): html.append(f'