From cecd2e8e2b8c867e3cfc944da66e83e2760cad7a Mon Sep 17 00:00:00 2001
From: Stephen Mildenhall <steve@convexrisk.com>
Date: Thu, 26 Jun 2025 11:54:10 +0100
Subject: [PATCH] rewrote width code for tex and html output - no column head
 balancing

---
 greater_tables/_version.py     |   2 +-
 greater_tables/core.py         | 320 +++++++++++----------------------
 greater_tables/etcher.py       |   8 +-
 greater_tables/fabrications.py |  13 +-
 greater_tables/utilities.py    |   6 +-
 pyproject.toml                 |   2 +-
 6 files changed, 119 insertions(+), 232 deletions(-)

diff --git a/greater_tables/_version.py b/greater_tables/_version.py
index 7e28f64..0d72820 100644
--- a/greater_tables/_version.py
+++ b/greater_tables/_version.py
@@ -1 +1 @@
-__version__ = "5.0.1.dev1+gc3d2dfc.d20250626"
+__version__ = "5.1.0"
diff --git a/greater_tables/core.py b/greater_tables/core.py
index 1d023df..46f50d8 100644
--- a/greater_tables/core.py
+++ b/greater_tables/core.py
@@ -920,23 +920,27 @@ class GT(object):
     def tex_knowledge_df(self):
         """Uber source of information for tex formatting."""
         if self._tex_knowledge_df is None:
-            if (all(self.df_tex.index == self.df_html.index)
-                and all(self.df_tex.columns == self.df_html.columns)
-                and all(self.df_tex == self.df_html)):
-                self._tex_knowledge_df = self.html_knowledge_df
-            else:
-                self._tex_knowledge_df = self.estimate_column_widths_by_mode('tex')
+            # seems this is unlikely to be a good idea!
+            # if (all(self.df_tex.index == self.df_html.index)
+            #     and all(self.df_tex.columns == self.df_html.columns)
+            #     and all(self.df_tex == self.df_html)):
+            #     self._tex_knowledge_df = self.html_knowledge_df
+            # else:
+            self._tex_knowledge_df = self.estimate_column_widths_by_mode('tex')
         return self._tex_knowledge_df
 
     def width_report(self):
         """Return a report summarizing the width information."""
         natural = self.text_knowledge_df.natural_width.sum()
         minimum = self.text_knowledge_df.minimum_width.sum()
-        ht = self.text_knowledge_df.header_tweak.sum()
+        if 'header_tweak' in self.text_knowledge_df:
+            ht = self.text_knowledge_df.header_tweak.sum()
+        else:
+            ht = 0
         text = self.text_knowledge_df.recommended.sum()
         h = self.html_knowledge_df.recommended.sum()
-        tikz = self.tex_knowledge_df['tikz_colw'].sum()
-        tex =  self.tex_knowledge_df['scaled_tabs'].sum()
+        tex =  self.tex_knowledge_df.recommended.sum()
+        tikz = self.tex_knowledge_df.tikz_colw.sum()
         mtw = self.max_table_width_em
         mtiw = self.config.max_table_inch_width
         pts = self.config.table_font_pt_size
@@ -946,8 +950,8 @@ class GT(object):
                         'text header tweak': self.text_knowledge_df.header_tweak,
                         'text recommended': self.text_knowledge_df.recommended,
                         'html recommended': self.html_knowledge_df.recommended,
-                        'tex recommended': self.tex_knowledge_df['scaled_tabs'],
-                        'tikz recommended': self.tex_knowledge_df['tikz_colw'],
+                        'tex recommended': self.tex_knowledge_df.recommended,
+                        'tikz recommended': self.tex_knowledge_df.tikz_colw,
         }).fillna(0)
         ser = pd.Series({
                         'text natural': natural,
@@ -1009,7 +1013,6 @@ class GT(object):
         elif mode == 'html':
             df = self.df_html
             len_function = TextLength.text_display_len
-
         else: #  mode == 'tex':
             df = self.df_tex
             len_function = TextLength.text_display_len
@@ -1039,8 +1042,7 @@ class GT(object):
             )
 
         # begin to assemble the parts
-        # ans will be the col_width_df;  break_penalties needed by all methods
-
+        # ans will be the col_width_df; break_penalties needed by all methods
         ans = pd.DataFrame({
             'alignment': [i[4:] for i in self.df_aligners],
             'break_penalties': self.break_penalties,
@@ -1051,21 +1053,27 @@ class GT(object):
         ans['acceptable_width'] = np.where(
             ans.break_penalties == Breakability.ACCEPTABLE, ans.minimum_width, ans.natural_width)
 
-        # adjustments and recommendations
+        # adjustments and recommendations - these are keyed to text output with padding
         natural, acceptable, minimum = ans.iloc[:, 3:].sum()
         PADDING = 2  # per column TODO enhance
-        if self.config.table_width_mode == 'explicit':
-            # target width INCLUDES padding and column marks |
-            target_width = self.max_table_width_em - \
-                           (PADDING + 1) * n_col - 1
-            logger.info(f'Col padding effect {self.max_table_width_em=}'
-                        f' ==> {target_width=}')
-        elif self.config.table_width_mode == 'natural':
-            target_width = natural + (PADDING + 1) * n_col + 1
-        elif self.config.table_width_mode == 'breakable':
-            target_width = acceptable + (PADDING + 1) * n_col + 1
-        elif self.config.table_width_mode == 'minimum':
-            target_width = minimum + (PADDING + 1) * n_col + 1
+        if mode == 'text':
+            if self.config.table_width_mode == 'explicit':
+                # target width INCLUDES padding and column marks |
+                target_width = self.max_table_width_em - \
+                               (PADDING + 1) * n_col - 1
+                logger.info(f'{self.max_table_width_em=}'
+                            f' ==> {target_width=} after column spacer adjustment')
+            elif self.config.table_width_mode == 'natural':
+                # +1 for the pipe!
+                target_width = natural + (PADDING + 1) * n_col + 1
+            elif self.config.table_width_mode == 'breakable':
+                target_width = acceptable + (PADDING + 1) * n_col + 1
+            elif self.config.table_width_mode == 'minimum':
+                target_width = minimum + (PADDING + 1) * n_col + 1
+        else:
+            # tex and html ignore niceties of padding?? these will be narrower
+            target_width = self.max_table_width_em
+            logger.info(f'{target_width=} ignoring column spacers')
 
         # extra space for the headers to relax, if useful
         if self.config.table_width_header_adjust > 0:
@@ -1074,7 +1082,7 @@ class GT(object):
         else:
             max_extra = 0
 
-        logger.info(f'{mode=} {target_width=}, {natural=}, {acceptable=}, {minimum=}')
+        logger.info(f'{mode=} {target_width=}, {natural=}, {acceptable=}, {minimum=}, {max_extra=}')
 
         if target_width > natural:
             # everything gets its natural width
@@ -1087,212 +1095,94 @@ class GT(object):
             # use up extra on the ACCEPTABLE cols
             space = target_width - acceptable
             logger.info(
-                'Using breaks acceptable (dates not wrapped), spare space = %s', space)
+                'Using "breaks acceptable" (dates not wrapped), spare space = %s', space)
         elif target_width > minimum:
             # strings and dates wrap
             ans['recommended'] = ans['minimum_width']
             # use up extra on dates first, then strings
             space = target_width - minimum
             logger.info(
-                'Breaking all breakable (incl dates), spare space = %s', space)
+                'Using "minimum" (all breakable incl dates), spare space = %s', space)
         else:
             # OK severely too small
             ans['recommended'] = ans['minimum_width']
-            logger.info(
-                'Desired width too small for pleasant formatting, table will be too wide.')
             space = target_width - minimum
+            logger.warning(
+                'Desired width too small for pleasant formatting, table will be too wide by spare space %s < 0.',
+                space)
 
-        # this section adjusts for column headers. text has a
-        # more rigorous adjustment than the other two methds (which
-        # will naturally make better decisions about line breaks in the heading).
-        if mode == "text":
-            input_df = None
-            if space >= 0:
-                # Allocate the excess ------------------------------
-                # Fancy col headings currently only for 1-d index
-                # TODO NOTE: use config.sparsify logic you have for index applied to df.T
-                # to sort the columns!!
-                if df.columns.nlevels == 1:
-                    # Step 1: baseline comes in from code above
-                    ans['raw_recommended'] = ans['recommended']
+        # this section tweaks the widths for column headers -> text output only.
+        # trust tex and html output to naturally make better decisions about line breaks in the heading.
+        if mode == "text" and space > 0:
+            # text mode only: see if some header tweaks are in order (Index only for now, TODO)
+            if df.columns.nlevels == 1:
+                # Step 1: baseline comes in from code above
+                ans['raw_recommended'] = ans['recommended']
 
-                    # Step 2: get rid of intra-line breaks
-                    if max_extra > 0:
-                        adj, input_df = Width.header_adjustment(
-                            df, ans['recommended'], space, max_extra)
-                        # create new col and populate per GPT
-                        ans['header_tweak'] = pd.Series(adj)
-                    else:
-                        ans['header_tweak'] = 0
-                    ans['recommended'] = ans['recommended'] + ans['header_tweak']
-                    ans['natural_w_header'] = ans['recommended']
+                # Step 2: get rid of intra-line breaks
+                if max_extra > 0:
+                    adj = Width.header_adjustment(df, ans['recommended'], space, max_extra)
+                    # create new col and populate per GPT
+                    ans['header_tweak'] = pd.Series(adj)
                 else:
-                    # avoid a failure blow
-                    ans['raw_recommended'] = np.nan
-                    ans['header_tweak'] = np.nan
-                    ans['natural_w_header'] = np.nan
-                # Step 3: distribute remaining slack proportionally
-                remaining = target_width - ans['recommended'].sum()
-                if remaining > 0:
-                    slack = ans['natural_width'] - ans['recommended']
-                    total_slack = slack.clip(lower=0).sum()
-                    if total_slack > 0:
-                        fractions = slack.clip(lower=0) / total_slack
-                        ans['recommended'] += np.floor(fractions *
-                                                       remaining).astype(int)
-                        ans['recommended'] = np.maximum(
-                            ans['recommended'], ans['natural_w_header'])
+                    ans['header_tweak'] = 0
+                ans['recommended'] = ans['recommended'] + ans['header_tweak']
 
-                # Ensure final constraint
-                try:
-                    ans['recommended'] = ans['recommended'].astype(int)
-                except IntCastingNaNError:
-                    print('getting error')
-                    print(ans['recommended'])
-                    ans['recommended'] = pd.to_numeric(
-                        ans['recommended'], errors='coerce').fillna(0).astype(int)
-
-                logger.info("Raw rec: %s\tTweaks: %s\tActual: %s\tTarget: %s\tOver/(U): %s",
-                            ans['raw_recommended'].sum(),
-                            ans['header_tweak'].sum(),
-                            ans['recommended'].sum(),
-                            target_width,
-                            ans['recommended'].sum() - target_width
-                            )
+        # Step 3 (all modes): distribute remaining slack proportionally
+        # obvs remaining == space if mode is not text
+        remaining = target_width - ans['recommended'].sum()
+        ans['pre_space_share_recommended'] = ans['recommended']
+        if remaining > 0:
+            slack = ans['natural_width'] - ans['recommended']
+            total_slack = slack.clip(lower=0).sum()
+            if total_slack > 0:
+                logger.info('total slack to allocate after header adjustments = %s', total_slack)
+                fractions = slack.clip(lower=0) / total_slack
+                ans['recommended'] += np.floor(fractions *
+                                               remaining).astype(int)
             else:
-                # avoid a failure blow
-                ans['raw_recommended'] = np.nan
-                ans['header_tweak'] = np.nan
-                ans['natural_w_header'] = np.nan
-        else:
-            # for html and tex modes: adapts from old estimate_column_widths
-            target_width = self.max_table_width_em
-            nc_index = self.nindex
+                logger.info('no slack to allocate after header adjustments')
 
-            # without tex adjustment
+        # Ensure final constraint
+        # try:
+        #     ans['recommended'] = ans['recommended'].astype(int)
+        # except IntCastingNaNError:
+        #     print('getting error')
+        #     print(ans['recommended'])
+        #     ans['recommended'] = pd.to_numeric(
+        #         ans['recommended'], errors='coerce').fillna(0).astype(int)
+
+        # logger.info("Raw rec: %s\tTweaks: %s\tActual: %s\tTarget: %s\tOver/(U): %s",
+        #             ans['raw_recommended'].sum(),
+        #             ans['header_tweak'].sum(),
+        #             ans['recommended'].sum(),
+        #             target_width,
+        #             ans['recommended'].sum() - target_width
+        #             )
+
+        if mode == 'tex':
+            # tex mode only need tikz raw size for tex code layout
+            nc_index = self.nindex
             tikz_colw = dict.fromkeys(df.columns, 0)
-            # with tex adjustment
-            tex_colw = dict.fromkeys(df.columns, 0)
-            headw = dict.fromkeys(df.columns, 0)
             tikz_headw = dict.fromkeys(df.columns, 0)
-            tabs = []
-            mxmn = {}
             for i, c in enumerate(df.columns):
-                # figure width of the column labels; if index c= str, if MI then c = tuple
-                # cw is the width of the column header/title
-                # tzcw is for tikz - no wrapping and no tex adjustment
-                if type(c) == str:
-                    if i < nc_index:
-                        cw = len_function(c)
-                        tzcw = len(c)
-                    else:
-                        # for data columns look at words rather than whole phrase
-                        cw = max(map(len_function, c.split(' ')))
-                        tzcw = len(c)
-                        # logger.info(f'leng col = {len(c)}, longest word = {cw}')
-                else:
-                    # column name could be float etc. or if multi index a tuple
-                    try:
-                        if isinstance(c, tuple):
-                            # multiindex: join and split into words and take length of each word
-                            words = ' '.join(c).split(' ')
-                            cw = max(
-                                map(lambda x: len_function(str(x)), words))
-                            tzcw = max(map(len, words))
-                        else:
-                            cw = max(map(lambda x: len_function(str(x)), c))
-                            tzcw = max(map(len, c))
-                        # print(f'{c}: {cw=} no error')
-                    except TypeError:
-                        # not a MI, float or something
-                        cw = len_function(str(c))
-                        tzcw = len(str(c))
-                        # print(f'{c}: {cw=} WITH error')
-                headw[c] = cw
-                tikz_headw[c] = tzcw
+                # figure width of the column labels
+                c0 = c # before we mess around with it, for setting dict values
+                if not isinstance(c, tuple):
+                    # make it one: now index and multi index on same footing
+                    c = (c,)
+                # convert to strings
+                c = [str(i) for i in c]
+                tikz_headw[c0] = max(map(len, c))
+
                 # now figure the width of the elements in the column
-                # mxmn is used to determine whether to center the column (if all the same size)
-                if df.dtypes.iloc[i] == object:
-                    # weirdness here were some objects actually contain floats, str evaluates to NaN
-                    # and picks up width zero
-                    try:
-                        lens = df.iloc[:, i].map(
-                            lambda x: len_function(str(x)))
-                        tex_colw[c] = lens.max()
-                        mxmn[c] = (lens.max(), lens.min())
-                        raw_lens = df.iloc[:, i].map(len)
-                        tikz_colw[c] = raw_lens.max()
-                    except Exception as e:
-                        raise
-                        # logger.error(
-                        #     f'{c} error {e} DO SOMETHING ABOUT THIS...if it never occurs dont need the if')
-                        # tikz_colw[c] = df[c].str.len().max()
-                        # mxmn[c] = (df[c].str.len().max(), df[c].str.len().min())
-                else:
-                    lens = df.iloc[:, i].map(lambda x: len_function(str(x)))
-                    tex_colw[c] = lens.max()
-                    mxmn[c] = (lens.max(), lens.min())
-                    raw_lens = df.iloc[:, i].map(len)
-                    tikz_colw[c] = raw_lens.max()
-            # pick up long headers too
+                tikz_colw[c0] = df.iloc[:, i].map(lambda x: len(str(x))).max()
+            # needed tikz width is greater of two
             for c in df.columns:
                 tikz_colw[c] = max(tikz_colw[c], tikz_headw[c])
-            # print(tikz_colw)
-            # now know all column widths...decide what to do
-            # are all the data columns about the same width?
-            data_cols = np.array([tex_colw[k] for k in df.columns[nc_index:]])
-            same_size = (data_cols.std() <= 0.1 * data_cols.mean())
-            # print(f'same size test requires {data_cols.std()} <= {0.1 * data_cols.mean()}')
-            common_size = 0
-            if same_size:
-                common_size = int(data_cols.mean() + data_cols.std())
-                logger.info(f'data cols appear same size = {common_size}')
-                # print(f'data cols appear same size = {common_size}')
-            for i, c in enumerate(df.columns):
-                if i < nc_index or not same_size:
-                    # index columns
-                    tabs.append(int(max(tex_colw[c], headw[c])))
-                else:
-                    # data all seems about the same width
-                    tabs.append(common_size)
-            logger.info(f'Determined tabs spacing: {tabs}')
-            if self.config.equal:
-                # see if config.equal widths makes sense
-                dt = tabs[nc_index:]
-                if max(dt) / sum(dt) < 4 / 3:
-                    tabs = tabs[:nc_index] + [max(dt)] * (len(tabs) - nc_index)
-                    logger.info(f'Taking config.equal width hint: {tabs}')
-                    # print(f'Taking config.equal width hint: {tabs}')
-                else:
-                    logger.info(f'Rejecting config.equal width hint')
-                    # print(f'Rejecting config.equal width hint')
-            # look to rescale, shoot for width of 150 on 100 scale basis
-            data_width = sum(tabs[nc_index:])
-            index_width = sum(tabs[:nc_index])
-            target_width = target_width * self.config.tikz_scale - index_width
-            if data_width and data_width / target_width < 0.9:
-                # don't rescale above 1:1 - don't want too large
-                rescale = min(1 / self.config.tikz_scale, target_width / data_width)
-                scaled_tabs = [w if i < nc_index else
-                               int(w * rescale) for i, w in enumerate(tabs)]
-                logger.info(f'Rescale {rescale} applied; tabs = {tabs}')
-            else:
-                scaled_tabs = tabs
-            # add to the answer
+            # distribute any overage using the measures already done
             ans['tikz_colw'] = tikz_colw
-            ans['tikz_colw'] += 2  # padding \I spacer
-            ans['estimated_tabs'] = tabs
-            ans['estimated_scaled_tabs'] = scaled_tabs
-            if self.tabs is not None:
-                ans['input_tabs'] = self.tabs
-            else:
-                ans['input_tabs'] = -1
-            # this column should be used in place of tabs from estimate_column_widths
-            # in make html and make tikz
-            ans['tabs'] = np.maximum(ans['input_tabs'],
-                                                       ans['estimated_tabs'])
-            ans['scaled_tabs'] = np.maximum(ans['input_tabs'],
-                                                              ans['estimated_scaled_tabs'])
+            ans['tikz_colw'] += 2  # for \I
 
         # in all cases...assemble the answer  with relevant information
         return_columns = [
@@ -1304,13 +1194,9 @@ class GT(object):
             'minimum_width',
             'raw_recommended',
             'header_tweak',
+            'pre_space_share_recommended',
             'recommended',
             'tikz_colw',
-            'estimated_tabs',
-            'estimated_scaled_tabs',
-            'input_tabs',
-            'tabs',
-            'scaled_tabs',
             ]
         ans = ans[[i for i in return_columns if i in ans.columns]]
         # need recommended to be > 0
@@ -1790,7 +1676,7 @@ class GT(object):
 
         # column and tikz display widths
         colw = self.tex_knowledge_df['tikz_colw'].map(lambda x: np.round(x, 3))
-        tabs = self.tex_knowledge_df['scaled_tabs'].map(lambda x: np.round(x, 3))
+        tabs = self.tex_knowledge_df['recommended'].map(lambda x: np.round(x, 3))
         # these are indexed with pre-TeX mangling names
         # colw.index = df.columns
         # tabs.index = df.columns
diff --git a/greater_tables/etcher.py b/greater_tables/etcher.py
index 8b7d937..e16cc23 100644
--- a/greater_tables/etcher.py
+++ b/greater_tables/etcher.py
@@ -96,7 +96,7 @@ class Etcher:
             path = tmp.with_suffix(ext)
             if path.exists():
                 path.unlink()
-        logger.info('...success...format file built', self.format_file.resolve())
+        logger.info('...success...format file built %s', self.format_file.resolve())
 
     def process_tikz(self):
         """Compile TikZ to PDF and convert to SVG."""
@@ -122,7 +122,7 @@ class Etcher:
             str(tex_path)
         ]
         (tex_path.parent / 'make_tikz.bat').write_text(" ".join(tex_cmd), encoding='utf-8')
-        logger.info("Running:", " ".join(tex_cmd))
+        logger.info("Running: %s", " ".join(tex_cmd))
         if self.run_command(tex_cmd):
             raise ValueError('TeX failed to compile, not pdf or svg output.')
             # no tidying up
@@ -134,7 +134,7 @@ class Etcher:
                 str(pdf_path),
                 str(svg_path)
             ]
-            logger.info("Running:", " ".join(svg_cmd))
+            logger.info("Running: %s", " ".join(svg_cmd))
             self.run_command(svg_cmd, raise_on_error=True)
 
             for ext in ('.aux', '.log', '.pdf'):
@@ -151,7 +151,7 @@ class Etcher:
         with Popen(command, cwd=cwd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
             stdout, stderr = p.communicate()
             if stdout:
-                logger.info('Run command output ends\n', stdout.strip()[-250:])
+                logger.info('Run command output ends\n %s', stdout.strip()[-250:])
             if stdout:
                 if stdout.find('no output PDF file produced') > 0:
                     logger.error("ERROR no pdf output\n"*5)
diff --git a/greater_tables/fabrications.py b/greater_tables/fabrications.py
index f1415d9..8089d9a 100644
--- a/greater_tables/fabrications.py
+++ b/greater_tables/fabrications.py
@@ -6,7 +6,7 @@ from collections import deque
 from datetime import datetime, timedelta
 from importlib.resources import files
 from itertools import cycle, chain, count, zip_longest, product, islice
-# import logging
+import logging
 from math import prod
 from pathlib import Path
 from typing import Optional, Union
@@ -18,7 +18,7 @@ import numpy as np
 import pandas as pd
 
 
-# logger = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class Fabricator:
@@ -107,13 +107,16 @@ class Fabricator:
                                if len(lvl) == 1]
             if len(drop_levels) == df.index.nlevels:
                 drop_levels.pop()
-            df = df.droplevel(drop_levels)
+            if len(drop_levels):
+                logger.info('dropping empty index levels %s', drop_levels)
+                df = df.droplevel(drop_levels)
         if isinstance(df.columns, pd.MultiIndex):
             drop_levels = [i for i, lvl in enumerate(df.columns.levels) if len(lvl) == 1]
             if len(drop_levels) == df.columns.nlevels:
                 drop_levels.pop()
-            # df.columns = df.columns.droplevel()
-            df = df.droplevel(drop_levels, axis=1)
+            if len(drop_levels):
+                logger.info('dropping empty column levels %s', drop_levels)
+                df = df.droplevel(drop_levels, axis=1)
         return df
 
     def make(self, rows, data_spec, *, index_levels=1, index_names=None,
diff --git a/greater_tables/utilities.py b/greater_tables/utilities.py
index 82f33c5..99e2aa9 100644
--- a/greater_tables/utilities.py
+++ b/greater_tables/utilities.py
@@ -563,15 +563,13 @@ class Width:
         avoid_intra = input_df.groupby('col').min().extra.sum()
         if avoid_intra >= space:
             # that's all we can do
-            print("NO FURTHER IMPROVEMENTS")
+            logger.warning("Insufficient space to avoid ugly wraps -> NO FURTHER IMPROVEMENTS")
         else:
             # can try for a better solution
             sol = Width.optimal_heading(input_df, space)
             adjustments.update(sol[1])
             logger.info('best solution: %s', sol)
-        # global temp
-        # temp = input_df
-        return adjustments, input_df
+        return adjustments
 
     @staticmethod
     def optimal_heading(input_df: pd.DataFrame, total_es_budget: int) -> tuple[int, dict[str, int]]:
diff --git a/pyproject.toml b/pyproject.toml
index a42960b..648d658 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "greater_tables"
-version = "5.0.1.dev1+gc3d2dfc.d20250626"
+version = "5.1.0"
 description = "Perfect tables from pandas dataframes."
 authors = [{name = "Stephen J Mildehall", email = "mynl@me.com" }]
 readme = {file = "README.md", content-type = "text/markdown"}