v 2.0.0alpha => new table widths, text output

2026-06-27 16:15:38 +08:00 · 2025-06-01 10:10:19 +01:00
parent 23061b450d
commit f354ef7b10
2 changed files with 164 additions and 16 deletions
@@ -1,4 +1,4 @@
-__version__ = '1.2.0'
+__version__ = '2.0.0'
 __project__ = 'greater_tables'
 __author__ = 'Stephen J Mildenhall'

@@ -1,21 +1,33 @@
 # table formatting again
-from bs4 import BeautifulSoup
 from decimal import InvalidOperation
+from enum import IntEnum
 from io import StringIO
 from itertools import groupby
 import logging
+from pathlib import Path
+import re
+import sys
+from textwrap import wrap
+import warnings
+
+from bs4 import BeautifulSoup
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
    is_float_dtype   # , is_numeric_dtype
-from pathlib import Path
-import re
-import sys
-import warnings
-

 from .hasher import df_short_hash

+
+class Breakability(IntEnum):
+    """
+    To track if a column should or should not be broken (wrapped).
+
+    Members are ordered: a larger value means the column is a better
+    candidate for wrapping. ``GT.optimize_column_widths`` treats a column
+    as breakable when its value is ``>= MAYBE``.
+    """
+
+    # assigned to integer and float columns, and to year / ratio columns
+    NEVER = 0
+    # assigned to datetime columns; ranks below MAYBE
+    DATE = 3
+    # threshold used by optimize_column_widths to decide "breakable"
+    MAYBE = 5
+    # assigned to all remaining (object) columns
+    ACCEPTABLE = 10
+
+
 # opt in to pandas' future behavior: no silent downcasting
 pd.set_option('future.no_silent_downcasting', True)
 # pandas complaining about casting columns eg putting object in float column
@@ -322,9 +334,6 @@ class GT(object):
        self.date_col_indices = []
        self.object_col_indices = []  # not actually used, but for neatness
        self.break_penalties = []
-        break_never = 10
-        break_maybe = 5
-        break_ok = 1
        # manage non-unique col names here
        logger.debug('FIGURING TYPES')
        for i in range(self.df.shape[1]):
@@ -332,23 +341,23 @@ class GT(object):
            if is_datetime64_any_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is DATE')
                self.date_col_indices.append(i)
-                self.break_penalties.append(break_maybe)
+                self.break_penalties.append(Breakability.DATE)
            elif is_integer_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER')
                self.integer_col_indices.append(i)
-                self.break_penalties.append(break_never)
+                self.break_penalties.append(Breakability.NEVER)
            elif is_float_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT')
                self.float_col_indices.append(i)
-                self.break_penalties.append(break_never)
+                self.break_penalties.append(Breakability.NEVER)
            else:
                logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT')
                self.object_col_indices.append(i)
                c = ser.name
                if c in self.year_cols or c in self.ratio_cols:
-                    self.break_penalties.append(break_never)
+                    self.break_penalties.append(Breakability.NEVER)
                else:
-                    self.break_penalties.append(break_ok)
+                    self.break_penalties.append(Breakability.ACCEPTABLE)

        # figure out column and index alignment
        if aligners is not None and np.any(self.df.columns.duplicated()):
@@ -687,6 +696,145 @@ class GT(object):
        """Basic representation."""
        return f"GreaterTable(df_id={self.df_id})"

+    def column_width_df(self, allocate_overage=True):
+        """
+        Return dataframe of width information, one row per column of ``self.df``.
+
+        Columns of the result:
+
+        * alignment: entry of ``self.df_aligners`` with its first four
+          characters removed
+        * break_penalties, breakability: the column's ``Breakability`` value
+          and the name of that value
+        * natural_width: all on one line = max stripped cell length by col
+        * min_acceptable_width: longest fragment after splitting cells at
+          the break points defined by the pattern below
+        * break_acceptable: min_acceptable_width for ACCEPTABLE columns,
+          natural_width for all others
+        * recommended: the suggested width for the column
+        * raw_rec: recommended width before any overage was handed out
+          (only present when overage is allocated and the columns are
+          single-level)
+
+        Cells are assumed to be strings: ``str`` methods are called on them.
+        Index widths are not computed here.
+
+        If ``allocate_overage`` is False, or the table cannot fit even at
+        its minimum widths, the baseline recommendation is returned as is.
+        Otherwise spare width goes first to columns whose header would wrap
+        onto fewer lines, then proportionally to columns still narrower
+        than their natural width.
+        """
+        df = self.df
+        n_col = df.shape[1]
+        PADDING = 2  # per column
+        # max_table_width INCLUDES padding and column marks |, so remove them
+        # to get the width available for cell content
+        target_width = self.max_table_width - PADDING * n_col - (n_col + 1)
+        logger.debug('max_table_width = %s and target_width = %s',
+                     self.max_table_width, target_width)
+        # The width if content didn't wrap (single line)
+        # Series colname -> max width of cells in column
+        natural_width = df.map(lambda x: len(x.strip())).max(axis=0)
+
+        # Break points, in order of the alternatives in the pattern:
+        # * whitespace that follows . , ; : ! ? a closing bracket ) ] } or an
+        #   em / en dash (the lookbehind keeps the character with the
+        #   left-hand fragment)
+        # * a run of two or more hyphens followed by whitespace
+        # * any other whitespace
+        # * the first hyphen of an ISO date yyyy-mm-dd (the hyphen itself is
+        #   consumed by the split)
+        pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--+\s+|\s+'
+        iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})'
+        pat = f'{pat}|{iso_date_split}'
+
+        # The absolute minimum width each column can take (e.g., longest word for text)
+        min_acceptable_width = [
+            df[col_name].str
+                .split(pat=pat, regex=True, expand=True)
+                .fillna('')
+                .map(len)
+                .max(axis=1)
+                .max()
+            for col_name in df.columns
+        ]
+
+        ans = pd.DataFrame({
+            # drop the leading four characters of each aligner label
+            'alignment': [i[4:] for i in self.df_aligners],
+            'break_penalties': self.break_penalties,
+            'breakability': [x.name for x in self.break_penalties],
+            'natural_width': natural_width.to_numpy(),
+            'min_acceptable_width': min_acceptable_width,
+            }, index=df.columns)
+        # only ACCEPTABLE columns wrap at this level; the rest stay natural
+        ans['break_acceptable'] = np.where(
+            ans.break_penalties == Breakability.ACCEPTABLE,
+            ans.min_acceptable_width,
+            ans.natural_width)
+
+        # Sum by column NAME: positional unpacking is easy to get out of
+        # step with the column order of ans.
+        natural = ans['natural_width'].sum()
+        acceptable = ans['break_acceptable'].sum()
+        min_acceptable = ans['min_acceptable_width'].sum()
+
+        # >= throughout: an exact fit still fits
+        if target_width >= natural:
+            # everything gets its natural width
+            ans['recommended'] = ans['natural_width']
+            space = target_width - natural
+        elif target_width >= acceptable:
+            # strings wrap
+            ans['recommended'] = ans['break_acceptable']
+            # use up extra on the ACCEPTABLE cols
+            space = target_width - acceptable
+            logger.info('Overage to allocated = %s', space)
+        elif target_width >= min_acceptable:
+            # strings and dates wrap
+            ans['recommended'] = ans['min_acceptable_width']
+            space = target_width - min_acceptable
+            logger.info('Overage to allocated = %s', space)
+        else:
+            # OK severely too small
+            ans['recommended'] = ans['min_acceptable_width']
+            logger.warning('Desired width too small for pleasant formatting, table will be too wide.')
+            return ans
+
+        if not allocate_overage:
+            return ans
+
+        # Allocate the excess ------------------------------
+        if df.columns.nlevels == 1:
+            # Step 1: baseline comes in from code above
+            ans['raw_rec'] = ans['recommended']
+
+            def header_lines(header, width):
+                """Number of lines header occupies when wrapped to width."""
+                if not isinstance(header, str):
+                    return 1
+                # textwrap.wrap rejects width <= 0 (e.g. an all-empty column)
+                return len(wrap(header, max(1, int(width))))
+
+            def extra_to_drop_a_line(header, width):
+                """Smallest extra width (1-9) that removes a header line, else 0."""
+                lines_now = header_lines(header, width)
+                for extra in range(1, 10):  # cap search
+                    if header_lines(header, width + extra) < lines_now:
+                        return extra
+                return 0
+
+            # Step 2: spend spare width where it shortens the header.
+            # Measure from the width the column will actually get, and only
+            # widen when the whole gain is affordable: a partial widening
+            # does not remove a header line.
+            budget = space
+            for col in df.columns:
+                if budget <= 0:
+                    break
+                gain = extra_to_drop_a_line(col, ans.loc[col, 'recommended'])
+                if 0 < gain <= budget:
+                    ans.loc[col, 'recommended'] += gain
+                    budget -= gain
+
+        # Step 3: distribute remaining slack proportionally
+        remaining = target_width - ans['recommended'].sum()
+        if remaining > 0:
+            slack = (ans['natural_width'] - ans['recommended']).clip(lower=0)
+            total_slack = slack.sum()
+            if total_slack > 0:
+                share = np.floor(slack / total_slack * remaining)
+                # cap each increment at the column's own slack rather than
+                # clamping the total, which would undo the header widening
+                ans['recommended'] += np.minimum(share, slack).astype(int)
+
+        # Ensure final constraint
+        ans['recommended'] = ans['recommended'].astype(int)
+        allocated = ans['recommended'].sum()
+        if allocated > target_width:
+            logger.warning("Over-allocated widths slightly: %s vs %s", allocated, target_width)
+
+        return ans
+
+
    def optimize_column_widths(self, df=None, all_breakable=False):
        """
        Optimize column widths for a Pandas DataFrame given an overall width constraint.
@@ -726,7 +874,7 @@ class GT(object):
        if all_breakable:
            breakable_cols = dict(zip(df.columns, [True] * len(df.columns)))
        else:
-            breakable_cols = dict(zip(df.columns, [True if i < 5 else False for i in self.break_penalties]))
+            breakable_cols = dict(zip(df.columns, [True if i >= Breakability.MAYBE else False for i in self.break_penalties]))

        # 1. Calculate ideal (no wrap) and minimum possible widths for all columns
        for col_name in df.columns: