v 2.0.0alpha => new table widths, text output

2026-06-27 17:48:45 +08:00 · 2025-06-01 10:10:19 +01:00
parent 23061b450d
commit f354ef7b10
2 changed files with 164 additions and 16 deletions
@@ -1,4 +1,4 @@
-__version__ = '1.2.0'
+__version__ = '2.0.0'
 __project__ = 'greater_tables'
 __author__ = 'Stephen J Mildenhall'
@@ -1,21 +1,33 @@
 # table formatting again
 from bs4 import BeautifulSoup
 from decimal import InvalidOperation
 from enum import IntEnum
 from io import StringIO
 from itertools import groupby
 import logging
 from pathlib import Path
 import re
 import sys
 from textwrap import wrap
 import warnings
 from bs4 import BeautifulSoup
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
    is_float_dtype   # , is_numeric_dtype
 from pathlib import Path
 import re
 import sys
 import warnings
 from .hasher import df_short_hash
 class Breakability(IntEnum):
    """To track if a column should or should not be broken (wrapped)."""
    NEVER = 0
    DATE = 3
    MAYBE = 5
    ACCEPTABLE = 10
 # turn this fuck-fest off
 pd.set_option('future.no_silent_downcasting', True)
 # pandas complaining about casting columns eg putting object in float column
@@ -322,9 +334,6 @@ class GT(object):
        self.date_col_indices = []
        self.object_col_indices = []  # not actually used, but for neatness
        self.break_penalties = []
        break_never = 10
        break_maybe = 5
        break_ok = 1
        # manage non-unique col names here
        logger.debug('FIGURING TYPES')
        for i in range(self.df.shape[1]):
@@ -332,23 +341,23 @@ class GT(object):
            if is_datetime64_any_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is DATE')
                self.date_col_indices.append(i)
-                self.break_penalties.append(break_maybe)
+                self.break_penalties.append(Breakability.DATE)
            elif is_integer_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER')
                self.integer_col_indices.append(i)
-                self.break_penalties.append(break_never)
+                self.break_penalties.append(Breakability.NEVER)
            elif is_float_dtype(ser):
                logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT')
                self.float_col_indices.append(i)
-                self.break_penalties.append(break_never)
+                self.break_penalties.append(Breakability.NEVER)
            else:
                logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT')
                self.object_col_indices.append(i)
                c = ser.name
                if c in self.year_cols or c in self.ratio_cols:
-                    self.break_penalties.append(break_never)
+                    self.break_penalties.append(Breakability.NEVER)
                else:
-                    self.break_penalties.append(break_ok)
+                    self.break_penalties.append(Breakability.ACCEPTABLE)
        # figure out column and index alignment
        if aligners is not None and np.any(self.df.columns.duplicated()):
@@ -687,6 +696,145 @@ class GT(object):
        """Basic representation."""
        return f"GreaterTable(df_id={self.df_id})"
    def column_width_df(self, allocate_overage=True):
        """
        Return dataframe of width information.
        * natural width, all on one line = max len by col
        * min width = max length given breaks
        * break type of column
        * alignment of column
        * index natural width
        * index min width
        """
        df = self.df
        n_row, n_col = df.shape
        PADDING = 2 # per column
        # target width INCLUDES padding and column marks |
        target_width = self.max_table_width - PADDING * n_col - (n_col + 1)
        print(f'{self.max_table_width = } and {target_width = }')
        # The width if content didn't wrap (single line)
        # Series=dict colname->max width of cells in column
        natural_width = df.map(lambda x: len(x.strip())).max(axis=0).to_dict()
        # re.split(r'(?<=[\s.,:;!?()\[\]{}\-\\/|])\s*', text)
        # (?<=...) is a lookbehind to preserve the break character with the left-hand fragment.
        # [\s.,:;!?()\[\]{}\-\\/|] matches common punctuation and separators:
        # \s = whitespace
        # . , : ; ! ? = terminal punctuation
        # () [] {} = brackets
        # \- = dash
        # \\/| = slash, backslash, pipe
        pat =r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--+\s+|\s+'
        iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})'
        pat = f'{pat}|{iso_date_split}'
        # Calculate ideal (no wrap) and minimum possible widths for all columns
        # The absolute minimum width each column can take (e.g., longest word for text)
        min_acceptable_width = {}
        for col_name in df.columns:
            min_acceptable_width[col_name] = (
                df[col_name].str
                    .split(pat=pat, regex=True, expand=True)
                    .fillna('')
                    .map(len)
                    .max(axis=1)
                    .max()
                    )
        #
        ans = pd.DataFrame({
            'alignment': [i[4:] for i in self.df_aligners],
            'break_penalties': self.break_penalties,
            'breakability' : [x.name for x in self.break_penalties],
            'natural_width': natural_width.values(),
            'min_acceptable_width': min_acceptable_width.values(),
            }, index=df.columns)
        ans['break_acceptable'] = ans.natural_width
        ans['break_acceptable'] = np.where(ans.break_penalties==Breakability.ACCEPTABLE, ans.min_acceptable_width, ans.natural_width)
        # DUH - this is min_acceptable_width
        # ans['break_dates'] = np.where(ans.break_penalties==Breakability.DATE, ans.min_acceptable_width, ans.break_acceptable)
        natural, acceptable, min_acceptable = ans.iloc[:, 3:].sum()
        if target_width > natural:
            # everything gets its natural width
            ans['recommended'] = ans['natural_width']
            space = target_width - natural
        elif target_width > acceptable:
            # strings wrap
            ans['recommended'] = ans['break_acceptable']
            # use up extra on the ACCEPTABLE cols
            space = target_width - acceptable
            logger.info('Overage to allocated = %s', space)
        elif target_width > min_acceptable:
            # strings and dates wrap
            ans['recommended'] = ans['min_acceptable_width']
            # use up extra on dates first, then strings
            space = target_width - min_acceptable
            logger.info('Overage to allocated = %s', space)
        else:
            # OK severely too small
            ans['recommended'] = ans['min_acceptable_width']
            logger.warning('Desired width too small for pleasant formatting, table will be too wide.')
            shortfall = min_acceptable - target_width
            return ans
        if not allocate_overage:
            return ans
        # Allocate the excess ------------------------------
        if df.columns.nlevels == 1:
            # Step 1: baseline comes in from code above
            ans['raw_rec'] = ans.recommended
            # Step 2: how much extra would it take to reduce header line count?
            def header_wrap_cost(header, width):
                if not isinstance(header, str):
                    return 1
                return len(wrap(header, width))
            header_lengths = {col: len(col) for col in df.columns}
            current_lines = {col: header_wrap_cost(col, ans.loc[col, 'min_acceptable_width']) for col in df.columns}
            next_wrap_gain = {}
            for col in df.columns:
                w = ans.loc[col, 'min_acceptable_width']
                for extra in range(1, 10):  # cap search
                    new_w = w + extra
                    if header_wrap_cost(col, new_w) < current_lines[col]:
                        next_wrap_gain[col] = extra
                        break
                else:
                    next_wrap_gain[col] = 0
            header_budget = min(space, sum(next_wrap_gain.values()))
            for col in df.columns:
                gain = next_wrap_gain[col]
                if gain > 0:
                    give = min(gain, header_budget)
                    ans.loc[col, 'recommended'] += give
                    header_budget -= give
                    if header_budget <= 0:
                        break
        # Step 3: distribute remaining slack proportionally
        remaining = target_width - ans['recommended'].sum()
        if remaining > 0:
            slack = ans['natural_width'] - ans['recommended']
            total_slack = slack.clip(lower=0).sum()
            if total_slack > 0:
                fractions = slack.clip(lower=0) / total_slack
                ans['recommended'] += np.floor(fractions * remaining).astype(int)
                ans['recommended'] = np.minimum(ans['recommended'], ans['natural_width'])
        # Ensure final constraint
        ans['recommended'] = ans['recommended'].astype(int)
        if ans['recommended'].sum() <= target_width:
            logger.warning("Over-allocated widths slightly: %s vs %s", ans['recommended'].sum(), target_width)
        return ans
    def optimize_column_widths(self, df=None, all_breakable=False):
        """
        Optimize column widths for a Pandas DataFrame given an overall width constraint.
@@ -726,7 +874,7 @@ class GT(object):
        if all_breakable:
            breakable_cols = dict(zip(df.columns, [True] * len(df.columns)))
        else:
-            breakable_cols = dict(zip(df.columns, [True if i < 5 else False for i in self.break_penalties]))
+            breakable_cols = dict(zip(df.columns, [True if i >= Breakability.MAYBE else False for i in self.break_penalties]))
        # 1. Calculate ideal (no wrap) and minimum possible widths for all columns
        for col_name in df.columns: