v 2.0.0alpha => new table widths, text output

This commit is contained in:
Stephen Mildenhall
2025-06-01 10:10:19 +01:00
parent 23061b450d
commit f354ef7b10
2 changed files with 164 additions and 16 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
__version__ = '1.2.0' __version__ = '2.0.0'
__project__ = 'greater_tables' __project__ = 'greater_tables'
__author__ = 'Stephen J Mildenhall' __author__ = 'Stephen J Mildenhall'
+163 -15
View File
@@ -1,21 +1,33 @@
# table formatting again # table formatting again
from bs4 import BeautifulSoup
from decimal import InvalidOperation from decimal import InvalidOperation
from enum import IntEnum
from io import StringIO from io import StringIO
from itertools import groupby from itertools import groupby
import logging import logging
from pathlib import Path
import re
import sys
from textwrap import wrap
import warnings
from bs4 import BeautifulSoup
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \ from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
is_float_dtype # , is_numeric_dtype is_float_dtype # , is_numeric_dtype
from pathlib import Path
import re
import sys
import warnings
from .hasher import df_short_hash from .hasher import df_short_hash
class Breakability(IntEnum):
    """
    Rank how acceptable it is to break (wrap) the contents of a column.

    Members are integers so they can be compared; a larger value means
    wrapping the column is more acceptable.
    """

    # integer, float, year and ratio columns: keep on a single line
    NEVER = 0
    # datetime columns
    DATE = 3
    # threshold value: columns ranked at or above MAYBE count as breakable
    MAYBE = 5
    # remaining object (text) columns
    ACCEPTABLE = 10
# turn this fuck-fest off # turn this fuck-fest off
pd.set_option('future.no_silent_downcasting', True) pd.set_option('future.no_silent_downcasting', True)
# pandas complaining about casting columns eg putting object in float column # pandas complaining about casting columns eg putting object in float column
@@ -322,9 +334,6 @@ class GT(object):
self.date_col_indices = [] self.date_col_indices = []
self.object_col_indices = [] # not actually used, but for neatness self.object_col_indices = [] # not actually used, but for neatness
self.break_penalties = [] self.break_penalties = []
break_never = 10
break_maybe = 5
break_ok = 1
# manage non-unique col names here # manage non-unique col names here
logger.debug('FIGURING TYPES') logger.debug('FIGURING TYPES')
for i in range(self.df.shape[1]): for i in range(self.df.shape[1]):
@@ -332,23 +341,23 @@ class GT(object):
if is_datetime64_any_dtype(ser): if is_datetime64_any_dtype(ser):
logger.debug(f'col {i} = {self.df.columns[i]} is DATE') logger.debug(f'col {i} = {self.df.columns[i]} is DATE')
self.date_col_indices.append(i) self.date_col_indices.append(i)
self.break_penalties.append(break_maybe) self.break_penalties.append(Breakability.DATE)
elif is_integer_dtype(ser): elif is_integer_dtype(ser):
logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER') logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER')
self.integer_col_indices.append(i) self.integer_col_indices.append(i)
self.break_penalties.append(break_never) self.break_penalties.append(Breakability.NEVER)
elif is_float_dtype(ser): elif is_float_dtype(ser):
logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT') logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT')
self.float_col_indices.append(i) self.float_col_indices.append(i)
self.break_penalties.append(break_never) self.break_penalties.append(Breakability.NEVER)
else: else:
logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT') logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT')
self.object_col_indices.append(i) self.object_col_indices.append(i)
c = ser.name c = ser.name
if c in self.year_cols or c in self.ratio_cols: if c in self.year_cols or c in self.ratio_cols:
self.break_penalties.append(break_never) self.break_penalties.append(Breakability.NEVER)
else: else:
self.break_penalties.append(break_ok) self.break_penalties.append(Breakability.ACCEPTABLE)
# figure out column and index alignment # figure out column and index alignment
if aligners is not None and np.any(self.df.columns.duplicated()): if aligners is not None and np.any(self.df.columns.duplicated()):
@@ -687,6 +696,145 @@ class GT(object):
"""Basic representation.""" """Basic representation."""
return f"GreaterTable(df_id={self.df_id})" return f"GreaterTable(df_id={self.df_id})"
def column_width_df(self, allocate_overage=True):
    """
    Return a dataframe of width information, one row per column of ``self.df``.

    Columns of the result:

    * ``alignment``: entry of ``self.df_aligners`` with its first four
      characters dropped
    * ``break_penalties``: the ``Breakability`` value of the column
    * ``breakability``: the name of that ``Breakability`` value
    * ``natural_width``: longest stripped cell, i.e. the width needed to
      keep every cell on one line
    * ``min_acceptable_width``: longest fragment left after splitting every
      cell at the break pattern
    * ``break_acceptable``: ``min_acceptable_width`` for ``ACCEPTABLE``
      columns, ``natural_width`` for all others
    * ``recommended``: the suggested width
    * ``raw_rec``: ``recommended`` before the overage was allocated (only
      present when overage is allocated on a single-level column index)

    The target is ``self.max_table_width`` less two characters of padding
    per column and the ``n_col + 1`` column marks. If even the minimum
    widths do not fit, a warning is logged and the minimum widths are
    returned without any allocation.

    NOTE(review): every cell of ``self.df`` is assumed to be a string
    (``str.strip`` and ``Series.str.split`` are applied to them) - confirm
    against the caller.

    :param allocate_overage: if True, hand spare width back to the columns
        (first to reduce header wrapping, then in proportion to how far each
        column is below its natural width).
    :return: DataFrame indexed by ``self.df.columns``.
    """
    df = self.df
    n_col = df.shape[1]
    PADDING = 2  # per column
    # target width EXCLUDES padding and the column marks |
    target_width = self.max_table_width - PADDING * n_col - (n_col + 1)
    logger.debug('max_table_width = %s and target_width = %s',
                 self.max_table_width, target_width)

    # The width if content didn't wrap (single line)
    # dict colname -> max width of cells in column
    natural_width = df.map(lambda x: len(x.strip())).max(axis=0).to_dict()

    # Break points used to find the narrowest acceptable width:
    #   * whitespace following closing punctuation . , ; : ! ? ) ] } or an
    #     em/en dash (the lookbehind keeps the punctuation with the
    #     left-hand fragment)
    #   * a run of two or more hyphens followed by whitespace
    #   * any other whitespace
    #   * the hyphen after the year of an ISO date, YYYY-MM-DD
    pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--+\s+|\s+'
    iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})'
    pat = f'{pat}|{iso_date_split}'

    # The absolute minimum width each column can take (e.g., longest word for text)
    min_acceptable_width = {}
    for col_name in df.columns:
        min_acceptable_width[col_name] = (
            df[col_name].str
            .split(pat=pat, regex=True, expand=True)
            .fillna('')
            .map(len)
            .max(axis=1)
            .max()
        )

    ans = pd.DataFrame({
        'alignment': [i[4:] for i in self.df_aligners],
        'break_penalties': self.break_penalties,
        'breakability': [x.name for x in self.break_penalties],
        'natural_width': list(natural_width.values()),
        'min_acceptable_width': list(min_acceptable_width.values()),
    }, index=df.columns)
    # only ACCEPTABLE columns are allowed to shrink at this level
    ans['break_acceptable'] = np.where(
        ans.break_penalties == Breakability.ACCEPTABLE,
        ans.min_acceptable_width,
        ans.natural_width)

    # Totals by column NAME: positional unpacking is fragile because the
    # frame order is natural, min_acceptable, break_acceptable.
    natural = ans['natural_width'].sum()
    acceptable = ans['break_acceptable'].sum()
    min_acceptable = ans['min_acceptable_width'].sum()

    # natural >= acceptable >= min_acceptable; take the roomiest level that fits
    if target_width >= natural:
        # everything gets its natural width
        ans['recommended'] = ans['natural_width']
        space = target_width - natural
    elif target_width >= acceptable:
        # strings wrap
        ans['recommended'] = ans['break_acceptable']
        # use up extra on the ACCEPTABLE cols
        space = target_width - acceptable
        logger.info('Overage to allocated = %s', space)
    elif target_width >= min_acceptable:
        # strings and dates wrap
        ans['recommended'] = ans['min_acceptable_width']
        space = target_width - min_acceptable
        logger.info('Overage to allocated = %s', space)
    else:
        # OK severely too small: nothing to allocate, return the minimum widths
        ans['recommended'] = ans['min_acceptable_width']
        logger.warning('Desired width too small for pleasant formatting, table will be too wide.')
        return ans

    if not allocate_overage:
        return ans

    # Allocate the excess ------------------------------
    if df.columns.nlevels == 1:
        # Step 1: baseline comes in from code above
        ans['raw_rec'] = ans.recommended

        # Step 2: how much extra would it take to reduce header line count?
        def header_wrap_cost(header, width):
            """Number of lines the header needs when wrapped to width."""
            if not isinstance(header, str):
                return 1
            # textwrap.wrap raises ValueError for width <= 0
            return len(wrap(header, max(int(width), 1)))

        # Measure from the width the column currently has (recommended),
        # because that is the width the extra characters are added to.
        next_wrap_gain = {}
        for col in df.columns:
            base_width = ans.loc[col, 'recommended']
            current_lines = header_wrap_cost(col, base_width)
            for extra in range(1, 10):  # cap search
                if header_wrap_cost(col, base_width + extra) < current_lines:
                    next_wrap_gain[col] = extra
                    break
            else:
                # no improvement within the search cap
                next_wrap_gain[col] = 0

        header_budget = min(space, sum(next_wrap_gain.values()))
        for col in df.columns:
            if header_budget <= 0:
                break
            gain = next_wrap_gain[col]
            if gain > 0:
                give = min(gain, header_budget)
                ans.loc[col, 'recommended'] += give
                header_budget -= give

        # Step 3: distribute remaining slack proportionally
        remaining = target_width - ans['recommended'].sum()
        if remaining > 0:
            slack = (ans['natural_width'] - ans['recommended']).clip(lower=0)
            total_slack = slack.sum()
            if total_slack > 0:
                fractions = slack / total_slack
                # Cap at natural width, but never below the current width so
                # the header widening from step 2 is not undone.
                cap = np.maximum(ans['natural_width'], ans['recommended'])
                ans['recommended'] += np.floor(fractions * remaining).astype(int)
                ans['recommended'] = np.minimum(ans['recommended'], cap)

    # Ensure final constraint
    ans['recommended'] = ans['recommended'].astype(int)
    if ans['recommended'].sum() > target_width:
        logger.warning("Over-allocated widths slightly: %s vs %s", ans['recommended'].sum(), target_width)

    return ans
def optimize_column_widths(self, df=None, all_breakable=False): def optimize_column_widths(self, df=None, all_breakable=False):
""" """
Optimize column widths for a Pandas DataFrame given an overall width constraint. Optimize column widths for a Pandas DataFrame given an overall width constraint.
@@ -726,7 +874,7 @@ class GT(object):
if all_breakable: if all_breakable:
breakable_cols = dict(zip(df.columns, [True] * len(df.columns))) breakable_cols = dict(zip(df.columns, [True] * len(df.columns)))
else: else:
breakable_cols = dict(zip(df.columns, [True if i < 5 else False for i in self.break_penalties])) breakable_cols = dict(zip(df.columns, [True if i >= Breakability.MAYBE else False for i in self.break_penalties]))
# 1. Calculate ideal (no wrap) and minimum possible widths for all columns # 1. Calculate ideal (no wrap) and minimum possible widths for all columns
for col_name in df.columns: for col_name in df.columns: