mirror of
https://github.com/wassname/greater_tables_project.git
synced 2026-06-27 17:48:45 +08:00
v 2.0.0alpha => new table widths, text output
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
__version__ = '1.2.0'
|
__version__ = '2.0.0'
|
||||||
__project__ = 'greater_tables'
|
__project__ = 'greater_tables'
|
||||||
__author__ = 'Stephen J Mildenhall'
|
__author__ = 'Stephen J Mildenhall'
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,33 @@
|
|||||||
# table formatting again
|
# table formatting again
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from decimal import InvalidOperation
|
from decimal import InvalidOperation
|
||||||
|
from enum import IntEnum
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
import logging
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from textwrap import wrap
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
|
from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
|
||||||
is_float_dtype # , is_numeric_dtype
|
is_float_dtype # , is_numeric_dtype
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
|
|
||||||
from .hasher import df_short_hash
|
from .hasher import df_short_hash
|
||||||
|
|
||||||
|
|
||||||
|
class Breakability(IntEnum):
    """
    Rank how tolerable it is to break (wrap) the contents of a column.

    Members are ordered integers: a larger value means wrapping is more
    acceptable, so callers can compare against a threshold, e.g.
    ``penalty >= Breakability.MAYBE``.
    """

    # numeric, year and ratio columns: keep the value on one line
    NEVER = 0
    # datetime columns: wrap only as a last resort
    DATE = 3
    # threshold at or above which a column is treated as breakable
    MAYBE = 5
    # free-text (object) columns: wrapping is fine
    ACCEPTABLE = 10
|
||||||
|
|
||||||
|
|
||||||
# opt in to pandas' future behavior: no silent downcasting
|
# opt in to pandas' future behavior: no silent downcasting
|
||||||
pd.set_option('future.no_silent_downcasting', True)
|
pd.set_option('future.no_silent_downcasting', True)
|
||||||
# pandas complaining about casting columns eg putting object in float column
|
# pandas complaining about casting columns eg putting object in float column
|
||||||
@@ -322,9 +334,6 @@ class GT(object):
|
|||||||
self.date_col_indices = []
|
self.date_col_indices = []
|
||||||
self.object_col_indices = [] # not actually used, but for neatness
|
self.object_col_indices = [] # not actually used, but for neatness
|
||||||
self.break_penalties = []
|
self.break_penalties = []
|
||||||
break_never = 10
|
|
||||||
break_maybe = 5
|
|
||||||
break_ok = 1
|
|
||||||
# manage non-unique col names here
|
# manage non-unique col names here
|
||||||
logger.debug('FIGURING TYPES')
|
logger.debug('FIGURING TYPES')
|
||||||
for i in range(self.df.shape[1]):
|
for i in range(self.df.shape[1]):
|
||||||
@@ -332,23 +341,23 @@ class GT(object):
|
|||||||
if is_datetime64_any_dtype(ser):
|
if is_datetime64_any_dtype(ser):
|
||||||
logger.debug(f'col {i} = {self.df.columns[i]} is DATE')
|
logger.debug(f'col {i} = {self.df.columns[i]} is DATE')
|
||||||
self.date_col_indices.append(i)
|
self.date_col_indices.append(i)
|
||||||
self.break_penalties.append(break_maybe)
|
self.break_penalties.append(Breakability.DATE)
|
||||||
elif is_integer_dtype(ser):
|
elif is_integer_dtype(ser):
|
||||||
logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER')
|
logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER')
|
||||||
self.integer_col_indices.append(i)
|
self.integer_col_indices.append(i)
|
||||||
self.break_penalties.append(break_never)
|
self.break_penalties.append(Breakability.NEVER)
|
||||||
elif is_float_dtype(ser):
|
elif is_float_dtype(ser):
|
||||||
logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT')
|
logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT')
|
||||||
self.float_col_indices.append(i)
|
self.float_col_indices.append(i)
|
||||||
self.break_penalties.append(break_never)
|
self.break_penalties.append(Breakability.NEVER)
|
||||||
else:
|
else:
|
||||||
logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT')
|
logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT')
|
||||||
self.object_col_indices.append(i)
|
self.object_col_indices.append(i)
|
||||||
c = ser.name
|
c = ser.name
|
||||||
if c in self.year_cols or c in self.ratio_cols:
|
if c in self.year_cols or c in self.ratio_cols:
|
||||||
self.break_penalties.append(break_never)
|
self.break_penalties.append(Breakability.NEVER)
|
||||||
else:
|
else:
|
||||||
self.break_penalties.append(break_ok)
|
self.break_penalties.append(Breakability.ACCEPTABLE)
|
||||||
|
|
||||||
# figure out column and index alignment
|
# figure out column and index alignment
|
||||||
if aligners is not None and np.any(self.df.columns.duplicated()):
|
if aligners is not None and np.any(self.df.columns.duplicated()):
|
||||||
@@ -687,6 +696,145 @@ class GT(object):
|
|||||||
"""Basic representation."""
|
"""Basic representation."""
|
||||||
return f"GreaterTable(df_id={self.df_id})"
|
return f"GreaterTable(df_id={self.df_id})"
|
||||||
|
|
||||||
|
def column_width_df(self, allocate_overage=True):
    """
    Return a dataframe of per-column width information.

    The result has one row per column of ``self.df`` and these columns:

    * ``alignment``: the ``self.df_aligners`` entry minus its first four
      characters
    * ``break_penalties`` / ``breakability``: the column's Breakability
      value and that value's name
    * ``natural_width``: longest stripped cell, i.e. the width needed if
      nothing wraps
    * ``min_acceptable_width``: longest fragment once every cell is split
      at the allowed break points
    * ``break_acceptable``: ``min_acceptable_width`` for ACCEPTABLE
      columns, ``natural_width`` for all others
    * ``recommended``: the suggested content width
    * ``raw_rec``: ``recommended`` before any overage was handed out
      (only present when the allocation step runs)

    Widths are content widths: per-column padding and the column
    separator marks are subtracted from ``self.max_table_width`` first.

    :param allocate_overage: if True (default) and the columns are not a
        MultiIndex, spare width is handed back to the columns, first to
        reduce header wrapping and then in proportion to how far each
        column is below its natural width.
    :return: the dataframe described above, indexed by ``self.df.columns``.
    """
    # assumes every cell of self.df is already a str (``x.strip`` and
    # ``.str.split`` are applied to all of them) -- TODO confirm with caller
    df = self.df
    n_col = df.shape[1]
    PADDING = 2  # per column
    # budget for cell CONTENT only: the table width less the padding of
    # each column and the n_col + 1 column marks |
    target_width = self.max_table_width - PADDING * n_col - (n_col + 1)
    logger.debug('max_table_width = %s and target_width = %s',
                 self.max_table_width, target_width)

    # width of each column if its content is kept on a single line
    natural_width = df.map(lambda x: len(x.strip())).max(axis=0).to_dict()

    # Places where a cell may be broken:
    #   * whitespace following closing punctuation, an em dash or en dash
    #   * a run of two or more hyphens followed by whitespace
    #   * any other whitespace
    #   * the hyphen after the year of an ISO date (yyyy-mm-dd)
    # The matched separator itself is dropped by the split.
    pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--+\s+|\s+'
    iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})'
    pat = f'{pat}|{iso_date_split}'

    # narrowest each column can go: its longest unbreakable fragment
    min_acceptable_width = {}
    for col_name in df.columns:
        min_acceptable_width[col_name] = (
            df[col_name].str
            .split(pat=pat, regex=True, expand=True)
            .fillna('')
            .map(len)
            .max(axis=1)
            .max()
        )

    ans = pd.DataFrame({
        'alignment': [i[4:] for i in self.df_aligners],
        'break_penalties': self.break_penalties,
        'breakability': [x.name for x in self.break_penalties],
        'natural_width': list(natural_width.values()),
        'min_acceptable_width': list(min_acceptable_width.values()),
    }, index=df.columns)
    # only columns where wrapping is ACCEPTABLE shrink to their minimum
    ans['break_acceptable'] = np.where(
        ans.break_penalties == Breakability.ACCEPTABLE,
        ans.min_acceptable_width,
        ans.natural_width)

    # Total width needed under each regime. Select by NAME: the frame's
    # column order is natural, min_acceptable, break_acceptable, so a
    # positional unpack silently swaps the last two totals.
    natural = ans['natural_width'].sum()
    acceptable = ans['break_acceptable'].sum()
    min_acceptable = ans['min_acceptable_width'].sum()

    # >= so that an exact fit counts as fitting
    if target_width >= natural:
        # everything gets its natural width
        ans['recommended'] = ans['natural_width']
        space = target_width - natural
    elif target_width >= acceptable:
        # strings wrap
        ans['recommended'] = ans['break_acceptable']
        # use up extra on the ACCEPTABLE cols
        space = target_width - acceptable
        logger.info('Overage to allocated = %s', space)
    elif target_width >= min_acceptable:
        # strings and dates wrap
        ans['recommended'] = ans['min_acceptable_width']
        # use up extra on dates first, then strings
        space = target_width - min_acceptable
        logger.info('Overage to allocated = %s', space)
    else:
        # severely too small: nothing to allocate, return the minimum
        ans['recommended'] = ans['min_acceptable_width']
        logger.warning('Desired width too small for pleasant formatting, table will be too wide.')
        logger.debug('Shortfall = %s', min_acceptable - target_width)
        return ans

    if not allocate_overage:
        return ans

    # Allocate the excess ------------------------------
    if df.columns.nlevels == 1:
        # Step 1: baseline comes in from code above
        ans['raw_rec'] = ans.recommended

        # Step 2: how much extra would it take to reduce header line count?
        def header_wrap_cost(header, width):
            """Number of lines ``header`` occupies when wrapped to ``width``."""
            if not isinstance(header, str):
                return 1
            # textwrap.wrap raises ValueError for width <= 0, which occurs
            # when every cell of a column is empty
            return len(wrap(header, max(1, int(width))))

        # measure against the width each column has actually been given,
        # so columns that already fit their header receive nothing
        current_lines = {
            col: header_wrap_cost(col, ans.loc[col, 'recommended'])
            for col in df.columns
        }
        next_wrap_gain = {}
        for col in df.columns:
            w = ans.loc[col, 'recommended']
            for extra in range(1, 10):  # cap search
                if header_wrap_cost(col, w + extra) < current_lines[col]:
                    next_wrap_gain[col] = extra
                    break
            else:
                # no improvement within the search cap
                next_wrap_gain[col] = 0

        header_budget = min(space, sum(next_wrap_gain.values()))
        for col in df.columns:
            if header_budget <= 0:
                break
            gain = next_wrap_gain[col]
            if gain > 0:
                give = min(gain, header_budget)
                ans.loc[col, 'recommended'] += give
                header_budget -= give

        # Step 3: distribute remaining slack proportionally
        remaining = target_width - ans['recommended'].sum()
        if remaining > 0:
            slack = (ans['natural_width'] - ans['recommended']).clip(lower=0)
            total_slack = slack.sum()
            if total_slack > 0:
                share = np.floor(slack / total_slack * remaining).astype(int)
                # cap the increment, not the total: no column is pushed past
                # its natural width, and a header allowance from step 2 is
                # not taken back
                ans['recommended'] += np.minimum(share, slack)

        # Ensure final constraint
        ans['recommended'] = ans['recommended'].astype(int)
        allocated = ans['recommended'].sum()
        if allocated > target_width:
            logger.warning("Over-allocated widths slightly: %s vs %s", allocated, target_width)

    return ans
|
||||||
|
|
||||||
|
|
||||||
def optimize_column_widths(self, df=None, all_breakable=False):
|
def optimize_column_widths(self, df=None, all_breakable=False):
|
||||||
"""
|
"""
|
||||||
Optimize column widths for a Pandas DataFrame given an overall width constraint.
|
Optimize column widths for a Pandas DataFrame given an overall width constraint.
|
||||||
@@ -726,7 +874,7 @@ class GT(object):
|
|||||||
if all_breakable:
|
if all_breakable:
|
||||||
breakable_cols = dict(zip(df.columns, [True] * len(df.columns)))
|
breakable_cols = dict(zip(df.columns, [True] * len(df.columns)))
|
||||||
else:
|
else:
|
||||||
breakable_cols = dict(zip(df.columns, [True if i < 5 else False for i in self.break_penalties]))
|
breakable_cols = dict(zip(df.columns, [True if i >= Breakability.MAYBE else False for i in self.break_penalties]))
|
||||||
|
|
||||||
# 1. Calculate ideal (no wrap) and minimum possible widths for all columns
|
# 1. Calculate ideal (no wrap) and minimum possible widths for all columns
|
||||||
for col_name in df.columns:
|
for col_name in df.columns:
|
||||||
|
|||||||
Reference in New Issue
Block a user