From 23061b450d17607a009fbeadc74698fcade42df8 Mon Sep 17 00:00:00 2001 From: Stephen Mildenhall Date: Fri, 30 May 2025 23:44:21 +0100 Subject: [PATCH] half way point on better column width control for text table output --- greater_tables/greater_tables.py | 243 +++++++++++++++++++++++++++++-- 1 file changed, 230 insertions(+), 13 deletions(-) diff --git a/greater_tables/greater_tables.py b/greater_tables/greater_tables.py index fa606f8..ad2262b 100644 --- a/greater_tables/greater_tables.py +++ b/greater_tables/greater_tables.py @@ -82,7 +82,7 @@ class GT(object): large_ok=False, max_str_length=-1, str_table_fmt='mixed_grid', - str_max_width=200, + max_table_width=200, debug=False): """ Create a greater_tables formatting object. @@ -164,10 +164,13 @@ class GT(object): :param large_ok: signal that you are intentionally applying to a large dataframe. Subclasses may restrict or apply .head() to df. :param max_str_length: maximum displayed length of object types, that are cast to strings. Eg if you have nested DataFrames! :param str_table_fmt: table format used for string output (markdown), default mixed_grid - :param str_max_width: max table width used for markdown string output, default 200 + :param max_table_width: max table width used for markdown string output, default 200 :param debug: if True, add id to caption and use colored lines in table, default False. """ # deal with alternative input modes + if df is None: + # don't want None to fail + df = pd.DataFrame([]) if isinstance(df, pd.DataFrame): # usual use case pass @@ -210,7 +213,7 @@ class GT(object): # self.df.columns.names = [None] * self.df.columns.nlevels self.df_id = df_short_hash(self.df) self.str_table_fmt = str_table_fmt - self.str_max_width = str_max_width + self.max_table_width = max_table_width self.debug = debug if self.caption != '' and self.debug: self.caption += f' (id: {self.df_id})' @@ -222,6 +225,7 @@ class GT(object): self.ncols = self.df.shape[1] self.dt = self.df.dtypes + # reset index to put all columns on an equal footing, but note number ofindex cols with warnings.catch_warnings(): if self.show_index: warnings.simplefilter( @@ -292,6 +296,7 @@ class GT(object): return str(x) self.default_formatter = wrapped_default_formatter + # cast as much as possible to floats with warnings.catch_warnings(): warnings.simplefilter( "ignore", category=pd.errors.PerformanceWarning) @@ -311,11 +316,15 @@ class GT(object): logger.debug( f'coercing {i}={c} from {old_type} to float FAILED') - # now can determine types + # now can determine types and infer the break penalties (for column sizes) self.float_col_indices = [] self.integer_col_indices = [] self.date_col_indices = [] - self.object_col_indices = [] # not accually used, but for neatness + self.object_col_indices = [] # not actually used, but for neatness + self.break_penalties = [] + break_never = 10 + break_maybe = 5 + break_ok = 1 # manage non-unique col names here logger.debug('FIGURING TYPES') for i in range(self.df.shape[1]): @@ -323,15 +332,23 @@ class GT(object): if is_datetime64_any_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is DATE') self.date_col_indices.append(i) + self.break_penalties.append(break_maybe) elif is_integer_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is INTEGER') self.integer_col_indices.append(i) + self.break_penalties.append(break_never) elif is_float_dtype(ser): logger.debug(f'col {i} = {self.df.columns[i]} is FLOAT') self.float_col_indices.append(i) + self.break_penalties.append(break_never) else: logger.debug(f'col {i} = {self.df.columns[i]} is OBJECT') self.object_col_indices.append(i) + c = ser.name + if c in self.year_cols or c in self.ratio_cols: + self.break_penalties.append(break_never) + else: + self.break_penalties.append(break_ok) # figure out column and index alignment if aligners is not None and np.any(self.df.columns.duplicated()): @@ -449,6 +466,7 @@ class GT(object): # this radically alters the df, so keep a copy for now... self.df_pre_applying_formatters = self.df.copy() self.df = self.apply_formatters(self.df) + self._debug_col_widths = None # sparsify if sparsify and self.nindex > 1: self.df = GT.sparsify(self.df, self.df.columns[:self.nindex]) @@ -669,8 +687,201 @@ class GT(object): """Basic representation.""" return f"GreaterTable(df_id={self.df_id})" + def optimize_column_widths(self, df=None, all_breakable=False): + """ + Optimize column widths for a Pandas DataFrame given an overall width constraint. + + This function is run twice, once with the original df and once with the headings + as the only row. In the latter case all columns are breakable. + + Widths are in abstract character units. + + Working variables + df <- self.df, the formatted input dataframe + breakable_cols derived from self.break_penalties; a dictionary where keys are column names and values are booleans. + True if column content can wrap (text), False otherwise (numbers/fixed). + self.max_table_width: The total available width for the table (in abstract units). + + Returns: + A dictionary mapping column names to their optimized widths (in abstract units). + + Raises: + ValueError: If a column in the DataFrame is not found in the breakable_cols mapping. + + Gemini code + """ + PAD = 0 # left right padding of one (certainly in mixed_grid) + # df we will work on: this has had all formatting applied (??string pruning?) + df = df if df is not None else self.df + # all dtypes should be object + assert all([i == object for i in df.dtypes.values]) + + col_widths = {} + # The absolute minimum width each column can take (e.g., longest word for text) + min_possible_widths = {} + # The width if content didn't wrap (single line) + # Series=dict colname->max width of cells in column + ideal_widths = (PAD + df.map(len).max(axis=0)).to_dict() + # map break penalties to True (strings) / False (numbers and dates) + if all_breakable: + breakable_cols = dict(zip(df.columns, [True] * len(df.columns))) + else: + breakable_cols = dict(zip(df.columns, [True if i < 5 else False for i in self.break_penalties])) + + # 1. Calculate ideal (no wrap) and minimum possible widths for all columns + for col_name in df.columns: + if col_name not in breakable_cols: + raise ValueError(f"Column '{col_name}' not found in breakable_cols mapping. Please provide a boolean for every column.") + + max_len = ideal_widths[col_name] + + if breakable_cols[col_name]: + # For breakable text, min width is the longest word, or a small default + # Estimate the minimum unbreakable width for a text column. + min_possible_widths[col_name] = ( + df[col_name].str + .split(pat='[^\w]', regex=True, expand=True) + .fillna('') + .map(len) + .max(axis=1) + .max() + ) + PAD + else: + # For non-breakable content, min width is its ideal width + min_possible_widths[col_name] = max_len + + # Ensure a minimum width of 1 unit for all columns, even if content is empty + if min_possible_widths[col_name] == 0: + min_possible_widths[col_name] = 1 + + total_ideal_width = sum(ideal_widths.values()) + total_min_possible_width = sum(min_possible_widths.values()) + + # 2. Distribute width based on self.max_table_width + if total_ideal_width <= self.max_table_width: + # We have enough space for ideal widths (no wrapping). + # Assign ideal widths and distribute any remaining space proportionally. + col_widths = {col: ideal_widths[col] for col in df.columns} + + # DON'T EXPAND + # remaining_space = self.max_table_width - total_ideal_width + + # if remaining_space > 0 and total_ideal_width > 0: + # # Distribute remaining space proportionally to current ideal widths + # proportion_factor = remaining_space / total_ideal_width + # for col in df.columns: + # col_widths[col] += col_widths[col] * proportion_factor + # elif remaining_space > 0 and total_ideal_width == 0 and len(df.columns) > 0: + # # Handle case where all ideal widths are zero (e.g., empty DataFrame) + # # Distribute space equally + # equal_share = self.max_table_width / len(df.columns) + # for col in df.columns: + # col_widths[col] = equal_share + + else: + # We need to shrink. Total ideal width exceeds the constraint. + # This is where the heuristic comes in. + + if self.max_table_width < total_min_possible_width: + # The constraint is tighter than even the absolute minimums. + # In this case, we have to scale down even the minimums. This will + # likely lead to content truncation or severe wrapping. + if total_min_possible_width > 0: + scale_factor = self.max_table_width / total_min_possible_width + for col in df.columns: + col_widths[col] = min_possible_widths[col] * scale_factor + elif len(df.columns) > 0: # All min widths are zero, distribute equally + equal_share = self.max_table_width / len(df.columns) + for col in df.columns: + col_widths[col] = equal_share + else: # No columns to distribute width to + return {} # Empty dictionary + else: + # We can fit the minimums, but not all ideals. + # Assign minimum widths first. + col_widths = {col: min_possible_widths[col] for col in df.columns} + remaining_space = self.max_table_width - total_min_possible_width + + # Identify columns that can expand from their minimums up to their ideal widths. + expandable_cols_capacity = { + col: ideal_widths[col] - min_possible_widths[col] + for col in df.columns + if ideal_widths[col] > min_possible_widths[col] + } + total_expandable_capacity = sum(expandable_cols_capacity.values()) + + if remaining_space > 0 and total_expandable_capacity > 0: + # Distribute the `remaining_space` among expandable columns. + # We distribute proportionally based on their *capacity to expand*. + # This ensures columns that *need* more space (to reach ideal) get more of the available extra space. + distribute_factor = min(1.0, remaining_space / total_expandable_capacity) + + for col in df.columns: + if col in expandable_cols_capacity: + col_widths[col] += expandable_cols_capacity[col] * distribute_factor + + # Round widths to a sensible number of decimal places for practical use + for col in col_widths: + col_widths[col] = round(col_widths[col], 0) + + _debug = pd.DataFrame({ + 'break_penalties': self.break_penalties, + 'breakable_cols': breakable_cols.values(), + 'min_possible_widths': min_possible_widths.values(), + 'ideal_widths': ideal_widths.values(), + 'col_widths': col_widths.values(), + }, index=df.columns) + # _debug.loc['total'] = _debug.sum(axis=0) + try: + _debug.loc['total', :] = _debug.sum(0) + except: + _debug.loc['total', :] = np.nan + self._debug_col_widths = _debug + + return col_widths + def __str__(self): """String representation, for print().""" + if self.df.empty: + return "" + # need to run twice: for df and headers + + df_dummy = self.df.copy() + df_dummy = df_dummy.iloc[:1] + df_dummy.iloc[0] = df_dummy.columns + # TODO assumes all column headers are strings, which is broadly true + colw_hd = self.optimize_column_widths(df_dummy, all_breakable=True) + temp1 = self._debug_col_widths['col_widths'].values + temp2 = self._debug_col_widths['ideal_widths'].values + temp3 = self._debug_col_widths['min_possible_widths'] + + # print(colw_hd) + + colw_df = self.optimize_column_widths() + self._debug_col_widths['headers_cw'] = temp1 + self._debug_col_widths['headers_ideal'] = temp2 + self._debug_col_widths['headers_min'] = temp3.values + + # print(colw_df) + # strip off leading grt- prefix from aligners + dfa = [i[4:] for i in self.df_aligners] + + col_mx = [max(colw_df[i], temp3[i]) for i in self.df.columns] + + return self.df.to_markdown( + index=False, # self.show_index, + colalign=dfa, + tablefmt=self.str_table_fmt, + maxcolwidths=col_mx, + maxheadercolwidths=col_mx, + # maxcolwidths=[colw_df.get(i) for i in self.df.columns], + # maxheadercolwidths=[colw_df.get(i) for i in self.df.columns], + ) + + def __OLDstr__(self): + """String representation, for print().""" + if self.df.empty: + return "" df = self.df # strip off grt- dfa = [i[4:] for i in self.df_aligners] @@ -681,16 +892,22 @@ class GT(object): if len(lens): m = lens.mean() s = lens.std() + x = lens.max() else: - m, s = 1, 0 - cw = min(m + s, lens.max(), np.percentile(lens, 75)) - colw[c] = np.round(cw, 0) + m, s, x = 1, 0, 100 + if x <= 20: + # don't be silly about trimming relatively short columns + colw[c] = x + else: + cw = min(m + s, lens.max(), np.percentile(lens, 75)) + colw[c] = np.round(cw, 0) total_width = sum(colw.values()) - if total_width > self.str_max_width: - scale = self.str_max_width / total_width + scale = 1 + if total_width > self.max_table_width: + scale = self.max_table_width / total_width for k, v in colw.items(): colw[k] = max(1, np.round(colw[k] * scale, 0)) - print(sum(colw.values()), colw) + print(f'{scale=}, {sum(colw.values())=}', colw) return df.to_markdown( index=self.show_index, colalign=dfa, @@ -852,7 +1069,7 @@ class GT(object): colw, tabs = GT.estimate_column_widths( self.df, nc_index=self.nindex, scale=1, equal=self.equal) if self.debug: - print(f'Input {self.tabs=}\nComputed {tabs=}') + print(f'Make html Input {self.tabs=}\nComputed {tabs=}') if self.tabs is not None: if len(tabs) == len(self.tabs): tabs = self.tabs @@ -1250,7 +1467,7 @@ class GT(object): # estimate... originally called guess_column_widths, with more parameters colw, tabs = GT.estimate_column_widths(df, nc_index=nc_index, scale=scale, equal=self.equal) # noqa if self.debug: - print(f'Input {self.tabs=}\nComputed {tabs=}') + print(f'Make TikZ Input {self.tabs=}\nComputed {tabs=}') if self.tabs is not None: if len(tabs) == len(self.tabs): tabs = self.tabs