mirror of
https://github.com/wassname/greater_tables_project.git
synced 2026-06-27 17:30:44 +08:00
983 lines
32 KiB
Python
983 lines
32 KiB
Python
"""
|
||
Text length, column width balancing and other utilities.
|
||
"""
|
||
|
||
import html
|
||
from io import StringIO
|
||
import logging
|
||
import re
|
||
from textwrap import wrap
|
||
|
||
import pandas as pd
|
||
from pybtex.textutils import width
|
||
from rich import box
|
||
from rich.table import Table
|
||
|
||
from .formats import GT_Format, TableFormat, Line, DataRow
|
||
|
||
|
||
__all__ = [
|
||
"MD2DF",
|
||
"Escaping",
|
||
"TextLength",
|
||
"Sparsify",
|
||
"Indexing",
|
||
"Width",
|
||
"TextOutput",
|
||
"RichOutput",
|
||
]
|
||
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class MD2DF:
    """Convert a markdown pipe table (with optional caption) to a DataFrame."""

    @staticmethod
    def md_to_df(txt):
        """
        Convert a markdown text table to a DataFrame.

        :param txt: text holding a pipe table and, optionally, a caption line
            such as ``Table: caption {#tbl:label}`` or ``: caption``.
        :return: tuple ``(df, aligners, caption, label)``. ``df`` holds the
            stripped cell strings, with the first table row used as the column
            names. ``aligners`` is a string of ``l``/``c``/``r`` codes read
            from the alignment row, or None when no cell in that row carries a
            colon. ``caption`` and ``label`` are ``""`` when absent.
        :raises ValueError: if no table is found, or the table has no second
            (alignment) row.
        """
        # extract table and optional caption part
        table, caption = MD2DF.parse_markdown_table_and_caption(txt)
        label = ""
        m = re.search(r"\{#(tbl[:a-zA-Z0-9_-]+)\}", caption)
        if m:
            label = m.group(1)
            # remove the label markup from the caption
            caption = caption.replace(f"{{#{label}}}", "").strip()
        if table == "":
            raise ValueError("Bad markdown table")

        # remove starting and ending | in each line (optional anyway)
        rows = re.sub(r"^\||\|$", "", table, flags=re.MULTILINE).split("\n")
        if len(rows) < 2:
            # a lone header line has no alignment row to pop below
            raise ValueError("Bad markdown table")

        # remove starting and ending *'s added by hand - but try to avoid
        # * within headings!
        rows[0] = "|".join(
            re.sub(r"^\*\*?|\*\*?$", "", cell.strip()) for cell in rows[0].split("|")
        )

        # remove the alignment row and decode it
        codes = []
        for spec in rows.pop(1).split("|"):
            # strip first: "| :--- | ---: |" leaves blanks around each cell
            spec = spec.strip()
            starts, ends = spec.startswith(":"), spec.endswith(":")
            if starts and ends:
                codes.append("c")
            elif starts:
                codes.append("l")
            elif ends:
                codes.append("r")
            # cells without a colon carry no alignment info and add nothing
        aligners = "".join(codes) or None

        cells = [[cell.strip() for cell in row.split("|")] for row in rows]
        # transpose, index by the header row, transpose back: the first table
        # row becomes the column labels
        df = pd.DataFrame(cells).T.set_index(0).T
        return df, aligners, caption, label

    @staticmethod
    def parse_markdown_table_and_caption(txt: str) -> tuple[str, str]:
        """
        Split text into its markdown table and its optional caption.

        Handles the case where only the caption is present.

        Args:
            txt: The input string.

        Returns:
            A tuple ``(table, caption)`` of stripped strings; each is ``""``
            when the corresponding part is not found.
        """
        table_match = re.search(r"((?:\|.*\|\s*(?:\n|$))+)", txt, re.DOTALL)
        caption_match = re.search(
            r"^(?:table)?:\s*(.+)", txt, re.MULTILINE | re.IGNORECASE
        )
        table_part = table_match.group(1) if table_match else ""
        caption_part = caption_match.group(1) if caption_match else ""
        return table_part.strip(), caption_part.strip()
|
||
|
||
|
||
class Escaping:
    """Escape html and tex within tables."""

    # math environments that TeX escaping must leave alone: $$...$$, $...$, \[...\]
    _MATH_PATTERN = re.compile(r"(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\])", re.DOTALL)
    # one-pass replacement of backslash, % and _ (used within tikz tables)
    _TEX_ESCAPES = str.maketrans(
        {"\\": r"\textbackslash{}", "%": r"\%", "_": r"\_"}
    )

    @staticmethod
    def clean_name(n):
        """
        Escape underscores and % in a name, outside of ``$...$`` spans.

        Called by Tikz routines.

        :param n: input name; anything that is not a str is converted with
            ``str`` and returned unescaped.
        :return: the escaped string.
        """
        if not isinstance(n, str):
            return str(n)
        # splitting on $ puts text outside dollars at even positions
        return "$".join(
            piece if pos % 2 else piece.replace("_", "\\_").replace("%", "\\%")
            for pos, piece in enumerate(n.split("$"))
        )

    @staticmethod
    def clean_index(df):
        """
        Escape _ and % in the row and column labels of df.

        Subscripts inside ``$...$`` TeX formulas are left alone.

        :param df: DataFrame whose labels are to be escaped.
        :return: a renamed DataFrame.
        """
        return df.rename(index=Escaping.clean_name, columns=Escaping.clean_name)

    @staticmethod
    def clean_html_tex(text):
        r"""
        Clean TeX entries in HTML: $ -> \( and \) and $$ to \[ \].

        Apply after all other HTML rendering steps. HTML rendering only.
        """
        # display math first, so the inline rule never sees a $$ pair
        text = re.sub(r"\$\$(.*?)\$\$", r"\\[\1\\]", text, flags=re.DOTALL)
        # Convert inline math: $...$ → \(...\)
        return re.sub(r"(?<!\$)\$(.*?)(?<!\\)\$(?!\$)", r"\\(\1\\)", text)

    @staticmethod
    def escape_tex_outside_math(text):
        r"""
        Escape backslash, % and _ in text, leaving math environments untouched.

        Math environments are ``$...$``, ``$$...$$`` and ``\[...\]``.

        :param text: value to escape; non-strings are returned unchanged.
        :return: the escaped string.
        """
        if not isinstance(text, str):
            return text
        pieces = []
        cursor = 0
        for m in Escaping._MATH_PATTERN.finditer(text):
            start, end = m.span()
            pieces.append(text[cursor:start].translate(Escaping._TEX_ESCAPES))
            pieces.append(m.group())  # math part, unescaped
            cursor = end
        pieces.append(text[cursor:].translate(Escaping._TEX_ESCAPES))
        return "".join(pieces)

    @staticmethod
    def _escape_index(idx):
        """Return a copy of idx with its labels and names TeX-escaped."""
        esc = Escaping.escape_tex_outside_math
        if isinstance(idx, pd.MultiIndex):
            return pd.MultiIndex.from_tuples(
                [tuple(esc(x) for x in tup) for tup in idx],
                names=[esc(n) if n else n for n in idx.names],
            )
        # NOTE(review): a falsy name (0 or "") becomes None here, whereas the
        # MultiIndex branch keeps it; confirm which behaviour is wanted.
        return pd.Index(
            [esc(x) for x in idx],
            name=esc(idx.name) if idx.name else None,
        )

    @staticmethod
    def escape_df_tex(df):
        """
        Return a copy of df with cells, index and columns TeX-escaped.

        :param df: input DataFrame; it is not modified.
        :return: new DataFrame escaped with ``escape_tex_outside_math``.
        """
        # DataFrame.map arrived in pandas 2.1; older versions only have applymap
        elementwise = df.map if hasattr(df, "map") else df.applymap
        df = elementwise(Escaping.escape_tex_outside_math)
        df.index = Escaping._escape_index(df.index)
        df.columns = Escaping._escape_index(df.columns)
        return df
|
||
|
||
|
||
class TextLength:
    """Estimate length in em of displayed text."""

    # TeX control sequence display widths (heuristic)
    # NOTE(review): sum, prod and int are also listed in TEX_WIDE, but the
    # simple-glyph test runs first, so they are measured as simple glyphs.
    # Confirm which width is intended.
    TEX_SIMPLE_GLYPHS = {
        "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
        "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
        "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
        "infty", "sum", "prod", "int", "cup", "cap", "vee", "wedge",
        "forall", "exists", "neg", "leq", "geq", "neq", "approx",
        "to", "leftarrow", "rightarrow",
    }
    TEX_WIDE = {"frac", "sqrt", "sum", "int", "prod"}
    TEX_SPACING = {"quad", "qquad", ",", ";", " ", "!"}
    # per-character width estimates in em
    _WIDTH_TABLE = {
        "a": 0.444, "b": 0.5, "c": 0.444, "d": 0.5, "e": 0.444, "f": 0.333,
        "g": 0.5, "h": 0.5, "i": 0.278, "j": 0.278, "k": 0.5, "l": 0.278,
        "m": 0.778, "n": 0.5, "o": 0.5, "p": 0.5, "q": 0.5, "r": 0.333,
        "s": 0.389, "t": 0.278, "u": 0.5, "v": 0.5, "w": 0.722, "x": 0.5,
        "y": 0.5, "z": 0.444,
        "A": 0.722, "B": 0.667, "C": 0.667, "D": 0.722, "E": 0.611,
        "F": 0.556, "G": 0.722, "H": 0.722, "I": 0.333, "J": 0.389,
        "K": 0.722, "L": 0.611, "M": 0.889, "N": 0.722, "O": 0.722,
        "P": 0.556, "Q": 0.722, "R": 0.667, "S": 0.556, "T": 0.611,
        "U": 0.722, "V": 0.722, "W": 0.944, "X": 0.722, "Y": 0.722,
        "Z": 0.611,
        "0": 0.5, "1": 0.5, "2": 0.5, "3": 0.5, "4": 0.5,
        "5": 0.5, "6": 0.5, "7": 0.5, "8": 0.5, "9": 0.5,
        ".": 0.25, ",": 0.25, ":": 0.278, ";": 0.278,
        "(": 0.333, ")": 0.333, "[": 0.333, "]": 0.333,
        "’": 0.333, '"': 0.444, "!": 0.333, "?": 0.444,
        " ": 0.25, "|": 0.2, "‘": 0.333,
        "{": 0.48, "}": 0.48,
        "-": 0.5,  # 0.333,
    }
    # width used for any character missing from _WIDTH_TABLE
    _DEFAULT_WIDTH = 0.6

    @staticmethod
    def approximate_char_width_em(c: str) -> float:
        """Estimate character width in em."""
        return TextLength._WIDTH_TABLE.get(c, TextLength._DEFAULT_WIDTH)

    @staticmethod
    def text_display_len(s: str) -> float:
        """
        Estimate the display width of s in ems.

        HTML tags are ignored, HTML entities are decoded and ``$...$`` /
        ``$$...$$`` spans are measured as TeX math.

        :param s: text to measure; non-strings (e.g. index values) are
            converted with ``str`` first.
        :return: estimated width in em.
        """
        if not isinstance(s, str):
            s = str(s)
        char_width = TextLength.approximate_char_width_em

        # fast path: nothing that needs TeX or HTML interpretation
        if "$" not in s and "<" not in s and "&" not in s:
            return sum(char_width(c) for c in s)

        total = 0.0
        # the two capture groups make re.split return None for the group
        # that did not take part in a match
        for part in re.split(r"(\$\$.*?\$\$)|(\$.*?\$)", s):
            if part is None:
                continue
            if part.startswith("$$") and part.endswith("$$"):
                total += TextLength.estimate_math_width(part[2:-2])
            elif part.startswith("$") and part.endswith("$"):
                total += TextLength.estimate_math_width(part[1:-1])
            else:
                visible = html.unescape(re.sub(r"<[^>]*>", "", part))
                total += sum(char_width(c) for c in visible)
        return total

    @staticmethod
    def estimate_math_width(tex: str) -> float:
        r"""
        Estimate the display width in em of a TeX math fragment.

        :param tex: math source without the surrounding dollar signs.
        :return: heuristic width; grouping braces count as zero, ``^`` and
            ``_`` as 0.3, and control sequences by the TEX_* sets.
        """
        # control words (\alpha), then control symbols (\, \; \{ \%), then
        # single characters; without the middle alternative the symbol
        # entries of TEX_SPACING could never match a token
        tokens = re.findall(r"\\[a-zA-Z]+|\\.|[a-zA-Z0-9]|.", tex)
        total = 0.0
        for tok in tokens:
            if tok.startswith("\\"):
                name = tok[1:]
                if name in TextLength.TEX_SIMPLE_GLYPHS:
                    total += 0.6
                elif name in TextLength.TEX_WIDE:
                    total += 1.5
                elif name in TextLength.TEX_SPACING:
                    total += 0.4
                elif len(name) == 1 and not name.isalpha():
                    # escaped symbol such as \{ or \%: the symbol itself shows
                    total += TextLength.approximate_char_width_em(name)
                else:
                    total += 1.0  # unknown control sequences
            elif tok in ("{", "}"):
                continue  # grouping only
            elif tok in ("^", "_"):
                total += 0.3  # assume sub/superscript takes some space
            else:
                total += TextLength.approximate_char_width_em(tok)
        return total
|
||
|
||
# original
|
||
# @staticmethod
|
||
# def text_display_len(s: str) -> int:
|
||
# """Estimate text display length in em of a string allowing for TeX constructs."""
|
||
# # note you DO WANT SPACES! So, no strip applied ever.
|
||
# if s.find('$') < 0:
|
||
# return len(s)
|
||
# parts = re.split(r'(\$\$.*?\$\$)|(\$.*?\$)', s)
|
||
# total = 0
|
||
# for part in parts:
|
||
# if part is None:
|
||
# continue
|
||
# if part.startswith('$$') and part.endswith('$$'):
|
||
# total += TextLength._estimate_math_width(part[2:-2])
|
||
# elif part.startswith('$') and part.endswith('$'):
|
||
# total += TextLength._estimate_math_width(part[1:-1])
|
||
# else:
|
||
# total += len(part)
|
||
# return total
|
||
|
||
# @staticmethod
|
||
# def _estimate_math_width(tex: str) -> int:
|
||
# tokens = re.findall(r'\\[a-zA-Z]+|[a-zA-Z0-9]|.', tex)
|
||
# width = 0
|
||
# for tok in tokens:
|
||
# if tok.startswith('\\'):
|
||
# name = tok[1:]
|
||
# if name in TextLength.TEX_SIMPLE_GLYPHS:
|
||
# width += 1
|
||
# elif name in TextLength.TEX_WIDE:
|
||
# width += 3
|
||
# elif name in TextLength.TEX_SPACING:
|
||
# width += 1
|
||
# else:
|
||
# width += 2 # unknown control sequences
|
||
# elif tok in '{}^_':
|
||
# continue # grouping, sub/superscripts: ignore
|
||
# else:
|
||
# width += 1
|
||
# return width
|
||
|
||
|
||
class Sparsify:
    """Sparsify multiindex rows and columns."""

    @staticmethod
    def sparsify(df, cs):
        """
        Blank out repeated leading values in the columns cs.

        A cell in column ``cs[i]`` is replaced by ``""`` when the row agrees
        with the previous row on all of ``cs[:i + 1]``.

        :param df: input DataFrame; it is not modified.
        :param cs: list of column labels, outermost level first.
        :return: a sparsified copy of df.
        """
        out = df.copy()
        for i, c in enumerate(cs):
            prefix = df[cs[: i + 1]]
            changed = prefix.ne(prefix.shift()).any(axis=1)
            out.loc[~changed, c] = ""
        return out

    @staticmethod
    def sparsify_mi(mi, bottom_level=False):
        """
        Sparsify the values of one multi-index level.

        As ``sparsify``, but for a bare sequence of level values, without the
        benefit of the index.

        :param mi: sequence of level values.
        :param bottom_level: for the lowest level: all values are kept, no
            sparsification.
        :return: tuple ``(values, rules)`` where repeated values are replaced
            by ``""`` and ``rules`` lists the positions at which a kept value
            starts. An empty input gives ``([], [])``.
        """
        values = list(mi)
        if not values:
            return [], []
        rules = []
        last = values[0]
        for pos in range(1, len(values)):
            if values[pos] == last and not bottom_level:
                values[pos] = ""
            else:
                last = values[pos]
                rules.append(pos)
        return values, rules
|
||
|
||
|
||
class Indexing:
    """Find the column or index level at which consecutive rows differ."""

    @staticmethod
    def changed_column(bit):
        """
        Return, per row, the first column whose value differs from the row above.

        Rows identical to their predecessor are dropped from the result.
        """
        differs = bit.ne(bit.shift())
        has_change = differs.any(axis=1)
        return differs.loc[has_change].idxmax(axis=1)

    @staticmethod
    def changed_level(idx):
        """
        Return, per row, the first index level that differs from the row above.

        Levels are reported by position (0, 1, ...), not by name.
        """
        # work on a copy, otherwise renaming would alter the caller's index
        positional = idx.copy()
        positional.names = list(range(positional.nlevels))
        # as a frame, each level is a column labelled by its position
        return Indexing.changed_column(positional.to_frame(index=False))
|
||
|
||
|
||
class Width:
    """Adjust column widths based on heading widths."""

    @staticmethod
    def _wrap_outcome(text, width):
        """
        Return ``(has_intra_word_break, num_lines)`` for text wrapped at width.

        A word counts as broken when the words recovered from the wrapped
        lines differ from the words of the original text.
        """
        # a falsy width skips textwrap; the text is then treated character
        # by character, exactly as a bare string iterates
        wrapped = wrap(text, width=width) if width else text
        rebuilt_words = " ".join(wrapped).split()
        return text.split() != rebuilt_words, len(wrapped)

    @staticmethod
    def header_adjustment(df, min_widths, space, max_extra):
        """
        Fine-adjust heading widths for optimal spacing.

        Return a dict with per-column recommended width adjustments to avoid
        intra-word breaks and reduce overall header height.

        Parameters:
            df: DataFrame with 1-level column names; columns whose name is
                not a str are skipped and get an adjustment of 0.
            min_widths: dict of column name -> minimal acceptable width
            space: amount of space available to be allocated
            max_extra: max extra characters to consider allocating per column

        Returns:
            dict: column -> additional width to allocate
        """
        colnames = list(df.columns)
        adjustments = {col: 0 for col in colnames}
        # rows of [col name, extra space, resulting number of header lines]
        options = []

        # First pass: avoid ugly intraword breaks
        for col in colnames:
            if not isinstance(col, str):
                continue
            base_width = min_widths[col]
            broken, n_lines = Width._wrap_outcome(col, base_width)
            if not broken:
                options.append([col, 0, n_lines])
                # nothing to be gained, move to next col
                continue
            # longest word decides the least width that avoids breaking it;
            # computed only for str names so other names cannot reach re.split
            longest_word = max(len(word) for word in re.split(r"[ \-/]", col))
            extra0 = max(0, longest_word - base_width)
            if extra0 > max_extra:
                # ok, can't flatten word because it is too long
                extra0 = 0
            elif extra0 == max_extra:
                # go with that
                adjustments[col] = max_extra
                continue
            # see if col can be flattened within max_extra chars, starting
            # at extra0, which is enough to avoid intraword breaks
            for extra in range(extra0, max_extra + 1):
                broken, n_lines = Width._wrap_outcome(col, base_width + extra)
                if not broken:
                    options.append([col, extra, n_lines])
                    if adjustments[col] == 0:
                        # take first, but compute rest...
                        adjustments[col] = extra

        # input for the optimal heading search
        input_df = pd.DataFrame(options, columns=["col", "extra", "num_lines"])
        # min amount to avoid intra word breaks
        avoid_intra = input_df.groupby("col").min().extra.sum()
        if avoid_intra >= space:
            # that's all we can do
            logger.warning(
                "Insufficient space to avoid ugly wraps -> NO FURTHER IMPROVEMENTS"
            )
        else:
            # can try for a better solution
            sol = Width.optimal_heading(input_df, space)
            adjustments.update(sol[1])
            logger.info("best solution: %s", sol)
        return adjustments

    @staticmethod
    def optimal_heading(
        input_df: pd.DataFrame, total_es_budget: int
    ) -> tuple[int, dict[str, int]]:
        """
        Allocate extra space so the tallest heading has as few lines as possible.

        Binary-searches the target number of lines. For a given target, each
        column takes its smallest ``extra`` whose ``num_lines`` does not
        exceed the target. The target is feasible when every column has such
        an option and the extras sum to at most the budget. A larger target
        can only admit more options per column, so feasibility is monotone,
        which is what the binary search relies on.

        Args:
            input_df: DataFrame with 'col', 'extra', 'num_lines'.
            total_es_budget: The total extra space to allocate.

        Returns:
            A tuple: (min_max_lines, optimal_extra_allocation_per_column).
            ``(0, {})`` for an empty input. When no target is feasible the
            largest ``num_lines`` is returned with an empty allocation.
        """
        # per column: (extra, num_lines) options ordered by increasing extra
        options_by_col = {}
        for col_name in input_df["col"].unique().tolist():
            col_data = input_df[input_df["col"] == col_name].sort_values(by="extra")
            options_by_col[col_name] = list(
                zip(col_data["extra"], col_data["num_lines"])
            )

        def cheapest_allocation(target_max_lines):
            """Return the least-extra allocation meeting the target, or None."""
            allocation = {}
            for col_name, col_options in options_by_col.items():
                extra = next(
                    (e for e, n in col_options if n <= target_max_lines), None
                )
                if extra is None:
                    return None
                allocation[col_name] = extra
            if sum(allocation.values()) > total_es_budget:
                return None
            return allocation

        all_num_lines = input_df["num_lines"].unique()
        if len(all_num_lines) == 0:
            return 0, {}

        lo, hi = all_num_lines.min(), all_num_lines.max()
        optimal_max_lines, best_allocation = hi, {}
        while lo <= hi:
            mid = lo + (hi - lo) // 2
            allocation = cheapest_allocation(mid)
            if allocation is not None:
                # feasible: remember it and try for fewer lines
                optimal_max_lines, best_allocation = mid, allocation
                hi = mid - 1
            else:
                lo = mid + 1
        return optimal_max_lines, best_allocation
|
||
|
||
|
||
class TextOutput:
    """Convert dataframe to text, replaces pd.DataFrame.to_markdown."""

    @staticmethod
    def make_text_table(
        df: pd.DataFrame,
        data_col_widths: list[int],
        data_col_aligns: list[str],
        *,
        index_levels: int = 1,
        fmt: TableFormat = GT_Format,
    ) -> str:
        """
        Render df as a wrapped, boxed table.

        Output like tabulate's mixed_grid with support for:
        - Multi-level column headers (always shown, bottom-aligned, can wrap)
        - Split index vs. body section with a distinct vertical separator
        - Per-column width and alignment
        - Wrapped body cells with top alignment

        This module provides functionality similar to tabulate with greater
        control over column widths and the ability to demark the index
        columns.

        Args:
            df: frame to render; every column is printed, so index levels
                are expected to be ordinary leading columns.
            data_col_widths: text width of each column, excluding padding.
            data_col_aligns: 'left' or 'center' per column; any other value
                right-aligns.
            index_levels: number of leading columns after which the index
                separator is drawn; 0 draws no index separator.
            fmt: table format supplying the padding, rules and row frames.

        Returns:
            str: A fully formatted table as a string (useful for print,
            logs, or files). The closing rule is written even when df has
            no rows.
        """
        lines = []
        pad = " " * fmt.padding
        justify = {"left": str.ljust, "center": str.center}

        def _format_cell(text, width, align):
            """Wrap text to width; return its padded, aligned lines."""
            width = int(width)
            place = justify.get(align, str.rjust)
            return [
                pad + place(line, width) + pad
                for line in (wrap(str(text), width=width) or [""])
            ]

        def _join(row_fmt, pieces):
            """Frame pieces with row_fmt's begin, separators and end."""
            parts = []
            for i, piece in enumerate(pieces):
                if index_levels and i == index_levels:
                    parts.append(row_fmt.index_sep)
                elif i > 0:
                    parts.append(row_fmt.sep)
                parts.append(piece)
            return f"{row_fmt.begin}{''.join(parts)}{row_fmt.end}"

        def _rule(line_fmt):
            """Build a full-width horizontal rule."""
            return _join(
                line_fmt,
                [line_fmt.hline * int(w + 2 * fmt.padding) for w in data_col_widths],
            )

        def _emit_row(row_fmt, cells, bottom_align):
            """Write one logical row, padding short cells with blank lines."""
            height = max(len(cell) for cell in cells)
            padded = []
            for cell, w in zip(cells, data_col_widths):
                filler = [" " * int(w + 2 * fmt.padding)] * (height - len(cell))
                padded.append(filler + cell if bottom_align else cell + filler)
            for i in range(height):
                lines.append(_join(row_fmt, [col[i] for col in padded]))

        col_levels = df.columns.nlevels
        col_tuples = df.columns if col_levels > 1 else [(c,) for c in df.columns]

        # header alignment is left in index and center in body
        header_aligns = [
            "left" if i < index_levels else "center"
            for i in range(len(data_col_aligns))
        ]
        lines.append(_rule(fmt.lineabove))
        # one bottom-aligned header row per column level
        for level in range(col_levels):
            level_texts = [str(t[level] if level < len(t) else "") for t in col_tuples]
            header_cells = [
                _format_cell(txt, w, a)
                for txt, w, a in zip(level_texts, data_col_widths, header_aligns)
            ]
            _emit_row(fmt.headerrow, header_cells, bottom_align=True)
            if level < col_levels - 1:
                lines.append(_rule(fmt.linebetweenrows))
        lines.append(_rule(fmt.linebelowheader))

        # itertuples keeps each column's own values; iterrows would first
        # coerce a row of mixed dtypes to one common dtype
        for row_idx, values in enumerate(df.itertuples(index=False, name=None)):
            if row_idx:
                lines.append(_rule(fmt.linebetweenrows))
            data_cells = [
                _format_cell(val, w, a)
                for val, w, a in zip(values, data_col_widths, data_col_aligns)
            ]
            _emit_row(fmt.datarow, data_cells, bottom_align=False)
        # always close the box, including for a frame without rows
        lines.append(_rule(fmt.linebelow))

        return "\n".join(lines) + "\n"
|
||
|
||
|
||
class RichOutput:
    """Render to a rich table."""

    @staticmethod
    def make_rich_table(
        df,
        column_widths,
        column_alignments=None,
        num_index_columns=0,
        title=None,
        show_lines=False,
        box_style=box.SIMPLE_HEAVY,
    ):
        """
        Build a Rich ``Table`` from a preformatted DataFrame.

        Parameters:
            df (pd.DataFrame): DataFrame with all string values.
            column_widths (dict or list): Widths by column name or position.
            column_alignments (dict or list): Alignments ('left', 'center',
                'right'); columns not mentioned are left-justified.
            num_index_columns (int): Number of left-most columns to treat as
                index-like (rendered dim).
            title (str): Optional title.
            show_lines (bool): Add row separator lines.
            box_style (rich.box.Box): Border style.

        Returns:
            The populated ``Table``.
        """
        names = list(df.columns)

        def _by_name(spec):
            # positional lists are re-keyed by column name; dicts pass through
            if isinstance(spec, list):
                return {names[pos]: item for pos, item in enumerate(spec)}
            return spec

        widths = _by_name(column_widths)
        aligns = {} if column_alignments is None else _by_name(column_alignments)

        table = Table(title=title, box=box_style, show_lines=show_lines, expand=True)

        for pos, name in enumerate(names):
            if pos < num_index_columns:
                body_style, head_style = "dim", "bold dim"
            else:
                body_style, head_style = None, "bold"
            table.add_column(
                header=str(name),
                width=widths.get(name, None),
                justify=aligns.get(name, "left"),
                style=body_style,
                header_style=head_style,
                no_wrap=False,
                overflow="fold",
                vertical="middle",
            )

        for _, record in df.iterrows():
            table.add_row(*record.tolist())

        return table
|
||
|
||
|
||
class SmartTitle:
    """Support reasonable Title case for text."""

    # words kept in lower case unless they open the title
    _SMALL_WORDS = frozenset(
        "a an and as at but by for in is of on or the to up via vs".split()
    )

    # TODO: Implement smart titling!
    @staticmethod
    def smart_title(text):
        """
        Return text in a simple title case.

        Words are split on whitespace and re-joined with single spaces.
        All-caps words of up to three characters are kept as they are, small
        words after the first are lower-cased, and every other word is
        capitalized with ``str.capitalize``.
        """

        def _recase(pos, word):
            if word.isupper() and len(word) <= 3:
                return word  # already acronym-like
            if pos != 0 and word.lower() in SmartTitle._SMALL_WORDS:
                return word.lower()
            return word.capitalize()

        return " ".join(_recase(pos, word) for pos, word in enumerate(text.split()))
|