mirror of
https://github.com/wassname/greater_tables_project.git
synced 2026-06-27 16:15:38 +08:00
5.0.0alpha rationalized widths, removed staticmethods from GT into utilities
This commit is contained in:
@@ -1,6 +1,22 @@
|
||||
Versions
|
||||
==========
|
||||
|
||||
5.0.0
|
||||
-------
|
||||
* Introduced ``knowledge_df`` as source of all formatting instructions
|
||||
* ``df_tex`` and ``df_html`` are computed before ``knowledge``, applying TeX- and HTML-specific transformations which are reflected in the estimated widths
|
||||
* Introduced gtutilities to pull out text width estimation, cleaning and escaping, etc.
|
||||
* Delete rich table output format?
|
||||
* Moved logging setup to a separate file; it is called from the CLI but is optional for use in other situations. GPT recommended approach.
|
||||
|
||||
* test cases
|
||||
* tex and or html in data, index, columns, escaped/unescaped
|
||||
* long cols
|
||||
* text with multiindex columns
|
||||
|
||||
* TODO
|
||||
* self.padl and r / 12 in make html width adj s/b elsewhere
|
||||
|
||||
4.0.0
|
||||
-------
|
||||
* Moved constants out of magic strings and into config
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__version__ = '4.0.0'
|
||||
__version__ = '5.0.0'
|
||||
__project__ = 'greater_tables'
|
||||
__author__ = 'Stephen J Mildenhall'
|
||||
|
||||
|
||||
@@ -1,6 +1,15 @@
|
||||
"""
|
||||
Command line interface: convert files to HTML, text, svg or pdf.
|
||||
"""
|
||||
|
||||
import click
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
from . gtlogging import setup_logging
|
||||
|
||||
setup_logging() # <-- must come before using your package
|
||||
|
||||
from .gtconfig import Configurator, write_template
|
||||
from .gtcore import GT
|
||||
|
||||
|
||||
@@ -85,10 +85,9 @@ class Configurator(BaseModel):
|
||||
padding_trbl: Optional[tuple[int, int, int, int]] = Field(
|
||||
None, description="Manual padding in the order (top, right, bottom, left)"
|
||||
)
|
||||
tex_to_html: Optional[Callable[[str], str]] = Field(
|
||||
default=None, description="Function to map non-math TeX to HTML, eg remap \\textbf{...}")
|
||||
|
||||
tikz_scale: float = Field(
|
||||
1.0, description="Scaling factor applied to LaTeX TikZ tables"
|
||||
)
|
||||
font_body: float = Field(
|
||||
0.9, description="Font size for body text (in em units)"
|
||||
)
|
||||
@@ -132,6 +131,9 @@ class Configurator(BaseModel):
|
||||
-1, description="Maximum length for stringified objects (e.g. nested DataFrames); -1 = unlimited"
|
||||
)
|
||||
|
||||
header_alignment: Literal["few", "center"] = Field(
|
||||
default='few', description="NYI!! TOOD Alignment of header cells, few=follow column, center=center." )
|
||||
|
||||
max_table_width: int = Field(
|
||||
200, description="Maximum table width for markdown/text output mode"
|
||||
)
|
||||
@@ -153,10 +155,13 @@ class Configurator(BaseModel):
|
||||
)
|
||||
|
||||
# tikz specific options
|
||||
tikz_scale: float = Field(
|
||||
1.0, description="Scaling factor applied to LaTeX TikZ tables"
|
||||
)
|
||||
tikz_column_sep: float = Field(
|
||||
0.5, description="Separation between columns")
|
||||
1, description="Separation between columns")
|
||||
tikz_row_sep: float = Field(
|
||||
0.125, description="Separation between rows")
|
||||
0.25, description="Separation between rows")
|
||||
tikz_container_env: Literal["table", "figure", "sidewaysfigure"] = Field(
|
||||
default="table",
|
||||
description="Type of element: 'table', 'figure', or 'sidewaysfigure'"
|
||||
@@ -175,7 +180,8 @@ class Configurator(BaseModel):
|
||||
'', description="non-line commands put at bottom of table")
|
||||
tikz_latex: Optional[str] = Field(
|
||||
None, description="arguments at top of table \\begin{table}[tikz_latex]")
|
||||
|
||||
tikz_escape_tex: Optional[bool] = Field(
|
||||
True, description="If true, escape non-math TeX like %, _ and \\. Default no escaping, allowing \\textbf{...} to work.")
|
||||
# meta
|
||||
debug: bool = Field(
|
||||
False, description="Run in debug mode with more reporting, include internal ID in caption and use colored output lines")
|
||||
|
||||
+440
-1204
File diff suppressed because it is too large
Load Diff
@@ -6,6 +6,7 @@ Good for testing. Outputs are cached by hash. PDF→SVG uses pdf2svg.
|
||||
GPT re-write of my old great2.blog code.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from subprocess import run, Popen, PIPE
|
||||
@@ -14,6 +15,9 @@ from IPython.display import SVG, display
|
||||
from .gthasher import txt_short_hash
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Etcher:
|
||||
"""Create PDF and SVG files from Tikz blocks."""
|
||||
# Full TeX preamble to generate a .fmt if needed
|
||||
@@ -76,13 +80,13 @@ class Etcher:
|
||||
'&pdflatex',
|
||||
tmp.name,
|
||||
]
|
||||
print(f'Running {" ".join(cmd)} to build format file...')
|
||||
logger.info(f'Running {" ".join(cmd)} to build format file...')
|
||||
(self.file_path.parent / 'make_format.bat').write_text(" ".join(cmd), encoding='utf-8')
|
||||
self.run_command(cmd, raise_on_error=True, cwd=self.out_path)
|
||||
# tidy up ... to some extent
|
||||
# tmp.unlink()
|
||||
(self.out_path / f'{self.format_file.stem}.log').unlink()
|
||||
print('...success...format file built', self.format_file.resolve())
|
||||
# the path is supplied as a lazy %-style argument, so the message needs a placeholder for it
logger.info('...success...format file built %s', self.format_file.resolve())
|
||||
|
||||
def process_tikz(self):
|
||||
"""Compile TikZ to PDF and convert to SVG."""
|
||||
@@ -109,7 +113,7 @@ class Etcher:
|
||||
]
|
||||
(tex_path.parent / 'make_tikz.bat').write_text(" ".join(tex_cmd), encoding='utf-8')
|
||||
if self.debug:
|
||||
print("Running:", " ".join(tex_cmd))
|
||||
# the command is supplied as a lazy %-style argument, so the message needs a placeholder for it
logger.info("Running: %s", " ".join(tex_cmd))
|
||||
if self.run_command(tex_cmd):
|
||||
raise ValueError('TeX failed to compile, not pdf or svg output.')
|
||||
# no tidying up
|
||||
@@ -122,7 +126,7 @@ class Etcher:
|
||||
str(svg_path)
|
||||
]
|
||||
if self.debug:
|
||||
print("Running:", " ".join(svg_cmd))
|
||||
# the command is supplied as a lazy %-style argument, so the message needs a placeholder for it
logger.info("Running: %s", " ".join(svg_cmd))
|
||||
self.run_command(svg_cmd, raise_on_error=True)
|
||||
|
||||
if not self.debug:
|
||||
@@ -140,15 +144,15 @@ class Etcher:
|
||||
with Popen(command, cwd=cwd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
|
||||
stdout, stderr = p.communicate()
|
||||
if stdout and self.debug:
|
||||
print('Run command output ends\n', stdout.strip()[-250:])
|
||||
# log only the tail of stdout; it is supplied as a lazy %-style argument, hence the placeholder
logger.info('Run command output ends\n%s', stdout.strip()[-250:])
|
||||
if stdout:
|
||||
if stdout.find('no output PDF file produced') > 0:
|
||||
print("ERROR no pdf output\n"*5)
|
||||
logger.error("ERROR no pdf output\n"*5)
|
||||
return -1
|
||||
if stderr:
|
||||
if raise_on_error:
|
||||
raise RuntimeError(stderr.strip())
|
||||
else:
|
||||
print(stderr.strip())
|
||||
logger.error(stderr.strip())
|
||||
return -2
|
||||
return 0
|
||||
|
||||
@@ -6,6 +6,7 @@ from collections import deque
|
||||
from datetime import datetime, timedelta
|
||||
from importlib.resources import files
|
||||
from itertools import cycle, chain, count, zip_longest, product, islice
|
||||
# import logging
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
@@ -16,7 +17,8 @@ import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from IPython.display import display
|
||||
|
||||
# logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Fabricator:
|
||||
@@ -62,7 +64,8 @@ class Fabricator:
|
||||
with data_path.open('r', encoding='utf-8') as f:
|
||||
tex_list = pd.read_csv(f, index_col=0)['expr'].to_list()
|
||||
# trim down slightly
|
||||
pat = re.compile(r'(?<!\\)\b[a-z]{4,}\b')
|
||||
# dont' want | in tex...messes up tables!
|
||||
pat = re.compile(r'(?<!\\)\b[a-z]{4,}\b|\|')
|
||||
tex_list = [i for i in tex_list if not pat.search(i) and len(i)<=50]
|
||||
self.rng.shuffle(tex_list)
|
||||
self._tex_gen = cycle(tex_list)
|
||||
@@ -113,7 +116,10 @@ class Fabricator:
|
||||
df = df.droplevel(drop_levels, axis=1)
|
||||
return df
|
||||
|
||||
def make(self, rows, data_spec, *, index_levels=1, index_names=None, column_groups=1, column_levels=1, column_names=None, decorate=False, simplify=True, oversample=1):
|
||||
def make(self, rows, data_spec, *, index_levels=1, index_names=None,
|
||||
column_groups=1, column_levels=1, column_names=None,
|
||||
metric_name_spec='',
|
||||
decorate=False, simplify=True, oversample=1):
|
||||
"""
|
||||
Fabricate a dataframe with the given specification.
|
||||
|
||||
@@ -171,7 +177,12 @@ class Fabricator:
|
||||
index = pd.MultiIndex.from_tuples(islice(product(*(self._generate_column('s', v) for v in self.primes_for_product(rows, index_levels))), rows), names=inames)
|
||||
|
||||
# create with col groups and drop later if needed
|
||||
metric_names = [self.metric_name(t) for t in data_spec]
|
||||
if metric_name_spec == '':
|
||||
metric_names = [self.metric_name(t) for t in data_spec]
|
||||
else:
|
||||
metric_name_spec = self._parse_colspec(metric_name_spec)
|
||||
assert len(metric_name_spec) == len(data_spec), "metric name spec not consistent with data spec"
|
||||
metric_names = [self._generate_column(dt, 1).iloc[0] for dt in metric_name_spec]
|
||||
if column_groups > 0:
|
||||
cnames = (column_names or [f'c_{i}' for i in range(column_levels)]) + ['metric']
|
||||
columns_pfp = self.primes_for_product(column_groups, column_levels)
|
||||
|
||||
@@ -8,6 +8,7 @@ import time
|
||||
import base64
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def df_short_hash(df, length=12):
|
||||
"""Generate a short, time-dependent hash for a DataFrame (safe for HTML IDs)."""
|
||||
hasher = hashlib.blake2b(digest_size=8) # Smaller output
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Manage logger setup for CLI. Users have the option of
|
||||
calling this if they want similar logging.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
def setup_logging(level=logging.INFO):
    """
    Configure the root logger to write formatted records to stderr.

    Handlers already attached to the root logger are discarded first, so
    calling this more than once leaves a single handler installed.

    :param level: threshold set on the root logger, default ``logging.INFO``
    """
    root_logger = logging.getLogger()
    # discard whatever is attached so records are not emitted twice
    del root_logger.handlers[:]
    root_logger.setLevel(level)

    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(
        logging.Formatter('%(asctime)s | %(levelname)s | %(message)s'))
    root_logger.addHandler(stderr_handler)
|
||||
|
||||
|
||||
# # Disable log propagation to prevent duplicates
|
||||
# logger.propagate = False
|
||||
# if logger.hasHandlers():
|
||||
# # Clear existing handlers
|
||||
# logger.handlers.clear()
|
||||
# # SET DEGBUGGER LEVEL
|
||||
# LEVEL = logging.INFO # DEBUG or INFO, WARNING, ERROR, CRITICAL
|
||||
# logger.setLevel(LEVEL)
|
||||
# handler = logging.StreamHandler(sys.stderr)
|
||||
# handler.setLevel(LEVEL)
|
||||
# formatter = logging.Formatter(
|
||||
# '%(asctime)s | %(levelname)s | %(funcName)-15s | %(message)s')
|
||||
# handler.setFormatter(formatter)
|
||||
# logger.addHandler(handler)
|
||||
# logger.info(f'Logger Setup; {__name__} module recompiled.')
|
||||
@@ -0,0 +1,784 @@
|
||||
"""
|
||||
Text length, column width balancing and other utilities.
|
||||
"""
|
||||
|
||||
import html
|
||||
from io import StringIO
|
||||
import logging
|
||||
import re
|
||||
from textwrap import wrap
|
||||
|
||||
import pandas as pd
|
||||
from rich import box
|
||||
from rich.table import Table
|
||||
|
||||
from . gtformats import GT_Format, TableFormat, Line, DataRow
|
||||
|
||||
|
||||
__all__ = ['MD2DF', 'Escaping', 'TextLength',
|
||||
'Sparsify', 'Indexing', 'Width', 'TextOutput',
|
||||
'RichOutput']
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MD2DF:
    """Convert markdown to dataframe."""

    @staticmethod
    def md_to_df(txt):
        """
        Convert markdown text string table to DataFrame.

        :param txt: text holding a pipe table and, optionally, a caption line
            starting ``:`` or ``Table:``; the caption may carry a
            ``{#tbl...}`` label
        :return: tuple ``(df, aligners, caption, label)``. ``aligners`` is a
            string with one of ``l``/``c``/``r`` for each alignment cell that
            contains a colon marker (cells without a marker contribute
            nothing), or ``None`` when no cell has a marker. ``caption`` and
            ``label`` are empty strings when absent.
        :raises ValueError: if no table is found or the table has no second
            (alignment) row
        """
        # extract table and optional caption part
        table, caption = MD2DF.parse_markdown_table_and_caption(txt)
        label = ''
        m = re.search(r'\{#(tbl[:a-zA-Z0-9_-]+)\}', caption)
        if m:
            label = m.group(1)
            # remove the label from the caption
            caption = caption.replace(f'{{#{label}}}', '').strip()
        if table == '':
            raise ValueError('Bad markdown table')

        # strip each line first so indentation or trailing blanks do not hide
        # the outer pipes, then remove those (optional) outer pipes
        rows = [re.sub(r'^\||\|$', '', line.strip())
                for line in table.split('\n')]
        if len(rows) < 2:
            # header only: there is no alignment row to remove
            raise ValueError('Bad markdown table')

        # remove starting and ending *'s added by hand - but try to avoid * within headings!
        rows[0] = '|'.join(re.sub(r'^\*\*?|\*\*?$', '', cell.strip())
                           for cell in rows[0].split('|'))

        # remove the alignment row and read the colon markers from it
        alignment_row = rows.pop(1)
        codes = []
        for cell in alignment_row.split('|'):
            # cells are usually padded with blanks, e.g. " :--- "
            cell = cell.strip()
            starts, ends = cell.startswith(':'), cell.endswith(':')
            if starts and ends:
                codes.append('c')
            elif starts:
                codes.append('l')
            elif ends:
                codes.append('r')
            # otherwise: no alignment info for this cell
        aligners = ''.join(codes) or None

        cells = [[j.strip() for j in row.split('|')] for row in rows]
        # transpose so the header row becomes the index, then transpose back
        # to turn it into the column labels
        df = pd.DataFrame(cells).T
        df = df.set_index(0)
        df = df.T
        return df, aligners, caption, label

    @staticmethod
    def parse_markdown_table_and_caption(txt: str) -> tuple[str, str]:
        """
        Parses a Markdown table and an optional caption from a given string,
        handling cases where only the caption is present.

        Args:
            txt: The input string.

        Returns:
            A tuple containing the table string and the caption string, each
            stripped; either is an empty string if not found.
        """
        table_match = re.search(r"((?:\|.*\|\s*(?:\n|$))+)", txt, re.DOTALL)
        caption_match = re.search(
            r"^(?:table)?:\s*(.+)", txt, re.MULTILINE + re.IGNORECASE)

        table_part = table_match.group(1) if table_match else ""
        caption_part = caption_match.group(1) if caption_match else ""

        return table_part.strip(), caption_part.strip()
|
||||
|
||||
|
||||
class Escaping:
|
||||
"""Escape html and tex within tables."""
|
||||
@staticmethod
|
||||
def clean_name(n):
|
||||
"""
|
||||
Escape underscores for using a name in a DataFrame index
|
||||
and converts to a string. Also escape %.
|
||||
|
||||
Called by Tikz routines.
|
||||
|
||||
:param n: input name, str
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if type(n) == str:
|
||||
# quote underscores that are not in dollars
|
||||
return '$'.join((i if n % 2 else i.replace('_', '\\_').replace('%', '\\%')
|
||||
for n, i in enumerate(n.split('$'))))
|
||||
else:
|
||||
# can't contain an underscore!
|
||||
return str(n)
|
||||
except:
|
||||
return str(n)
|
||||
|
||||
@staticmethod
|
||||
def clean_index(df):
|
||||
"""
|
||||
escape _ for columns and index, being careful about subscripts
|
||||
in TeX formulas.
|
||||
|
||||
:param df:
|
||||
:return:
|
||||
"""
|
||||
return df.rename(index=Escaping.clean_name, columns=Escaping.clean_name)
|
||||
|
||||
@staticmethod
|
||||
def clean_html_tex(text):
|
||||
r"""
|
||||
Clean TeX entries in HTML: $ -> \( and \) and $$ to \[ \].
|
||||
|
||||
Apply after all other HTML rendering steps. HTML rendering only.
|
||||
"""
|
||||
text = re.sub(r'\$\$(.*?)\$\$', r'\\[\1\\]', text, flags=re.DOTALL)
|
||||
# Convert inline math: $...$ → \(...\)
|
||||
text = re.sub(r'(?<!\$)\$(.*?)(?<!\\)\$(?!\$)', r'\\(\1\\)', text)
|
||||
return text
|
||||
|
||||
@staticmethod
|
||||
def escape_tex_outside_math(text):
|
||||
# Pattern to match math environments: $...$, $$...$$, \[...\]
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
math_pattern = re.compile(
|
||||
r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\])', re.DOTALL)
|
||||
|
||||
# def escape_non_math(s):
|
||||
# return s.replace('\\', r'\\').replace('%', r'\%')
|
||||
# because of use within tikz tables
|
||||
def escape_non_math(s):
|
||||
return s.replace('\\', r'\textbackslash{}').replace('%', r'\%').replace('_', r'\_')
|
||||
|
||||
parts = []
|
||||
last_end = 0
|
||||
for m in math_pattern.finditer(text):
|
||||
start, end = m.span()
|
||||
parts.append(escape_non_math(text[last_end:start]))
|
||||
parts.append(m.group()) # math part, unescaped
|
||||
last_end = end
|
||||
parts.append(escape_non_math(text[last_end:]))
|
||||
|
||||
return ''.join(parts)
|
||||
|
||||
@staticmethod
|
||||
def escape_df_tex(df):
|
||||
# Escape data elements
|
||||
df = df.map(Escaping.escape_tex_outside_math)
|
||||
|
||||
# Escape index and column values
|
||||
def escape_index(idx):
|
||||
if isinstance(idx, pd.MultiIndex):
|
||||
return pd.MultiIndex.from_tuples(
|
||||
[tuple(Escaping.escape_tex_outside_math(x)
|
||||
for x in tup) for tup in idx],
|
||||
names=[Escaping.escape_tex_outside_math(
|
||||
n) if n else n for n in idx.names]
|
||||
)
|
||||
else:
|
||||
return pd.Index([Escaping.escape_tex_outside_math(x) for x in idx],
|
||||
name=Escaping.escape_tex_outside_math(idx.name) if idx.name else None)
|
||||
|
||||
df.index = escape_index(df.index)
|
||||
df.columns = escape_index(df.columns)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
class TextLength:
    """Estimate length of displayed text."""

    # TeX control sequence display widths (heuristic)
    # NOTE(review): sum, int and prod are also in TEX_WIDE, but membership of
    # TEX_SIMPLE_GLYPHS is tested first, so they score as simple glyphs -
    # confirm that is intended
    TEX_SIMPLE_GLYPHS = {
        'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
        'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
        'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega', 'infty',
        'sum', 'prod', 'int', 'cup', 'cap', 'vee', 'wedge', 'forall', 'exists',
        'neg', 'leq', 'geq', 'neq', 'approx', 'to', 'leftarrow', 'rightarrow'
    }
    TEX_WIDE = {'frac', 'sqrt', 'sum', 'int', 'prod'}
    TEX_SPACING = {'quad', 'qquad', ',', ';', ' ', '!'}

    # approximate widths in em for groups of characters
    _WIDTH_TABLE = {
        "il.':|!`": 0.3,
        "frtJ(){}[]*": 0.5,
        "abcdeghknopqsuvxyz": 0.6,
        "LCDEFHISTUZ": 0.7,
        "ABGKNOPQRXYV": 0.8,
        "mwMW": 0.9,
        "0123456789": 0.6,
        "-_=+<>": 0.5,
        "#$%^&@~": 0.6,
        ",;": 0.25,
        '"': 0.4,
        "/\\": 0.5,
        "?": 0.6,
        " ": 0.4,
    }
    # per-character lookup built once, rather than on every call
    _CHAR_WIDTH = {ch: w for chars, w in _WIDTH_TABLE.items() for ch in chars}

    # control words, the symbol spacing commands (\, \; \! and backslash-space)
    # listed in TEX_SPACING, single alphanumerics, then any other character
    _MATH_TOKEN = re.compile(r'\\[a-zA-Z]+|\\[,;! ]|[a-zA-Z0-9]|.')

    @staticmethod
    def approximate_char_width_em(c: str) -> float:
        """Return the approximate width of c in em; 0.6 if c is not in the table."""
        return TextLength._CHAR_WIDTH.get(c, 0.6)

    @staticmethod
    def text_display_len(s: str) -> float:
        """Estimate display width in ems, ignoring HTML tags, interpreting TeX, and HTML entities."""
        char_width = TextLength.approximate_char_width_em

        # plain text: no math, tags or entities to interpret
        if '$' not in s and '<' not in s and '&' not in s:
            return sum(char_width(c) for c in s)

        total = 0.0
        # split keeps the math pieces; a group that did not match gives None
        for part in re.split(r'(\$\$.*?\$\$)|(\$.*?\$)', s):
            if part is None:
                continue
            if part.startswith('$$') and part.endswith('$$'):
                total += TextLength.estimate_math_width(part[2:-2])
            elif part.startswith('$') and part.endswith('$'):
                total += TextLength.estimate_math_width(part[1:-1])
            else:
                # drop tags, then decode entities so each counts as its glyph
                visible = html.unescape(re.sub(r'<[^>]*>', '', part))
                total += sum(char_width(c) for c in visible)
        return total

    @staticmethod
    def estimate_math_width(tex: str) -> float:
        """
        Estimate the display width in em of TeX math (without delimiters).

        Control sequences score 0.6 (simple glyph), 1.5 (wide), 0.4 (spacing)
        or 1.0 (anything else); braces score nothing; ``^`` and ``_`` score
        0.3; other characters use ``approximate_char_width_em``.
        """
        width = 0.0
        for tok in TextLength._MATH_TOKEN.findall(tex):
            if tok.startswith('\\'):
                name = tok[1:]
                if name in TextLength.TEX_SIMPLE_GLYPHS:
                    width += 0.6
                elif name in TextLength.TEX_WIDE:
                    width += 1.5
                elif name in TextLength.TEX_SPACING:
                    width += 0.4
                else:
                    width += 1.0  # unknown control sequences
            elif tok in '{}':
                continue  # grouping only
            elif tok in '^_':
                width += 0.3  # assume sub/superscript takes some space
            else:
                width += TextLength.approximate_char_width_em(tok)
        return width
|
||||
|
||||
# original
|
||||
# @staticmethod
|
||||
# def text_display_len(s: str) -> int:
|
||||
# """Estimate text display length in em of a string allowing for TeX constructs."""
|
||||
# # note you DO WANT SPACES! So, no strip applied ever.
|
||||
# if s.find('$') < 0:
|
||||
# return len(s)
|
||||
# parts = re.split(r'(\$\$.*?\$\$)|(\$.*?\$)', s)
|
||||
# total = 0
|
||||
# for part in parts:
|
||||
# if part is None:
|
||||
# continue
|
||||
# if part.startswith('$$') and part.endswith('$$'):
|
||||
# total += TextLength._estimate_math_width(part[2:-2])
|
||||
# elif part.startswith('$') and part.endswith('$'):
|
||||
# total += TextLength._estimate_math_width(part[1:-1])
|
||||
# else:
|
||||
# total += len(part)
|
||||
# return total
|
||||
|
||||
# @staticmethod
|
||||
# def _estimate_math_width(tex: str) -> int:
|
||||
# tokens = re.findall(r'\\[a-zA-Z]+|[a-zA-Z0-9]|.', tex)
|
||||
# width = 0
|
||||
# for tok in tokens:
|
||||
# if tok.startswith('\\'):
|
||||
# name = tok[1:]
|
||||
# if name in TextLength.TEX_SIMPLE_GLYPHS:
|
||||
# width += 1
|
||||
# elif name in TextLength.TEX_WIDE:
|
||||
# width += 3
|
||||
# elif name in TextLength.TEX_SPACING:
|
||||
# width += 1
|
||||
# else:
|
||||
# width += 2 # unknown control sequences
|
||||
# elif tok in '{}^_':
|
||||
# continue # grouping, sub/superscripts: ignore
|
||||
# else:
|
||||
# width += 1
|
||||
# return width
|
||||
|
||||
|
||||
class Sparsify:
    """Sparsify multiindex rows and columns."""

    @staticmethod
    def sparsify(df, cs):
        """
        Blank out repeated leading values in the columns cs of df.

        For each column in cs, a cell is replaced by '' on every row where
        that column and all columns before it in cs equal the previous row.

        :param df: input dataframe; it is not modified
        :param cs: list of column labels, outermost first
        :return: copy of df with repeats in cs replaced by ''
        """
        out = df.copy()
        for i, c in enumerate(cs):
            key_cols = df[cs[:i + 1]]
            # True where any of the first i + 1 columns differs from the row above
            changed = key_cols.ne(key_cols.shift()).any(axis=1)
            out.loc[~changed, c] = ''
        return out

    @staticmethod
    def sparsify_mi(mi, bottom_level=False):
        """
        as above for a multi index level, without the benefit of the index...
        really all should use this function

        :param mi: sequence of values for one level
        :param bottom_level: for the lowest level ... all values repeated, no sparsification
        :return: tuple ``(new_col, rules)``: the values with repeats replaced
            by '', and the positions (from 1) where a value was kept
        """
        new_col = list(mi)
        rules = []
        if not new_col:
            # nothing to sparsify
            return new_col, rules
        last = new_col[0]
        for k in range(1, len(new_col)):
            v = new_col[k]
            if v == last and not bottom_level:
                new_col[k] = ''
            else:
                last = v
                rules.append(k)
        return new_col, rules
|
||||
|
||||
|
||||
class Indexing:
    """Changed column and level from a multi-index."""

    @staticmethod
    def changed_column(bit):
        """
        Return the column that changes with each row.

        Rows equal to the row above are left out; for the others the result
        holds the label of the first column that differs from the row above.
        """
        differs = bit.ne(bit.shift())
        has_change = differs.any(axis=1)
        return differs.loc[has_change].idxmax(axis=1)

    @staticmethod
    def changed_level(idx):
        """
        Return the level of index that changes with each row.

        Levels are numbered from 0. Rows equal to the row above are left
        out; for the others the result holds the first level that differs.

        Very ingenious GTP code with some SM enhancements.
        """
        # work on a copy: renaming the levels must not alter the caller's index
        numbered = idx.copy()
        numbered.names = list(range(numbered.nlevels))
        # one column per level, labelled by level number
        frame = numbered.to_frame(index=False)
        differs = frame.ne(frame.shift())
        # keep only rows where at least one level changed
        return differs.loc[differs.any(axis=1)].idxmax(axis=1)
|
||||
|
||||
|
||||
class Width:
    """Adjust column widths based on heading widths."""

    @staticmethod
    def header_adjustment(df, min_widths, space, max_extra):
        """
        Fine-adjust heading for optimal config.spacing.

        Recommend per-column width adjustments to avoid intra-word breaks
        and reduce overall header height.

        Parameters:
            df: DataFrame with 1-level string column names; columns whose
                name is not a str are skipped and get an adjustment of 0
            min_widths: dict of column name -> minimal acceptable width
            space: amount of space available to be allocated
            max_extra: max extra characters to consider allocating per column

        Returns:
            tuple ``(adjustments, input_df)``: a dict column -> additional
            width to allocate, and the DataFrame of candidate options with
            columns ``col``, ``extra`` and ``num_lines`` that was passed to
            ``optimal_heading``.
        """
        colnames = list(df.columns)
        adjustments = {col: 0 for col in colnames}

        def wrap_info(text: str, width: int) -> tuple[bool, int]:
            """
            Wrap text at width; report whether any word was broken and the line count.

            A word was broken exactly when the words recovered from the
            wrapped lines differ from the words of the original text.
            """
            wrapped_lines = wrap(text, width=width)
            broken = text.split() != " ".join(wrapped_lines).split()
            return broken, len(wrapped_lines)

        # First pass: avoid ugly intraword breaks
        # rows of [col name, extra space, resulting number of lines]
        options = []
        for col in colnames:
            if not isinstance(col, str):
                continue
            base_width = min_widths[col]
            broken, num_lines = wrap_info(col, base_width)
            if not broken:
                options.append([col, 0, num_lines])
                # nothing to be gained, move to next col
                continue
            # longest word, computed after the str guard so that non-string
            # column names never reach re.split
            longest_word = max(len(w) for w in re.split(r'[ \-/]', col))
            extra0 = max(0, longest_word - base_width)
            if extra0 > max_extra:
                # ok, can't flatten word because it is too long
                extra0 = 0
            elif extra0 == max_extra:
                # go with that
                adjustments[col] = max_extra
                continue
            # see if col can be flattened within max_extra chars, starting
            # at extra0, which is enough to avoid intraword breaks
            for extra in range(extra0, max_extra + 1):
                broken, num_lines = wrap_info(col, base_width + extra)
                if not broken:
                    options.append([col, extra, num_lines])
                    if adjustments[col] == 0:
                        # take first, but compute rest...
                        adjustments[col] = extra

        # this is needed as input for the optimal heading function (next)
        input_df = pd.DataFrame(options, columns=['col', 'extra', 'num_lines'])
        # min amount to avoid intra word breaks
        avoid_intra = input_df.groupby('col').min()['extra'].sum()
        if avoid_intra >= space:
            # that's all we can do
            logger.info("NO FURTHER IMPROVEMENTS")
        else:
            # can try for a better solution
            sol = Width.optimal_heading(input_df, space)
            adjustments.update(sol[1])
            logger.info('best solution: %s', sol)
        return adjustments, input_df

    @staticmethod
    def optimal_heading(input_df: pd.DataFrame, total_es_budget: int) -> tuple[int, dict[str, int]]:
        """
        Optimize extra config.spacing for best heading.

        Finds the best way to allocate extra space to minimize max_lines in heading.

        Binary search over the candidate maximum number of lines. For each
        candidate, every column takes its smallest ``extra`` whose
        ``num_lines`` does not exceed the candidate; the candidate is
        feasible if all columns have such an option and the extras sum to
        at most the budget. The search assumes that ``num_lines`` does not
        increase as ``extra`` increases.

        Args:
            input_df: DataFrame with 'col', 'extra', 'num_lines'.
            total_es_budget: The total extra space to allocate.

        Returns:
            A tuple: (min_max_lines, optimal_extra_allocation_per_column).
            ``(0, {})`` for empty input. If no candidate is feasible, the
            largest ``num_lines`` in the input and an empty dict.
        """
        # Pre-processing: per column, (extra, num_lines) options by increasing extra
        unique_cols = input_df['col'].unique().tolist()
        col_options = {}
        for col_name in unique_cols:
            col_data = input_df[input_df['col'] == col_name].sort_values(by='extra')
            col_options[col_name] = list(
                zip(col_data['extra'], col_data['num_lines']))

        def allocate(target_max_lines):
            """Return the cheapest allocation meeting target_max_lines, or None if infeasible."""
            allocation = {}
            for col_name in unique_cols:
                # options are sorted by extra, so the first match is the cheapest
                cheapest = next(
                    (extra_val for extra_val, num_lines_val in col_options[col_name]
                     if num_lines_val <= target_max_lines),
                    None)
                if cheapest is None:
                    return None
                allocation[col_name] = cheapest
            if sum(allocation.values()) > total_es_budget:
                return None
            return allocation

        all_num_lines = input_df['num_lines'].unique()
        if len(all_num_lines) == 0:
            return 0, {}

        lo = all_num_lines.min()
        hi = all_num_lines.max()
        optimal_max_lines = hi
        best_allocation = {}

        while lo <= hi:
            mid_max_lines = lo + (hi - lo) // 2
            allocation = allocate(mid_max_lines)
            if allocation is not None:
                # feasible: remember it and try for fewer lines
                optimal_max_lines = mid_max_lines
                best_allocation = allocation
                hi = mid_max_lines - 1
            else:
                lo = mid_max_lines + 1

        return optimal_max_lines, best_allocation
|
||||
|
||||
|
||||
class TextOutput:
    """Convert dataframe to text, replaces pd.DataFrame.to_markdown."""

    @staticmethod
    def make_text_table(
        df: pd.DataFrame,
        data_col_widths: list[int],
        data_col_aligns: list[str],
        *,
        index_levels: int = 1,
        fmt: TableFormat = GT_Format
    ) -> str:
        """
        Render ``df`` as a wrapped, boxed text table.

        Output like tabulate's mixed_grid with support for:

        - Multi-level column headers (always shown, bottom-aligned, can wrap)
        - Split index vs. body section with a distinct vertical separator
        - Per-column width and alignment
        - Wrapped body cells with top alignment

        pd.DataFrame.to_markdown relies on tabulate, which is hard to
        control. This function provides similar functionality with greater
        control over column widths and the ability to demarcate the index
        columns.

        Args:
            df: Frame to render. Every column of ``df`` is drawn; the frame's
                own index is not. Cell values are converted with ``str``.
            data_col_widths: Text width of each column, excluding padding.
            data_col_aligns: Body alignment per column: 'left' or 'center';
                any other value right-aligns.
            index_levels: Number of leading columns treated as index
                columns. They get left-aligned headers and are followed by
                the format's ``index_sep`` separator. Use 0 for no split.
            fmt: Format object supplying ``padding`` and the line/row
                specifications (``lineabove``, ``linebelowheader``,
                ``linebetweenrows``, ``linebelow``, ``headerrow``,
                ``datarow``).

        Returns:
            str: A fully formatted table as a string (useful for print, logs, or files).
        """
        buf = StringIO()
        pad = " " * fmt.padding

        def _write_line(line: str) -> None:
            """Write a line to the buffer followed by a newline."""
            buf.write(line + '\n')

        def _blank(width: int) -> str:
            """Return an all-space cell line for a column of the given text width."""
            return ' ' * (width + 2 * fmt.padding)

        def _format_cell(text: str, width: int, align: str) -> list[str]:
            """
            Wrap one cell's text to ``width`` and apply padding and alignment.

            Returns a list of strings, one per wrapped line of the cell.
            """
            # wrap() returns [] for empty/whitespace-only text; keep one blank line
            lines = wrap(str(text), width=width) or ['']
            if align == 'left':
                justify = str.ljust
            elif align == 'center':
                justify = str.center
            else:
                justify = str.rjust
            return [pad + justify(line, width) + pad for line in lines]

        def _join_columns(spec, pieces: list[str]) -> str:
            """Join per-column pieces using spec's begin/sep/index_sep/end markers."""
            parts = []
            for i, piece in enumerate(pieces):
                if index_levels and i == index_levels:
                    # boundary between the index columns and the body
                    parts.append(spec.index_sep)
                elif i > 0:
                    parts.append(spec.sep)
                parts.append(piece)
            return f"{spec.begin}{''.join(parts)}{spec.end}"

        def _make_horizontal_line(line_fmt: Line, col_widths: list[int]) -> str:
            """Construct a full horizontal rule for the table."""
            return _join_columns(
                line_fmt,
                [line_fmt.hline * (w + 2 * fmt.padding) for w in col_widths])

        def _make_data_row(row_fmt: DataRow, line_cells: list[str]) -> str:
            """Construct a single text row from already formatted cell strings."""
            return _join_columns(row_fmt, line_cells)

        def _render_header_level(wrapped_cells: list[list[str]], level_widths: list[int]) -> list[str]:
            """
            Render one level of the header with cells bottom-aligned.

            Returns a list of strings, one per text line of the header level.
            """
            # default=0 keeps a zero-column frame from raising ValueError
            max_height = max((len(c) for c in wrapped_cells), default=0)
            padded_cells = [
                [_blank(w)] * (max_height - len(cell)) + cell
                for cell, w in zip(wrapped_cells, level_widths)
            ]
            return [
                _make_data_row(fmt.headerrow, [col[i] for col in padded_cells])
                for i in range(max_height)
            ]

        col_levels = df.columns.nlevels
        col_tuples = df.columns if col_levels > 1 else [
            (c,) for c in df.columns]

        # Step 1: format each level of the column headers.
        # Header alignment is left in the index columns and center in the body.
        header_aligns = [
            'left' if i < index_levels else 'center'
            for i in range(len(data_col_aligns))]
        _write_line(_make_horizontal_line(fmt.lineabove, data_col_widths))
        for level in range(col_levels):
            level_texts = [str(t[level] if level < len(t) else '')
                           for t in col_tuples]
            wrapped_cells = [
                _format_cell(txt, w, a)
                for txt, w, a in zip(level_texts, data_col_widths, header_aligns)]
            for header_line in _render_header_level(wrapped_cells, data_col_widths):
                _write_line(header_line)
            if level < col_levels - 1:
                _write_line(_make_horizontal_line(
                    fmt.linebetweenrows, data_col_widths))
        _write_line(_make_horizontal_line(
            fmt.linebelowheader, data_col_widths))

        # Step 2: body rows. itertuples keeps each value's own dtype, whereas
        # iterrows upcasts a mixed int/float row to float before str().
        last_row = len(df) - 1
        for row_idx, row in enumerate(df.itertuples(index=False, name=None)):
            data_cells = [
                _format_cell(val, w, a)
                for val, w, a in zip(row, data_col_widths, data_col_aligns)
            ]
            max_height = max((len(c) for c in data_cells), default=0)
            # top-align: pad shorter cells with blank lines underneath
            padded = [
                c + [_blank(w)] * (max_height - len(c))
                for c, w in zip(data_cells, data_col_widths)
            ]
            for i in range(max_height):
                _write_line(_make_data_row(
                    fmt.datarow, [col[i] for col in padded]))
            if row_idx < last_row:
                _write_line(_make_horizontal_line(
                    fmt.linebetweenrows, data_col_widths))

        # Always close the box, including for a frame with no rows.
        _write_line(_make_horizontal_line(fmt.linebelow, data_col_widths))

        return buf.getvalue()
|
||||
|
||||
|
||||
class RichOutput:
    """Render to a rich table."""

    @staticmethod
    def make_rich_table(
        df,
        column_widths,
        column_alignments=None,
        num_index_columns=0,
        title=None,
        show_lines=False,
        box_style=box.SIMPLE_HEAVY,
    ):
        """
        Build a Rich ``Table`` from a preformatted DataFrame.

        Parameters:
            df (pd.DataFrame): DataFrame with all string values; each row is
                passed to ``Table.add_row`` unchanged.
            column_widths (dict or list): Widths by column name or position.
                Columns missing from a dict get no fixed width.
            column_alignments (dict or list): Alignments ('left', 'center',
                'right') by column name or position. Missing entries, or
                ``None`` for the whole argument, default to 'left'.
            num_index_columns (int): Number of left-most columns to treat as
                index-like; these are styled dim.
            title (str): Optional title.
            show_lines (bool): Add row separator lines.
            box_style (rich.box.Box): Border style; defaults to
                ``box.SIMPLE_HEAVY``.

        Returns:
            The populated Rich ``Table``.
        """
        headers = list(df.columns)

        def _keyed_by_column(spec):
            """Map a positional list onto column names; pass other values through."""
            if isinstance(spec, list):
                return {headers[pos]: item for pos, item in enumerate(spec)}
            return spec

        widths = _keyed_by_column(column_widths)
        if column_alignments is None:
            aligns = {}
        else:
            aligns = _keyed_by_column(column_alignments)

        table = Table(
            title=title,
            box=box_style,
            show_lines=show_lines,
            expand=True,
        )

        for position, header in enumerate(headers):
            # index-like columns are dimmed in both header and body
            if position < num_index_columns:
                body_style, head_style = "dim", "bold dim"
            else:
                body_style, head_style = None, "bold"
            table.add_column(
                header=str(header),
                width=widths.get(header, None),
                justify=aligns.get(header, "left"),
                style=body_style,
                header_style=head_style,
                no_wrap=False,
                overflow="fold",
                vertical="middle",
            )

        for _, record in df.iterrows():
            table.add_row(*record.tolist())

        return table
|
||||
@@ -0,0 +1,11 @@
|
||||
Obtaining file:///C:/S/TELOS/Python/greater_tables_project
|
||||
Installing build dependencies: started
|
||||
Installing build dependencies: finished with status 'done'
|
||||
Checking if build backend supports build_editable: started
|
||||
Checking if build backend supports build_editable: finished with status 'done'
|
||||
Getting requirements to build editable: started
|
||||
Getting requirements to build editable: finished with status 'done'
|
||||
Preparing editable metadata (pyproject.toml): started
|
||||
Preparing editable metadata (pyproject.toml): finished with status 'done'
|
||||
Collecting bs4 (from greater_tables==4.0.0)
|
||||
Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
|
||||
Reference in New Issue
Block a user