Files

1446 lines
55 KiB
Python

"""
Core rendering logic for GreaterTables (PyArrow Enhanced Edition).
Defines the `GT` class, which formats and renders pandas DataFrames
to HTML, plain text, or LaTeX output using a validated configuration model.
Refactored [2025-03-06] to support PyArrow-backed dataframes natively.
"""
from collections import namedtuple
from decimal import InvalidOperation
from io import StringIO
from itertools import groupby
import logging
import os
from pathlib import Path
import re
import tempfile
from typing import Optional, Union, Literal
import warnings
import yaml
from bs4 import BeautifulSoup
from cachetools import LRUCache
import numpy as np
import pandas as pd
from pandas.errors import IntCastingNaNError
from pandas.api.types import (
is_datetime64_any_dtype,
is_integer_dtype,
is_float_dtype,
is_string_dtype,
is_extension_array_dtype, # Crucial for PyArrow detection
is_numeric_dtype
)
from pydantic import ValidationError
from rich import box
from IPython.display import display, SVG
from . enums import Breakability
from . config import Configurator
from . hasher import df_short_hash
from . etcher import Etcher
from . utilities import *
# Modern pandas settings
# Opt in to pandas' future behaviour of not silently downcasting results.
pd.set_option('future.no_silent_downcasting', True)
# NOTE(review): this filter is installed at import time and applies to the
# whole process, not only to this module -- confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class GT(object):
"""
Create a greater_tables formatting object.
Provides html and latex output in quarto/Jupyter accessible manner.
Wraps AND COPIES the dataframe df. WILL NOT REFLECT CHANGES TO DF.
**PyArrow / 2.0 Architecture Note:**
This class now uses an "Inspect-then-Dispatch" philosophy. It does not
coerce inputs to floats. It respects int64[pyarrow] and string[pyarrow]
types natively, handling nulls via validity masks rather than NaN-casting.
"""
def __init__(
    self,
    df,
    *,
    caption='',
    label='',
    aligners: dict[str, callable] | None = None,
    formatters: dict[str, callable] | None = None,
    tabs: Optional[Union[list[float], float, int]] | None = None,
    unbreakable=None,
    ratio_cols=None,
    year_cols=None,
    date_cols=None,
    raw_cols=None,
    show_index=True,
    config: Configurator | None = None,
    config_path: Path | None = None,
    **overrides,
):
    """
    Build the formatting object: load config, ingest data, tag and format columns.

    :param df: DataFrame, Series, list of rows, markdown table string,
        namedtuple instance or None (treated as an empty frame).
    :param caption: table caption; if empty, ``df.gt_caption`` is used when present.
    :param label: label stored on the instance for later output.
    :param aligners: mapping column -> 'l' / 'r' / 'c', or a string with one
        letter per column (zipped against the columns).
    :param formatters: a single callable applied to every column, or a
        mapping column -> callable, format string (using ``{x}``) or int
        (number of decimals).
    :param tabs: a single number used for every column, or a sequence with
        one entry per index level plus data column; anything else is ignored.
    :param unbreakable: column name or list of names never to be wrapped.
    :param ratio_cols: columns formatted as ratios; a list, 'all' or a regex.
    :param year_cols: columns formatted as years; a list, 'all' or a regex.
    :param date_cols: columns treated as dates; a list, 'all' or a regex.
    :param raw_cols: columns passed through ``str``; a list, 'all' or a regex.
    :param show_index: whether the index is rendered (forced False for list
        and markdown input).
    :param config: ready-made ``Configurator``; exclusive with ``config_path``.
    :param config_path: path to a YAML file validated into a ``Configurator``.
    :param overrides: individual config values merged over the base config.
    :raises ValueError: both config sources given, config cannot be loaded,
        unsupported ``df`` type, too many rows without ``large_ok``,
        non-unique column names, or an unsupported formatter value.
    """
    # --- Config Loading ---
    if config and config_path:
        raise ValueError("Pass either 'config' or 'config_path', not both.")
    if config:
        base_config = config
    elif config_path:
        config_path = Path(config_path)
        try:
            raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
            base_config = Configurator.model_validate(raw)
        except (ValidationError, OSError) as e:
            raise ValueError(f"Failed to load config from {config_path}") from e
    else:
        base_config = Configurator()
    merged = base_config.model_dump() | overrides
    self.config = Configurator(**merged)

    # --- Data Ingestion ---
    if df is None:
        df = pd.DataFrame([])
    if isinstance(df, pd.DataFrame):
        pass
    elif isinstance(df, pd.Series):
        df = df.to_frame()
    elif isinstance(df, list):
        df = pd.DataFrame(df)
        show_index = False
        if self.config.header_row:
            # first row of the list holds the column headings
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)
    elif isinstance(df, str):
        df = df.strip()
        if df == '':
            df = pd.DataFrame([])
        else:
            # markdown input supplies its own aligners, caption and label
            df, aligners, caption, label = MD2DF.md_to_df(df)
            show_index = False
    elif GT._is_namedtuple_instance(df):
        df = GT._ntdf(df)
    else:
        raise ValueError('df must be a DataFrame, a list of lists, or a markdown table string')
    if len(df) > self.config.large_warning and not self.config.large_ok:
        raise ValueError(
            f'Large dataframe (>{self.config.large_warning} rows). Set large_ok=True.')
    if not df.columns.is_unique:
        raise ValueError('df column names are not unique')
    if caption != '':
        self.caption = caption
    else:
        self.caption = getattr(df, 'gt_caption', '')
    self.label = label

    # --- PHASE 1: Data Preparation (Modified for Arrow) ---
    self.df = df.copy()
    self.raw_df = df.copy()
    self.df_id = df_short_hash(self.df)
    if self.caption != '' and self.config.debug:
        self.caption += f' (id: {self.df_id})'
    self.show_index = show_index
    self.nindex = self.df.index.nlevels if self.show_index else 0
    self.ncolumns = self.df.columns.nlevels
    self.ncols = self.df.shape[1]
    self.dt = self.df.dtypes
    # Handle Index
    with warnings.catch_warnings():
        if self.show_index:
            warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
            self.df = self.df.reset_index(drop=False, col_level=self.df.columns.nlevels - 1)
        # Ensure index is essentially a row number for internal tracking
        self.df.index = np.arange(self.df.shape[0], dtype=int)
    self.index_change_level = Indexing.changed_column(self.df.iloc[:, :self.nindex])
    if self.ncolumns > 1:
        self.index_change_level = pd.Series([i[-1] for i in self.index_change_level])
    self.column_change_level = Indexing.changed_level(self.raw_df.columns)

    # --- Column Tagging ---
    # Helper to process column lists/regex
    def resolve_cols(cols):
        if cols is None:
            return []
        if cols == 'all':
            return list(self.df.columns)
        if not isinstance(cols, (tuple, list)):
            return self.cols_from_regex(cols)
        return cols

    # Check for non-unique collision first
    if not self.df.columns.is_unique:
        logger.warning('Cols specified with non-unique column names: ignoring request.')
        self.ratio_cols, self.year_cols, self.date_cols, self.raw_cols = [], [], [], []
    else:
        self.ratio_cols = resolve_cols(ratio_cols)
        self.year_cols = resolve_cols(year_cols)
        self.date_cols = resolve_cols(date_cols)
        self.raw_cols = resolve_cols(raw_cols)

    # --- REMOVED: Aggressive Float Coercion Loop ---
    # The previous version looped through all columns and tried to cast them
    # to floats to detect types. This broke PyArrow strings/timestamps.
    # We now trust the dtypes provided by the dataframe.

    # --- Type Detection for Breakability ---
    if unbreakable is None:
        unbreakable = []
    elif isinstance(unbreakable, str):
        unbreakable = [unbreakable]
    self.float_col_indices = []
    self.integer_col_indices = []
    self.date_col_indices = []
    self.object_col_indices = []
    self.break_penalties = []
    logger.debug('FIGURING TYPES (Arrow-Aware)')
    for i, cn in enumerate(self.df.columns):
        # We look at the actual series to determine type
        ser = self.df.iloc[:, i]
        dtype = ser.dtype
        # Use pandas.api.types for robust checking across backends
        is_date = (cn in self.date_cols) or is_datetime64_any_dtype(dtype)
        is_int = is_integer_dtype(dtype)
        is_flt = is_float_dtype(dtype)
        if is_date:
            self.date_col_indices.append(i)
            self.break_penalties.append(
                Breakability.NEVER if cn in unbreakable else Breakability.DATE)
        elif is_int:
            self.integer_col_indices.append(i)
            self.break_penalties.append(Breakability.NEVER)
        elif is_flt:
            self.float_col_indices.append(i)
            self.break_penalties.append(Breakability.NEVER)
        else:
            # String / Object / Boolean
            self.object_col_indices.append(i)
            if cn in self.year_cols or cn in self.ratio_cols:
                self.break_penalties.append(Breakability.NEVER)
            else:
                self.break_penalties.append(
                    Breakability.NEVER if cn in unbreakable else Breakability.ACCEPTABLE)

    # --- Alignment Logic ---
    if aligners is not None and np.any(self.df.columns.duplicated()):
        logger.warning('aligners specified with non-unique column names: ignoring request.')
        aligners = None
    if aligners is None:
        aligners = []
    elif isinstance(aligners, str):
        # one alignment letter per column, in column order
        aligners = {c: a for c, a in zip(self.df.columns, aligners)}
    self.df_aligners = []
    lrc = {'l': 'grt-left', 'r': 'grt-right', 'c': 'grt-center'}
    for i, c in enumerate(self.df.columns):
        if c in aligners:
            self.df_aligners.append(lrc.get(aligners[c], 'grt-center'))
        elif i < self.nindex:
            self.df_aligners.append('grt-left')
        elif c in self.year_cols:
            self.df_aligners.append('grt-center')
        elif c in self.raw_cols:
            self.df_aligners.append('grt-left')
        elif i in self.date_col_indices:
            self.df_aligners.append('grt-center')
        elif c in self.ratio_cols or i in self.float_col_indices or i in self.integer_col_indices:
            self.df_aligners.append('grt-right')
        else:
            self.df_aligners.append('grt-left')
    self.df_idx_aligners = self.df_aligners[:self.nindex]

    # --- Formatter Setup ---
    def _template_formatter(template):
        # Bind ``template`` per call. A bare ``lambda x: v.format(x=x)``
        # inside the loop would make every column use the template of the
        # LAST dict entry (closures bind late).
        return lambda x: template.format(x=x)

    self.user_formatters_override = {}
    if formatters:
        if callable(formatters):
            for k in self.df.columns:
                self.user_formatters_override[k] = formatters
        else:
            for k, v in formatters.items():
                if callable(v):
                    self.user_formatters_override[k] = v
                elif isinstance(v, str):
                    self.user_formatters_override[k] = _template_formatter(v)
                elif isinstance(v, int):
                    # an int means "this many decimal places"
                    self.user_formatters_override[k] = _template_formatter(f'{{x:.{v}f}}')
                else:
                    raise ValueError('Formatters must be dict of callables, ints, or strings')

    # --- Tabs ---
    if tabs is None:
        self.tabs = None
    elif isinstance(tabs, (int, float)):
        self.tabs = (tabs,) * (self.nindex + self.ncols)
    elif isinstance(tabs, (np.ndarray, pd.Series, list, tuple)):
        # wrong-length sequences are silently ignored
        self.tabs = tabs if len(tabs) == self.nindex + self.ncols else None
    else:
        self.tabs = None

    # --- Padding / Config ---
    if self.config.padding_trbl is not None:
        padding_trbl = self.config.padding_trbl
    else:
        spacing_map = {'tight': (0, 5, 0, 5), 'medium': (2, 10, 2, 10), 'wide': (4, 15, 4, 15)}
        padding_trbl = spacing_map.get(self.config.spacing, (2, 10, 2, 10))
    self.padt, self.padr, self.padb, self.padl = padding_trbl
    self.max_table_width_em = self.config.max_table_inch_width * 72 / self.config.table_font_pt_size

    # --- Initialization State ---
    self._pef = None
    self._df_formatters = None
    self.df_style = ''
    self.df_html = ''
    self._clean_html = ''
    self._clean_tex = ''
    self._rich_table = None
    self._string = ''
    self._df_html_text = ""
    self._df_style_text = ""
    self._cache = LRUCache(20)
    self._text_knowledge_df = None
    self._html_knowledge_df = None
    self._tex_knowledge_df = None
    self._knowledge_dfs = None

    # --- Apply Formatters (THE BIG CHANGE) ---
    # We keep the raw dataframe mostly untouched.
    # self.df becomes the Formatted DataFrame (Strings).
    self.df_pre_applying_formatters = self.df.copy()
    # Apply formatters modifies self.df to contain strings
    self.df = self.apply_formatters(self.df)
    # Sparsify
    if self.config.sparsify and self.nindex > 1:
        self.df = Sparsify.sparsify(self.df, self.df.columns[:self.nindex])
    # LaTeX / HTML mapping
    if self.config.tex_to_html is not None:
        self.df_html = self.df.map(self.config.tex_to_html)
    else:
        self.df_html = self.df
    if self.config.tikz_escape_tex:
        self.df_tex = Escaping.escape_df_tex(self.df)
    else:
        self.df_tex = self.df
def __repr__(self):
return f"GT(df_id={self.df_id})"
def __str__(self):
    """Return whatever ``make_string`` produces for this table."""
    rendered = self.make_string()
    return rendered
def _repr_html_(self):
return self.html
def _repr_latex_(self):
if self._clean_tex == '':
self._clean_tex = self.make_tikz()
logger.info('CREATED LATEX')
return self._clean_tex
def cache_get(self, key):
"""Retrieve item from cache."""
return self._cache.get(key, None)
def cache_set(self, key, value):
"""Add item to cache."""
self._cache[key] = value
def cols_from_regex(self, regex):
pattern = re.compile(regex)
matching_cols = [
col for col in self.df.columns
if any(pattern.search(str(level))
for level in (col if isinstance(col, tuple) else (col,)))
]
return matching_cols
# --- New Robust Formatters (Arrow-Safe) ---
def _fmt_int_safe(self, x):
"""Handle Integers (PyArrow or NumPy) safely with nulls."""
if pd.isna(x): return ""
try:
return self.config.default_integer_str.format(x=int(x))
except (ValueError, TypeError):
return str(x)
def _fmt_date_iso(self, x):
"""Handle Dates/Timestamps safely."""
if pd.isna(x): return ""
# If it's a Timestamp object (Arrow or Pandas), it has strftime
if hasattr(x, "strftime"):
return x.strftime(self.config.default_date_str)
return str(x)
def _fmt_float_smart(self, x):
"""Smart float formatting that handles 'Year-like' floats."""
if pd.isna(x): return ""
try:
# Check if it's effectively an integer (e.g. 2021.0)
# This preserves the "Float as Int" behavior for legacy data
if isinstance(x, float) and x.is_integer():
return self.config.default_integer_str.format(x=int(x))
return self.config.default_float_str.format(x=x)
except (ValueError, TypeError):
return str(x)
def _fmt_ratio(self, x):
if pd.isna(x): return ""
try: return self.config.default_ratio_str.format(x=x)
except: return str(x)
def _fmt_year(self, x):
if pd.isna(x): return ""
try: return f'{int(x):d}'
except: return str(x)
def _fmt_raw(self, x):
if pd.isna(x): return ""
return str(x)
# --- Legacy Float Formatter Factory (Preserved but safe) ---
def pef(self, x):
if self._pef is None:
self._pef = pd.io.formats.format.EngFormatter(
accuracy=self.config.pef_precision, use_eng_prefix=True)
return self._pef(x)
def make_float_formatter(self, ser):
"""
Create a customized float formatter based on column statistics.
Works with Arrow columns as .mean(), .abs() dispatch correctly.
"""
try:
# Drop nulls for stats calculation to avoid issues
ser_valid = ser.dropna()
if len(ser_valid) == 0:
return self._fmt_float_smart
amean = ser_valid.abs().mean()
# amn = ser_valid.abs().min()
# amx = ser_valid.abs().max()
pl, pu = 10. ** self.config.pef_lower, 10. ** self.config.pef_upper
if amean < 1: precision = 5
elif amean < 10: precision = 3
elif amean < 20000: precision = 2
else: precision = 0
fmt = f'{{x:,.{precision}f}}'
def ff(x):
if pd.isna(x): return ""
try:
# Check for Engineering Format conditions
val_abs = abs(x)
if (val_abs > 0) and (val_abs < pl or val_abs > pu):
return self.pef(x)
return fmt.format(x=x)
except (ValueError, TypeError, InvalidOperation):
return str(x)
return ff
except Exception as e:
logger.debug(f"Float formatter factory failed: {e}. Using default.")
return self._fmt_float_smart
@property
def df_formatters(self):
"""
Dispatcher: Inspects types and assigns formatters.
"""
if self._df_formatters is None:
self._df_formatters = []
# Pre-calc custom table-wide float format if it exists
custom_float = None
if self.config.table_float_format:
if callable(self.config.table_float_format):
# Wrap to handle safe calls
def safe_custom_float(x):
try:
return self.config.table_float_format(x=x)
except ValueError:
return str(x)
except Exception as e:
logger.error(f'Custom float function raised {e=}')
return str(x)
custom_float = safe_custom_float
else:
fmt = self.config.table_float_format
def safe_custom_float_str(x):
try:
return fmt.format(x=x)
except ValueError:
return str(x)
except Exception as e:
logger.error(f'Custom float format string raised {e=}')
return str(x)
custom_float = safe_custom_float_str
for i, col_name in enumerate(self.df.columns):
# 1. User overrides (highest priority)
if col_name in self.user_formatters_override:
self._df_formatters.append(self.user_formatters_override[col_name])
continue
# 2. Semantic Tags
if col_name in self.ratio_cols:
self._df_formatters.append(self._fmt_ratio)
continue
if col_name in self.year_cols:
self._df_formatters.append(self._fmt_year)
continue
if col_name in self.raw_cols:
self._df_formatters.append(self._fmt_raw)
continue
# 3. Type-Based Dispatch (The "PyArrow" logic)
dtype = self.df[col_name].dtype
# Date/Time
if (i in self.date_col_indices) or is_datetime64_any_dtype(dtype):
self._df_formatters.append(self._fmt_date_iso)
# Integer (NumPy or Arrow)
elif is_integer_dtype(dtype):
self._df_formatters.append(self._fmt_int_safe)
# Float (NumPy or Arrow)
elif is_float_dtype(dtype):
if custom_float:
self._df_formatters.append(custom_float)
else:
# Use the smart factory
self._df_formatters.append(self.make_float_formatter(self.df.iloc[:, i]))
# Default / String / Object
else:
self._df_formatters.append(self._fmt_raw)
if len(self._df_formatters) != self.df.shape[1]:
raise ValueError(f'Formatter count mismatch: {len(self._df_formatters)} != {self.df.shape[1]}')
return self._df_formatters
@staticmethod
def apply_formatters_work(df, formatters):
"""Apply formatters to a DataFrame."""
try:
# This applies the lambda functions to every cell
new_df = pd.DataFrame({
i: map(f, df.iloc[:, i])
for i, f in enumerate(formatters)
}, index=df.index) # Preserve index!
except TypeError:
print('NASTY TYPE ERROR')
raise
new_df.columns = df.columns
# OPTIMIZATION: Convert to PyArrow strings immediately.
# This enables vectorized width calculations later.
try:
return new_df.astype("string[pyarrow]")
except ImportError:
# Fallback if pyarrow not installed (unlikely given context)
return new_df.astype(str)
def apply_formatters(self, df, mode='adjusted'):
"""
Replace df (the raw df) with formatted string df.
"""
if mode == 'adjusted':
return GT.apply_formatters_work(df, self.df_formatters)
elif mode == 'raw':
data_formatters = self.df_formatters[self.nindex:]
new_body = GT.apply_formatters_work(df, data_formatters)
if not self.show_index:
return new_body
index_formatters = self.df_formatters[:self.nindex]
df_index = df.reset_index(
drop=False, col_level=self.df.columns.nlevels - 1).iloc[:, :self.nindex]
new_index = GT.apply_formatters_work(df_index, index_formatters)
new_df = pd.concat([new_index, new_body], axis=1)
new_df = new_df.set_index(list(df_index.columns))
new_df.index.names = df.index.names
return new_df
else:
raise ValueError(f'unknown mode {mode}')
# --- Knowledge DFs ---
@property
def text_knowledge_df(self):
if self._text_knowledge_df is None:
self._text_knowledge_df = self.estimate_column_widths_by_mode('text')
return self._text_knowledge_df
@property
def html_knowledge_df(self):
if self._html_knowledge_df is None:
self._html_knowledge_df = self.estimate_column_widths_by_mode('html')
return self._html_knowledge_df
@property
def tex_knowledge_df(self):
if self._tex_knowledge_df is None:
if not self.config.tikz:
self._tex_knowledge_df = self.html_knowledge_df
else:
self._tex_knowledge_df = self.estimate_column_widths_by_mode('tex')
return self._tex_knowledge_df
@property
def knowledge_dfs(self):
if self._knowledge_dfs is None:
self._knowledge_dfs = pd.concat((self.text_knowledge_df.T,
self.html_knowledge_df.T, self.tex_knowledge_df.T),
keys=['text','html', 'tex'], names=['mode', 'measure'])
self._knowledge_dfs['Total'] = self._knowledge_dfs.fillna(0.).apply(
lambda row: sum(x for x in row if pd.api.types.is_number(x)), axis=1)
idx = self._knowledge_dfs.query('Total == 0').index
self._knowledge_dfs.loc[idx, 'Total'] = ''
self._knowledge_dfs = self._knowledge_dfs.fillna('')
return self._knowledge_dfs
def width_report(self):
"""Return a report summarizing the width information."""
natural = self.text_knowledge_df.natural_width.sum()
minimum = self.text_knowledge_df.minimum_width.sum()
text = self.text_knowledge_df.recommended.sum()
h = self.html_knowledge_df.recommended.sum()
tex = self.tex_knowledge_df.recommended.sum()
tikz = self.tex_knowledge_df.tikz_colw.sum()
mtw = self.max_table_width_em
mtiw = self.config.max_table_inch_width
pts = self.config.table_font_pt_size
bit = pd.DataFrame({
'text natural': self.text_knowledge_df.natural_width,
'text minimum': self.text_knowledge_df.minimum_width,
'text recommended': self.text_knowledge_df.recommended,
'html recommended': self.html_knowledge_df.recommended,
'tex recommended': self.tex_knowledge_df.recommended,
'tikz recommended': self.tex_knowledge_df.tikz_colw,
}).fillna(0)
ser = pd.Series({
'text natural': natural,
'text minimum': minimum,
'text recommended': text,
'html recommended': h,
'tex recommended': tex,
'tikz recommended': tikz,
})
bit.loc['total', :] = ser
print(f"requested width = {mtw} em\n"
f"max tbl inch w = {mtiw} inches\n"
f"font pts = {pts} pts\n"
f"width in em chk = {mtiw * 72 / pts} em\n"
f"width mode = {self.config.table_width_mode}\n"
f"header relax = {self.config.table_width_header_adjust}\n"
f"header chars = {self.config.table_width_header_relax}")
return bit
def estimate_column_widths_by_mode(self, mode):
    """
    Return a dataframe of per-column width information for one output mode.

    :param mode: ``'text'``, ``'html'`` or ``'tex'``; selects which
        formatted frame is measured (``self.df``, ``self.df_html`` or
        ``self.df_tex``) and which length function is used.
    :return: frame indexed by column with alignment, breakability, the
        natural / acceptable / minimum widths, header widths and the
        ``recommended`` width (never below 1); ``tikz_colw`` is added in
        tex mode.
    :raises ValueError: if ``config.table_width_mode`` is not one of
        ``explicit``, ``natural``, ``breakable`` or ``minimum``.
    """
    assert mode in ('text', 'html', 'tex'), 'Only html, text and tex modes valid.'
    if mode == 'text':
        df = self.df  # string[pyarrow] when produced by apply_formatters
        try:
            # vectorised length when the string accessor is available
            if is_string_dtype(df.iloc[:, 0]):
                natural_width = df.apply(lambda x: x.str.len().max()).to_dict()
            else:
                natural_width = df.map(len).max(axis=0).to_dict()
        except Exception:
            # best-effort fallback (e.g. no columns, or .str unavailable)
            natural_width = df.map(len).max(axis=0).to_dict()
        len_function = len
        bold_adjustment = 1.0
    elif mode == 'html':
        df = self.df_html
        len_function = TextLength.text_display_len
        bold_adjustment = 1.1
        natural_width = df.map(lambda x: len_function(x.strip())).max(axis=0).to_dict()
    else:
        df = self.df_tex
        len_function = TextLength.text_display_len
        bold_adjustment = 1.1
        natural_width = df.map(lambda x: len_function(x.strip())).max(axis=0).to_dict()
    n_col = df.shape[1]

    # where a cell may be broken: after punctuation, at dashes, at
    # whitespace, and between the year and month of an ISO date
    pat = r'(?<=[.,;:!?)\]}\u2014\u2013])\s+|--*\s+|\s+'
    iso_date_split = r'(?<=\b\d{4})-(?=\d{2}-\d{2})'
    pat = f'{pat}|{iso_date_split}'
    minimum_width = {}
    header_natural = {}
    header_minimum = {}
    for col_name in df.columns:
        # minimum width = longest unbreakable piece in the column
        minimum_width[col_name] = (
            df[col_name].astype(str).str  # Ensure str accessor
            .split(pat=pat, regex=True, expand=True)
            .fillna('')
            .map(len_function)
            .max(axis=1)
            .max()
        )
        ctuple = col_name if isinstance(col_name, tuple) else (col_name, )
        # labels may be non-strings (e.g. integer columns): measure str(label)
        header_natural[col_name] = bold_adjustment * max(
            len_function(str(level)) for level in ctuple)
        header_minimum[col_name] = bold_adjustment * min(
            len_function(part) for level in ctuple for part in re.split(pat, str(level)))

    ans = pd.DataFrame({
        'alignment': [i[4:] for i in self.df_aligners],
        'break_penalties': self.break_penalties,
        'breakability': [x.name for x in self.break_penalties],
        'natural_width': list(natural_width.values()),
        'minimum_width': list(minimum_width.values()),
    }, index=df.columns)
    ans['acceptable_width'] = np.where(
        ans.break_penalties == Breakability.ACCEPTABLE, ans.minimum_width, ans.natural_width)
    ans['header_natural'] = header_natural
    ans['header_minimum'] = header_minimum
    if mode in ('html', 'tex'):
        ans['natural_width'] += 1
        ans['minimum_width'] += 1
        ans['header_natural'] += 1
        ans['header_minimum'] += 1

    # Select by name: positionally the columns are natural, minimum,
    # acceptable, so a positional slice would swap the last two totals.
    natural, acceptable, minimum = ans[
        ['natural_width', 'acceptable_width', 'minimum_width']].sum()

    if mode == 'text':
        PADDING = 2
        pad_adjustment = (PADDING + 1) * n_col - 1
    else:
        PADDING = 1
        pad_adjustment = PADDING * n_col

    width_mode = self.config.table_width_mode
    if width_mode == 'explicit':
        target_width = self.max_table_width_em - pad_adjustment
    elif width_mode == 'natural':
        target_width = natural + pad_adjustment
    elif width_mode == 'breakable':
        target_width = acceptable + pad_adjustment
    elif width_mode == 'minimum':
        target_width = minimum + pad_adjustment
    else:
        raise ValueError(f'Unknown table_width_mode {width_mode!r}')
    logger.info('table_width_mode = %s', self.config.table_width_mode)
    logger.info('config self.max_table_width_em %s', self.max_table_width_em)
    logger.info('target width after column spacer adjustment %s', target_width)

    if self.config.table_width_header_adjust > 0:
        max_extra = int(self.config.table_width_header_adjust * target_width)
    else:
        max_extra = 0

    if target_width > natural:
        ans['recommended'] = ans['natural_width']
        space = target_width - natural
        logger.info('Space for NATURAL! Spare space = %s', space)
    elif target_width > acceptable:
        ans['recommended'] = ans['acceptable_width']
        space = target_width - acceptable
        logger.info('Using "breaks acceptable" (dates not wrapped), spare space = %s', space)
    elif target_width > minimum:
        ans['recommended'] = ans['minimum_width']
        space = target_width - minimum
        logger.info('Using "minimum" (all breakable incl dates), spare space = %s', space)
    else:
        ans['recommended'] = ans['minimum_width']
        space = target_width - minimum
        logger.info('Mode %s, desired width too small, table too wide by %s em.', mode, space)
    logger.info(f'{mode=} {target_width=}, {natural=}, {acceptable=}, {minimum=}, {max_extra=}, {space=}')

    # NOTE(review): the source indentation was lost; the shortfall
    # allocation below is assumed to belong to this text-only branch
    # together with the header tweak -- confirm against the original.
    if mode == "text" and space > 0 and df.columns.nlevels == 1:
        ans['raw_recommended'] = ans['recommended']
        if max_extra > 0:
            adj = Width.header_adjustment(df, ans['recommended'], space, max_extra)
            ans['header_tweak'] = pd.Series(adj)
        else:
            ans['header_tweak'] = 0
        ans['recommended'] = ans['recommended'] + ans['header_tweak']
        ans['header_natural'] = ans['recommended']
        ans['header_minimum'] = ans['recommended']
        remaining = target_width - ans['recommended'].sum()
        ans['pre_shortfall_recommended'] = ans['recommended']
        if remaining > 0:
            wanted = ans[['natural_width', 'header_natural']].max(axis=1)
            shortfall = (wanted - ans['recommended']).clip(lower=0)
            total_shortfall = shortfall.sum()
            if total_shortfall > 0:
                logger.info('total shortfall to allocate after header adjustments = %s', total_shortfall)
                # share the spare width in proportion to each column's shortfall
                fractions = shortfall / total_shortfall
                ans['proto_recommended'] = ans['recommended'] + np.floor(fractions * remaining).astype(int)
                ans['recommended'] = np.minimum(wanted, ans['proto_recommended'])
            else:
                logger.info('no shortfall to allocate after header adjustments')

    if mode == 'tex':
        tikz_colw = {}
        for i, c in enumerate(df.columns):
            levels = c if isinstance(c, tuple) else (c,)
            head_w = max(len(str(level)) for level in levels)
            body_w = df.iloc[:, i].map(lambda x: len(str(x))).max()
            tikz_colw[c] = max(body_w, head_w)
        ans['tikz_colw'] = tikz_colw
        ans['tikz_colw'] += 2

    # NOTE(review): 'pre_space_share_recommended' is never created above
    # (the column written is 'pre_shortfall_recommended'), so neither is
    # returned; kept as in the original to leave the result shape unchanged.
    return_columns = [
        'alignment', 'break_penalties', 'breakability', 'natural_width',
        'acceptable_width', 'minimum_width', 'header_natural', 'header_minimum',
        'raw_recommended', 'header_tweak', 'pre_space_share_recommended',
        'proto_recommended', 'recommended', 'tikz_colw',
    ]
    ans = ans[[i for i in return_columns if i in ans.columns]].copy()
    ans['recommended'] = np.maximum(ans['recommended'], 1)
    return ans
def make_style(self, tabs):
    """
    Return the ``<style>`` block for this table as a string.

    All selectors are scoped to the table id ``self.df_id``. In debug mode
    the rules are drawn in distinct colours, otherwise everything is
    black. ``tabs`` is accepted but not used here.
    """
    if self.config.debug:
        palette = ('#0ff', '#f0f',
                   '#f00', '#b00', '#900',
                   '#f00', '#b00', '#0f0', '#0a0', '#090')
    else:
        palette = ('#000',) * 10
    head_tb, body_b, h0, h1, h2, bh0, bh1, v0, v1, v2 = palette
    table_hrule = self.config.table_hrule_width
    table_vrule = self.config.table_vrule_width
    padt, padr, padb, padl = self.padt, self.padr, self.padb, self.padl
    css = f'''
<style>
#{self.df_id} {{
border-collapse: collapse;
font-family: "Roboto", "Open Sans Condensed", "Arial", 'Segoe UI', sans-serif;
font-size: {self.config.font_body}em;
width: auto;
/* tb and lr
width: fit-content; */
margin: 10px auto;
border: none;
overflow: auto;
margin-left: auto;
margin-right: auto;
}}
/* center tables in quarto context
.greater-table {{
display: block;
text-align: center;
}}
.greater-table > table {{
display: inline-table;
}} */
/* try to turn off Jupyter and other formats for greater-table
all: unset => reset all inherited styles
display: revert -> put back to defaults
#greater-table * {{
all: unset;
display: revert;
}}
*/
/* tag formats */
#{self.df_id} caption {{
padding: {2 * padt}px {padr}px {padb}px {padl}px;
font-size: {self.config.font_caption}em;
text-align: {self.config.caption_align};
font-weight: normal;
caption-side: top;
}}
#{self.df_id} thead {{
/* top and bottom of header */
border-top: {table_hrule}px solid {head_tb};
border-bottom: {table_hrule}px solid {head_tb};
font-size: {self.config.font_head}em;
}}
#{self.df_id} tbody {{
/* bottom of body */
border-bottom: {table_hrule}px solid {body_b};
}}
#{self.df_id} th {{
vertical-align: bottom;
padding: {2 * padt}px {padr}px {2 * padb}px {padl}px;
}}
#{self.df_id} td {{
/* top, right, bottom left cell padding */
padding: {padt}px {padr}px {padb}px {padl}px;
vertical-align: top;
}}
/* class overrides */
#{self.df_id} .grt-hrule-0 {{
border-top: {self.config.hrule_widths[0]}px solid {h0};
}}
#{self.df_id} .grt-hrule-1 {{
border-top: {self.config.hrule_widths[1]}px solid {h1};
}}
#{self.df_id} .grt-hrule-2 {{
border-top: {self.config.hrule_widths[2]}px solid {h2};
}}
/* for the header, there if you have v lines you want h lines
hence use config.vrule_widths */
#{self.df_id} .grt-bhrule-0 {{
border-bottom: {self.config.vrule_widths[0]}px solid {bh0};
}}
#{self.df_id} .grt-bhrule-1 {{
border-bottom: {self.config.vrule_widths[1]}px solid {bh1};
}}
#{self.df_id} .grt-vrule-index {{
border-left: {table_vrule}px solid {v0};
}}
#{self.df_id} .grt-vrule-0 {{
border-left: {self.config.vrule_widths[0]}px solid {v0};
}}
#{self.df_id} .grt-vrule-1 {{
border-left: {self.config.vrule_widths[1]}px solid {v1};
}}
#{self.df_id} .grt-vrule-2 {{
border-left: {self.config.vrule_widths[2]}px solid {v2};
}}
#{self.df_id} .grt-left {{
text-align: left;
}}
#{self.df_id} .grt-center {{
text-align: center;
}}
#{self.df_id} .grt-right {{
text-align: right;
font-variant-numeric: tabular-nums;
}}
#{self.df_id} .grt-head {{
font-family: "Times New Roman", 'Courier New';
font-size: {self.config.font_head}em;
}}
#{self.df_id} .grt-bold {{
font-weight: bold;
}}
'''
    logger.info('CREATED CSS')
    # the closing tag goes on its own line after the rules
    return css + '\n' + '</style>'
def make_html(self):
    """
    Render the formatted frame as an HTML table and build its CSS.

    Nothing is returned: the cleaned table markup is stored in
    ``self._df_html_text`` and the style block in ``self._df_style_text``.
    Column widths come from the ``recommended`` column of
    ``self.html_knowledge_df`` plus an allowance for left/right padding.
    """
    level_of_name = dict(zip(self.raw_df.index.names, range(self.nindex)))
    index_change_level = self.index_change_level.map(level_of_name)
    column_change_level = self.column_change_level

    html = [f'<table id="{self.df_id}">']
    if self.caption != '':
        html.append(f'<caption>{self.caption}</caption>')

    # transpose so that each original column (with its header levels) is a row
    bit = self.df_html.T.reset_index(drop=False, allow_duplicates=True)
    idx_header = bit.iloc[:self.nindex, :self.ncolumns]
    columns = bit.iloc[self.nindex:, :self.ncolumns]

    tabs = self.html_knowledge_df['recommended'].map(lambda x: np.round(x, 3))
    tabs = np.array(tabs) + (self.padl + self.padr) / 12
    html.append('<colgroup>')
    html.extend(f'<col style="width: {w}em;">' for w in tabs)
    html.append('</colgroup>')

    # ---- header: one <tr> per column level ----
    html.append("<thead>")
    for i in range(self.ncolumns):
        html.append("<tr>")
        if self.show_index:
            html.extend(f'<th class="grt-left">{r}</th>' for r in idx_header.iloc[:, i])
        hrule = f'grt-bhrule-{i}' if i < self.ncolumns - 1 else ''
        if self.config.sparsify_columns:
            # merge runs of identical headings (down to this level) into one cell
            cum_col = 0
            run_keys = columns.iloc[:, :i + 1].apply(
                lambda x: ':::'.join(str(i) for i in x), axis=1)
            for j, (nm, g) in enumerate(groupby(run_keys)):
                nm = nm.split(':::')[-1]
                colspan = sum(1 for _ in g)
                if 0 < j:
                    vrule = f'grt-vrule-{column_change_level[cum_col]}'
                elif j == 0 and self.show_index:
                    vrule = 'grt-vrule-index'
                else:
                    vrule = ''
                align = 'grt-left' if (j == 0 and not self.show_index) else 'grt-center'
                html.append(
                    f'<th colspan="{colspan}" class="{align} {hrule} {vrule}">{nm}</th>')
                cum_col += colspan
        else:
            for j, r in enumerate(columns.iloc[:, i]):
                if 0 < j < self.ncols and i >= column_change_level[j]:
                    vrule = f'grt-vrule-{column_change_level[j]}'
                elif j == 0 and self.show_index:
                    vrule = 'grt-vrule-index'
                else:
                    vrule = ''
                html.append(f'<th class="grt-center {hrule} {vrule}">{r}</th>')
        html.append("</tr>")
    html.append("</thead>")

    # ---- body ----
    bold_idx = 'grt-bold' if self.config.font_bold_index else ''
    html.append("<tbody>")
    for i, (n, r) in enumerate(self.df_html.iterrows()):
        html.append("<tr>")
        hrule = ''
        if self.show_index:
            for j, c in enumerate(r.iloc[:self.nindex]):
                # once set, the rule class applies to the rest of the row
                if i > 0 and hrule == '' and i in index_change_level and j == index_change_level[i]:
                    hrule = f'grt-hrule-{j}'
                col_id = f'grt-c-{j}'
                html.append(
                    f'<td class="{col_id} {bold_idx} {self.df_aligners[j]} {hrule}">{c}</td>')
        for j, c in enumerate(r.iloc[self.nindex:]):
            if 0 < j < self.ncols:
                vrule = f'grt-vrule-{column_change_level[j]}'
            elif j == 0 and self.show_index:
                vrule = 'grt-vrule-index'
            else:
                vrule = ''
            col_id = f'grt-c-{j+self.nindex}'
            html.append(
                f'<td class="{col_id} {self.df_aligners[j+self.nindex]} {hrule} {vrule}">{c}</td>')
        html.append("</tr>")
    html.append("</tbody>")

    self._df_html_text = Escaping.clean_html_tex('\n'.join(html))
    logger.info('CREATED HTML')
    self._df_style_text = self.make_style(tabs)
def clean_style(self, soup):
    """
    Remove slash-star comments from the text of every <style> tag in soup.

    Nothing is touched when ``self.config.debug`` is truthy, so debug output
    keeps its CSS comments. Tags are edited in place via ``replace_with`` and
    the same soup object is returned.
    """
    if self.config.debug:
        return soup
    for tag in soup.find_all("style"):
        if not tag.string:
            # empty tag, or a tag without a single string child
            continue
        # DOTALL lets one comment span several lines
        stripped = re.sub(r'/\*.*?\*/', '', tag.string, flags=re.DOTALL)
        tag.string.replace_with(stripped)
    return soup
@property
def html(self):
    """
    Combined style + table HTML, built on first access and then cached.

    If the table markup has not been generated yet, ``make_html`` is called
    first. Style and markup are wrapped in a ``greater-table`` div, parsed,
    passed through ``clean_style`` and stored in ``self._clean_html``.
    """
    if self._clean_html != '':
        # already assembled on an earlier access
        return self._clean_html
    if self._df_html_text == '':
        self.make_html()
    pieces = (
        "<div class='greater-table'>",
        self._df_style_text,
        self._df_html_text,
        "</div>",
    )
    parsed = BeautifulSoup('\n'.join(pieces), 'html.parser')
    parsed = self.clean_style(parsed)
    self._clean_html = str(parsed)
    logger.info('CREATED COMBINED HTML and STYLE')
    return self._clean_html
def make_tikz(self):
"""
Write DataFrame to custom tikz matrix.

Works on a copy of ``self.df_tex`` and emits, in order: the container
environment with caption/label, the tikzpicture options (row and column
node styles), the matrix itself (one blank row, the header row(s), then
the body rows), the draw commands for horizontal and vertical rules, and
the closing footer. Layout options are read from ``self.config.tikz_*``.

Returns the generated text, or ``''`` when ``self.config.tikz`` is falsy.

Raises ValueError when the column labels are not unique.
"""
if not self.config.tikz:
return ''
# pull all tikz-related settings out of the config once
column_sep = self.config.tikz_column_sep
row_sep = self.config.tikz_row_sep
container_env = self.config.tikz_container_env
hrule = self.config.tikz_hrule
vrule = self.config.tikz_vrule
post_process = self.config.tikz_post_process
latex = self.config.tikz_latex
df = self.df_tex.copy()
caption = self.caption
label = self.label
# lt keeps the raw label; label becomes the full label command (or '')
if label == '':
lt = ''
label = ''
else:
lt = label
label = f'\\label{{{label}}}'
# without a caption only a TeX comment placeholder is emitted, so a label is dropped
if caption == '':
if lt != '':
logger.info(
f'You have a label but no caption; the label {label} will be ignored.')
caption = '% caption placeholder'
else:
caption = f'\\caption{{{self.caption}}}\n{label}'
# cells are looked up by column label below (colw[...]), hence the uniqueness requirement
if not df.columns.is_unique:
raise ValueError('tikz routine requires unique column names')
# header/footer are str.format templates: doubled braces are literal braces
header = """
\\begin{{{container_env}}}{latex}
{caption}
% \\centering{{
\\begin{{tikzpicture}}[
auto,
transform shape,
nosep/.style={{inner sep=0}},
table/.style={{
matrix of nodes,
row sep={row_sep}em,
column sep={column_sep}em,
nodes in empty cells,
nodes={{rectangle, scale={scale}, text badly ragged {debug}}},
"""
footer = """
{post_process}
\\end{{tikzpicture}}
% }} % close centering
\\end{{{container_env}}}
"""
nc_index = self.nindex
nr_columns = self.ncolumns
# normalise vrule to a set and add the column just after the index columns
if vrule is None:
vrule = set()
else:
vrule = set(vrule)
vrule.add(nc_index + 1)
logger.info(
f'rows in columns {nr_columns}, columns in index {nc_index}')
matrix_name = self.df_id
# colw is used as the field width when padding cell text; tabs as the text width (em) of each column
# NOTE(review): colw values are interpolated into a format spec ending in "s"; a float
# such as 12.0 would be read as width 12, precision 0 -- confirm tikz_colw is integer-valued.
colw = self.tex_knowledge_df['tikz_colw'].fillna(0).round(3)
tabs = self.tex_knowledge_df['recommended'].map(lambda x: np.round(x, 3))
# ad: alignment letter -> tikz align keyword; ad2: alignment letter -> str.format alignment char
ad = {'l': 'left', 'r': 'right', 'c': 'center'}
ad2 = {'l': '<', 'r': '>', 'c': '^'}
# translate the grt-* aligner class names to l/r/c, falling back to left
align = []
for n, i in zip(df.columns, self.df_aligners):
if i == 'grt-left':
align.append('l')
elif i == 'grt-right':
align.append('r')
elif i == 'grt-center':
align.append('c')
else:
align.append('l')
sio = StringIO()
# optional argument for the container environment, wrapped in brackets
if latex is None:
latex = ''
else:
latex = f'[{latex}]'
# in debug mode every node gets a faint blue outline
if self.config.debug:
debug = ', draw=blue!10'
else:
debug = ''
sio.write(header.format(container_env=container_env,
caption=caption,
scale=self.config.tikz_scale,
column_sep=column_sep,
row_sep=row_sep,
latex=latex,
debug=debug))
# row 1 is a zero-height row (filled with blank cells further down)
i = 1
sio.write(
f'\trow {i}/.style={{nodes={{text=black, anchor=north, inner ysep=0, text height=0, text depth=0}}}},\n')
# rows 2 .. nr_columns+1 are the header rows: bold, centered
for i in range(2, nr_columns + 2):
sio.write(
f'\trow {i}/.style={{nodes={{text=black, anchor=south, inner ysep=.2em, minimum height=1.3em, font=\\bfseries, align=center}}}},\n')
# header cells that sit above the index columns: bold italic, left aligned
for i in range(2, nr_columns + 2):
for j in range(1, 1+nc_index):
sio.write(
f'\trow {i} column {j}/.style='
'{nodes={font=\\bfseries\\itshape, align=left}},\n'
)
# per-column style: alignment plus a text width of at least 2em; column 1 also sets text height/depth
for i, w, al in zip(range(1, len(align) + 1), tabs, align):
if i == 1:
sio.write(f'\tcolumn {i:>2d}/.style={{'
f'nodes={{align={ad[al]:<6s}}}, '
'text height=0.9em, text depth=0.2em, '
f'inner xsep={column_sep}em, inner ysep=0, '
f'text width={max(2, w):.2f}em}},\n')
else:
sio.write(f'\tcolumn {i:>2d}/.style={{'
f'nodes={{align={ad[al]:<6s}}}, nosep, text width={max(2, w):.2f}em}},\n')
# one extra zero-width column after the last data column (every matrix row ends with an extra separator)
sio.write(
f'\tcolumn {i+1:>2d}/.style={{text height=0.9em, text depth=0.2em, nosep, text width=0em}}\n')
sio.write('\t}]\n')
sio.write("\\matrix ({matrix_name}) [table, ampersand replacement=\\&]{{\n".format(
matrix_name=matrix_name))
# matrix row 1: blank cells only
nl = ''
for cn, al in zip(df.columns, align):
s = f'{nl} {{cell:{ad2[al]}{colw[cn]}s}} '
nl = '\\&'
sio.write(s.format(cell=' '))
sio.write('\\& \\\\\n')
# header rows: one per column level; MultiIndex levels are sparsified and their
# second return value is kept in mi_vrules for the vertical rules drawn later
mi_vrules = {}
sparse_columns = {}
if isinstance(df.columns, pd.MultiIndex):
for lvl in range(len(df.columns.levels)):
nl = ''
sparse_columns[lvl], mi_vrules[lvl] = Sparsify.sparsify_mi(df.columns.get_level_values(lvl),
lvl == len(df.columns.levels) - 1)
for cn, c, al in zip(df.columns, sparse_columns[lvl], align):
s = f'{nl} {{cell:{ad2[al]}{colw[cn]}s}} '
nl = '\\&'
sio.write(s.format(cell=c + '\\I'))
sio.write('\\& \\\\\n')
else:
nl = ''
for c, al in zip(df.columns, align):
s = f'{nl} {{cell:{ad2[al]}{colw[c]}s}} '
nl = '\\&'
sio.write(s.format(cell=str(c) + '\\I'))
sio.write('\\& \\\\\n')
# body rows, each cell padded to its column's width
for idx, row in df.iterrows():
nl = ''
for c, cell, al in zip(df.columns, row, align):
s = f'{nl} {{cell:{ad2[al]}{colw[c]}s}} '
nl = '\\&'
sio.write(s.format(cell=cell))
sio.write('\\& \\\\\n')
sio.write(f'}};\n\n')
# nr becomes the total number of matrix rows: body rows + header rows + the blank first row
nr, nc = df.shape
nr += nr_columns + 1
# map a row offset to a matrix row number: non-negative values are shifted past the
# blank row and the header rows; negative values are offset from nr
def python_2_tex(x):
return x + nr_columns + 2 if x >= 0 else nr + x + 3
# rules that are always drawn: below the last header row and at the bottom of the table
tb_rules = [nr_columns + 1, python_2_tex(-1)]
# note: hrule ends up a set when user rules are configured and a list otherwise
if hrule:
hrule = set(map(python_2_tex, hrule)).union(tb_rules)
else:
hrule = list(tb_rules)
logger.debug(f'hlines: {hrule}')
yshift = row_sep / 2
xshift = -column_sep / 2
descender_proportion = 0.25
# top rule: thick line along the south edge of row 1
ls = 'thick'
ln = 1
sio.write(
f'\\path[draw, {ls}] ({matrix_name}-{ln}-1.south west) -- ({matrix_name}-{ln}-{nc+1}.south east);\n')
# horizontal rules; rows at or beyond nr are drawn on the last row's baseline, lowered by the descender allowance
# NOTE(review): nr already includes nr_columns + 1 at this point, so the 'thick'
# test against nr + nr_columns + 1 may never match -- confirm the intended row.
for ln in hrule:
ls = 'thick' if ln == nr + nr_columns + \
1 else ('semithick' if ln == 1 + nr_columns else 'very thin')
if ln < nr:
sio.write(f'\\path[draw, {ls}] ([yshift={-yshift}em]{matrix_name}-{ln}-1.south west) -- '
f'([yshift={-yshift}em]{matrix_name}-{ln}-{nc+1}.south east);\n')
else:
ln = nr
sio.write(f'\\path[draw, thick] ([yshift={-descender_proportion-yshift}em]{matrix_name}-{ln}-1.base west) -- '
f'([yshift={-descender_proportion-yshift}em]{matrix_name}-{ln}-{nc+1}.base east);\n')
# with several header rows: thin rules under header rows 2..nr_columns, starting at the first non-index column
if nr_columns > 1:
for ln in range(2, nr_columns + 1):
sio.write(f'\\path[draw, very thin] ([xshift={xshift}em, yshift={-yshift}em]'
f'{matrix_name}-{ln}-{nc_index+1}.south west) -- '
f'([yshift={-yshift}em]{matrix_name}-{ln}-{nc+1}.south east);\n')
# written tracks column numbers that should not receive another vertical rule; starts with the index columns
written = set(range(1, nc_index + 1))
# configured vertical rules are drawn on the west side of column cn, only when the index is shown
if vrule and self.show_index:
ls = 'very thin'
for cn in vrule:
if cn not in written:
sio.write(f'\\path[draw, {ls}] ([xshift={xshift}em]{matrix_name}-1-{cn}.south west) -- '
f'([yshift={-descender_proportion-yshift}em, xshift={xshift}em]{matrix_name}-{nr}-{cn}.base west);\n')
# records cn - 1 because the MultiIndex rules below are drawn on the east side of a column
written.add(cn - 1)
# vertical rules from the MultiIndex sparsification; the loop stops at the last level
if len(mi_vrules) > 0:
logger.debug(
f'Generated vlines {mi_vrules}; already written {written}')
ls = 'ultra thin'
for k, cols in mi_vrules.items():
if k == len(mi_vrules) - 1:
break
for cn in cols:
if cn in written:
pass
else:
written.add(cn)
top = k + 1
# NOTE(review): keys of mi_vrules come from range(...), so top >= 1 and the
# top == 0 branch looks unreachable -- confirm.
if top == 0:
sio.write(f'\\path[draw, {ls}] ([xshift={-xshift}em]{matrix_name}-{top}-{cn}.south east) -- '
f'([yshift={-descender_proportion-yshift}em, xshift={-xshift}em]{matrix_name}-{nr}-{cn}.base east);\n')
else:
sio.write(f'\\path[draw, {ls}] ([xshift={-xshift}em, yshift={-yshift}em]{matrix_name}-{top}-{cn}.south east) -- '
f'([yshift={-descender_proportion-yshift}em, xshift={-xshift}em]{matrix_name}-{nr}-{cn}.base east);\n')
sio.write(footer.format(container_env=container_env,
post_process=post_process))
# NOTE(review): all() over a DataFrame iterates its column labels, not its values, so
# this does not compare cell contents -- confirm whether df.equals(self.df_tex) was intended.
if not all(df == self.df_tex):
logger.error('In tikz and df has changed...')
return sio.getvalue()
def make_rich(self, console, box_style=box.SQUARE):
    """
    Build a rich table for the wrapped dataframe and return it.

    Column widths and alignments come from ``self.text_knowledge_df``;
    row separator lines are shown when the first configured hrule width is
    positive. The table is also kept on ``self._rich_table``.

    ``console`` is accepted but not referenced in the body.
    """
    widths = self.text_knowledge_df['recommended']
    alignments = self.text_knowledge_df['alignment']
    with_lines = self.config.hrule_widths[0] > 0
    rich_table = RichOutput.make_rich_table(
        self.df,
        widths,
        alignments,
        num_index_columns=self.nindex,
        title=self.caption,
        show_lines=with_lines,
        box_style=box_style,
    )
    self._rich_table = rich_table
    return rich_table
def make_string(self):
    """
    Return the plain-text rendering of the table (custom, not Tabulate).

    An empty dataframe yields ``""``. Otherwise the text is built once with
    ``TextOutput.make_text_table`` and cached on ``self._string``.
    """
    if self.df.empty:
        return ""
    if self._string != "":
        # rendered on an earlier call
        return self._string
    widths = self.text_knowledge_df['recommended']
    alignments = self.text_knowledge_df['alignment']
    self._string = TextOutput.make_text_table(
        self.df, widths, alignments, index_levels=self.nindex)
    return self._string
def make_svg(self):
    """
    Render the tikz output into svg text.

    An ``Etcher`` is created from ``self._repr_latex_()``; if its ``.svg``
    file does not exist yet, ``process_tikz`` is run to produce it. A
    ``ValueError`` from that step is printed and the sentinel string
    ``"no svg output"`` is returned instead of svg text.
    """
    etcher = Etcher(
        self._repr_latex_(),
        self.config.table_font_pt_size,
        file_name=self.df_id,
    )
    svg_path = etcher.file_path.with_suffix('.svg')
    if svg_path.exists():
        # reuse the svg produced by an earlier run
        return svg_path.read_text()
    try:
        etcher.process_tikz()
    except ValueError as err:
        print(err)
        return "no svg output"
    return svg_path.read_text()
def save_html(self, fn):
    """
    Write the table to a standalone HTML page.

    The target's parent directories are created if needed and the suffix is
    forced to ``.html``. The page is the boilerplate below with
    ``{table_html}`` replaced by ``self.html``, prettified and written as
    UTF-8. The final path is printed and logged.
    """
    page_template = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Greater Table</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="https://fonts.googleapis.com/css2?family=Roboto&family=Open+Sans+Condensed:ital,wght@0,300;1,300&display=swap" rel="stylesheet">
<script defer src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<style>
body {
font-family: "Roboto", "Open Sans Condensed", "Arial", 'Segoe UI', sans-serif;
margin: 2em;
background: #fff;
color: #000;
}
</style>
</head>
<body>
<h1>Rendered Table</h1>
{table_html}
</body>
</html>
'''
    target = Path(fn)
    target.parent.mkdir(parents=True, exist_ok=True)
    target = target.with_suffix('.html')
    print(target)
    # plain replace (not str.format): the template contains literal CSS braces
    page_text = page_template.replace('{table_html}', self.html)
    page = BeautifulSoup(page_text, 'html.parser')
    target.write_text(page.prettify(), encoding='utf-8')
    logger.info(f'Saved to {target}')
def show_svg(self):
    """
    Display the svg rendering in Jupyter.

    When ``make_svg`` returns its ``'no svg output'`` sentinel, a short
    message is printed instead.
    """
    svg_text = self.make_svg()
    if svg_text == 'no svg output':
        print('No SVG file available (TeX compile error).')
        return
    display(SVG(svg_text))
def show_html(self, fn=''):
    """
    Save the table as an HTML page and open it in the default browser.

    Parameters
    ----------
    fn : str or path-like, optional
        Target file. When empty (the default) a persistent temporary
        ``.html`` file is created. Otherwise the suffix is normalised to
        ``.html``, which is the path ``save_html`` actually writes to.

    Returns
    -------
    pathlib.Path
        Path of the HTML file that was written and opened.
    """
    # function-scope import: only needed on platforms without os.startfile
    import webbrowser

    if fn == '':
        # delete=False keeps the file alive after the handle is closed
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp:
            tmp_path = Path(tmp.name)
    else:
        # save_html forces a .html suffix; use the same path here so the
        # file that gets opened (and returned) is the one that was written
        tmp_path = Path(fn).with_suffix('.html')
    self.save_html(fn=tmp_path)
    opener = getattr(os, 'startfile', None)
    if opener is not None:
        # os.startfile exists only on Windows
        opener(tmp_path)
    else:
        # portable fallback; as_uri() needs an absolute path
        webbrowser.open(tmp_path.resolve().as_uri())
    return tmp_path
@staticmethod
def uber_test(df, show_html=False, **kwargs):
    """
    Build a GT from df and emit every output format plus diagnostics.

    In order: display the GT object, optionally open it in a browser
    (``show_html=True``), print it, show the svg, display the raw dataframe,
    display the width report, and print the tikz source. Extra keyword
    arguments are passed to the ``GT`` constructor. Returns the GT object.
    """
    table = GT(df, **kwargs)
    display(table)
    if show_html:
        table.show_html()
    print(table)
    table.show_svg()
    display(df)
    report = table.width_report()
    display(report)
    tikz_source = table.make_tikz()
    print(tikz_source)
    return table
@staticmethod
def _is_namedtuple_instance(x) -> bool:
    """Heuristic check: x is a tuple and its class has a tuple-valued ``_fields``."""
    if not isinstance(x, tuple):
        return False
    declared_fields = getattr(type(x), "_fields", None)
    return isinstance(declared_fields, tuple)
@staticmethod
def _ntdf(t):
    """
    Turn a named tuple into a one-column dataframe for display.

    The field names become the index (named "Item") and the values go in a
    single column called "Value".
    """
    item_index = pd.Index(t._fields, name="Item")
    values = pd.Series(t, index=item_index)
    return values.to_frame('Value')