Working draft of 3.0.0

Added robust testdf
This commit is contained in:
Stephen Mildenhall
2025-06-14 13:25:26 +01:00
parent ec3ac66552
commit 863ee43b43
12 changed files with 6361 additions and 5825 deletions
View File
File diff suppressed because it is too large Load Diff
+119
View File
@@ -0,0 +1,119 @@
"""
Find and process blobs of TeX.
Change target directory to find other blobs.
"""
from pathlib import Path
import re
import subprocess
import pandas as pd
class TeXMacros():
    """
    A class for dealing with TeX macros.

    Holds a fixed list of def-style macros (``_macros``) and expands them
    in arbitrary text. Made out of PublisherBase in blog_tools.py from
    great2.blog.
    """

    # One macro per line, each starting at column 0 (the splitter expects
    # the line to begin with the def keyword).
    _macros = r"""
\def\AA{\mathcal{A}}
\def\atan{\mathrm{atan}}
\def\AVaR{\mathsf{AVaR}}
\def\bbeta{\mathbf{\beta}}
\def\bb{\mathbf b}
\def\bm{\mathbf }
\def\biTVaR{\mathsf{biTVaR}}
\def\corr{\mathsf{Corr}}
\def\cov{\mathsf{cov}}
\def\cp{\mathsf{CP}}
\def\CTE{\mathsf{CTE}}
\def\CVaR{\mathsf{CVaR}}
\def\dint{\displaystyle\int}
\def\dsum{\displaystyle\sum}
\def\ecirc{\accentset{\circ} e}
\def\ecirc{\accentset{\circ} e}
\def\EPD{\mathsf{EPD}}
\def\ES{\mathsf{ES}}
\def\E{\mathsf{E}}
\def\FFF{\mathscr{F}}
\def\FF{\mathcal{F}}
\def\HH{\mathbf{H}}
\def\kpx{{{}_kp_x}}
\def\MM{\mathcal{M}}
\def\NN{\mathbb{N}}
\def\nudge{2}
\def\norm{}
\def\OO{\mathscr{O}}
\def\PPP{\mathscr{P}}
\def\PP{\mathsf{P}}
\def\Pr{\mathsf{Pr}}
\def\QQ{\mathsf{Q}}
\def\RR{\mathbb{R}}
\def\SD{\mathsf{SD}}
\def\TCE{\mathsf{TCE}}
\def\TVaR{\mathsf{TVaR}}
\def\Var{\mathsf{Var}}
\def\var{\mathsf{var}}
\def\VaR{\mathsf{VaR}}
\def\WCE{\mathsf{WCE}}
\def\ww{\mathbf{w}}
\def\XXX{\mathcal{X}}
\def\xx{\mathbf{x}}
\def\XX{\mathbf{X}}
\def\yy{\mathbf{y}}
\def\ZZZ{\mathcal{Z}}
\def\ZZ{\mathbb{Z}}
"""

    @staticmethod
    def process_tex_macros(text):
        """
        Expand standard general.tex macros in the text.

        A macro is only expanded when it is a complete TeX control word,
        i.e. it is not immediately followed by another letter. Without that
        guard ``\\var`` would fire inside ``\\varepsilon`` and ``\\bm``
        inside ``\\bmatrix``.

        :param text: text containing TeX.
        :return: text with every known macro replaced by its definition.
        """
        m, regex = TeXMacros.tex_to_dict(TeXMacros._macros.strip())
        if not m:
            return text
        # negative lookahead = end of the control word
        pattern = rf'(?:{regex})(?![A-Za-z])'
        # a callable replacement keeps backslashes in the body literal
        return re.sub(pattern, lambda x: m[x[0]], text)

    @staticmethod
    def tex_to_dict(text):
        """
        Convert text, a series of def-style macros (one per line), into a
        dictionary.

        Blank lines are ignored. Later duplicates overwrite earlier ones.

        :param text: newline separated macro definitions.
        :return: tuple ``(m, regex)`` where ``m`` maps macro name (with its
            leading backslash) to its body and ``regex`` is an alternation
            of all escaped names, longest first so that a name is never
            shadowed by one of its prefixes.
        """
        pairs = [TeXMacros.tex_splitter(line)
                 for line in text.split('\n') if line.strip()]
        m = dict(pairs)
        # sorted is stable, so equal-length names keep their input order
        ordered = sorted(m, key=len, reverse=True)
        regex = '|'.join(re.escape(k) for k in ordered)
        return m, regex

    @staticmethod
    def tex_splitter(x):
        """
        Split a single def-style tex macro into name and body.

        :param x: one macro definition, e.g. the def keyword followed by
            the macro name and a brace-delimited body.
        :return: tuple ``(name, body)``; the body is the text between the
            first opening brace and the final character of the line.
        """
        x = x.strip()
        # drop only the leading def keyword; the body may legitimately
        # contain the same characters
        x = re.sub(r'^\\def\s*(?=\\)', '', x)
        i = x.find('{')
        return x[:i], x[i + 1:-1]
def find_tex_snippeets(in_dir='\\S\\TELOS\\PIR\\docs',
                       out_file='tex_list.csv'):
    """
    Ripgrep / TeX macro expand list of TeX snippets.

    Searches every ``*.md`` file below ``in_dir`` for inline ``$...$``
    expressions, expands the standard macros, removes duplicates and
    returns the result as a one-column DataFrame.

    :param in_dir: directory to search. The default is a machine-specific
        path; pass your own directory when running elsewhere.
    :param out_file: file name for a csv copy, written next to this
        module. Use ``''`` to skip writing.
    :return: DataFrame with a single column ``expr``, sorted so repeated
        runs over the same input produce the same file.
    :raises subprocess.CalledProcessError: if ripgrep reports an error.
    """
    result = subprocess.run(
        ['rg', '-N', '-o', '--no-filename', '-g', '*.md', r'\$.+?\$', in_dir],
        capture_output=True,
        text=True,
        check=False,
    )
    # ripgrep exits with 1 when nothing matched; only other non-zero
    # codes are real failures.
    if result.returncode not in (0, 1):
        raise subprocess.CalledProcessError(
            result.returncode, result.args,
            output=result.stdout, stderr=result.stderr)
    expanded = TeXMacros.process_tex_macros(result.stdout)
    # drop blanks, anything still containing \PP, and display-math markers
    snippets = sorted({
        s for s in expanded.split('\n')
        if s and '\\PP' not in s and '$$' not in s
    })
    df = pd.DataFrame({'expr': snippets})
    if out_file != '':
        p = Path(__file__).parent / out_file
        print(p)
        df.to_csv(p, encoding='utf-8')
    return df
+4 -4
View File
@@ -62,11 +62,11 @@ class GTConfigModel(BaseModel):
table_vrule_width: int = Field(
1, description="Width of vertical rule separating index from body"
)
hrule_widths: Optional[tuple[int, int, int]] = Field(
(0, 0, 0), description="Tuple of three ints for horizontal rule widths (for multiindex use)"
hrule_widths: Optional[tuple[float, float, float]] = Field(
(0, 0, 0), description="Tuple of three floats for horizontal rule widths (for multiindex use)"
)
vrule_widths: Optional[tuple[int, int, int]] = Field(
(0, 0, 0), description="Tuple of three ints for vertical rule widths (for multiindex columns)"
vrule_widths: Optional[tuple[float, float, float]] = Field(
(0, 0, 0), description="Tuple of three floats for vertical rule widths (for multiindex columns)"
)
sparsify: bool = Field(
+5 -1
View File
@@ -261,7 +261,11 @@ class GT(object):
base_config = GTConfigModel()
# access through config
self.config = base_config.model_copy(update=overrides)
# update and validate; need to merge to avoid repeated args
merged = dict(base_config.model_dump(), **overrides)
self.config = GTConfigModel(**merged)
# no validation
# self.config = base_config.model_copy(update=overrides)
# deal with alternative input modes for df: None, DataFrame, Series, markdown text table
if df is None:
+8
View File
@@ -26,3 +26,11 @@ def df_short_hash(df, length=12):
hash_str = base64.b32encode(hash_bytes).decode("utf-8").rstrip("=") # Trim padding
return f"T{hash_str[:length]}" # Prefix with 'T' to ensure a valid ID
def txt_short_hash(txt):
    """
    Return a short deterministic identifier for a text string.

    The UTF-8 bytes of ``txt`` are hashed with MD5, the digest is base32
    encoded with the ``=`` padding removed, and every second character of
    that encoding is kept.

    :param txt: text to hash.
    :return: 13 character string of upper-case letters and digits.
    """
    digest = hashlib.md5(txt.encode('utf-8')).digest()
    encoded = base64.b32encode(digest).decode("utf-8").rstrip("=")
    # keep characters 0, 2, 4, ... to halve the length
    return encoded[::2]
+52 -16
View File
@@ -5,7 +5,8 @@ GPT from SJMM design.
"""
from datetime import datetime, timedelta
from itertools import cycle
from importlib.resources import files
from itertools import cycle, chain
from math import prod
from pathlib import Path
from typing import Optional, Union
@@ -81,21 +82,39 @@ class TestDataFrameFactory:
self._index_namer = cycle(nwl)
# read words and create cycler
p = Path(__file__).parent / 'words-12.md'
assert p.exists()
txt = p.read_text(encoding='utf-8')
data_path = files('greater_tables').joinpath('data', 'words-12.md')
with data_path.open('r', encoding='utf-8') as f:
txt = f.read()
word_list = txt.split('\n')
temp = word_list[:]
random.shuffle(temp)
self._word_gen = cycle(temp)
# read tex expressions and create cycler
tex_list = pd.read_csv(Path(__file__).parent /
'tex_list.csv')['expr'].to_list()
data_path = files('greater_tables').joinpath('data', 'tex_list.csv')
with data_path.open('r', encoding='utf-8') as f:
tex_list = pd.read_csv(f, index_col=0)['expr'].to_list()
# trim down slightly
tex_list = [i for i in tex_list if len(i) < 50]
random.shuffle(tex_list)
self._tex_gen = cycle(tex_list)
self.simple_namer = {
'd': 'date',
'f': 'float',
'h': 'hash',
'i': 'integer',
'l': 'large_float',
'm': 'yr-mo',
'p': 'path',
'r': 'ratio',
's': 'string',
't': 'time',
'v': 'extreme_float',
'x': 'tex',
'y': 'year',
}
# lengths of index (word count) sampled from:
self.index_value_lengths = [1]*10 + [2] * 4 + [3]
@@ -113,13 +132,14 @@ class TestDataFrameFactory:
l log float (greater range than float)
m year - month
p path (filename)
r ratio (smaller floats, for percents)
sx string length x
t time
v very large range float
x tex text - an equation
y year
Args:
Args:
rows: Number of rows.
columns: Column type spec (int for all float cols, or string type codes).
index: Index level types (int for RangeIndex or string like 'ti').
@@ -168,7 +188,7 @@ class TestDataFrameFactory:
['d', 'f', 'i', 's3', 'l', 'h', 't', 'p'], size=self.rng.integers(3, 7))
missing = round(float(self.rng.uniform(0, 0.15)), 2)
index = ''.join(self.rng.choice(
['t', 'd', 'i', 's2'], size=index_levels))
['t', 'd', 'y', 'i', 's2'], size=index_levels))
col_index = ''.join(self.rng.choice(
['s', 's2', 's2', 's3'], size=column_levels))
return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)
@@ -182,18 +202,22 @@ class TestDataFrameFactory:
else:
col_types = self._parse_colspec(columns)
# if col_index is an int then use all strings of that depth
if isinstance(col_index, int):
col_index_types = ['s'] * col_index
if col_index == 'simple':
col_idx = map(self.simple_namer.get, [i[0] for i in col_types])
col_idx = pd.Index(col_idx, name='simple')
else:
col_index_types = self._parse_colspec(col_index)
if isinstance(col_index, int):
col_index_types = ['s'] * col_index
else:
col_index_types = self._parse_colspec(col_index)
col_idx = self._make_index(col_index_types, len(col_types))
if isinstance(index, int):
index = ['s'] * index
else:
index = self._parse_colspec(index)
print(index)
# print(index)
# col names are a transposed index.
df = pd.DataFrame(index=range(rows))
col_idx = self._make_index(col_index_types, len(col_types))
for dt, c in zip(col_types, range(len(col_idx))):
df[c] = self._generate_column(dt, rows)
df.columns = col_idx
@@ -210,15 +234,23 @@ class TestDataFrameFactory:
return pd.Series([" ".join(self.word() for i in range(max_words)) for j in range(n)])
if dtype == 'f':
return pd.Series(self.rng.normal(loc=100000, scale=250000, size=n))
if dtype == 'r':
return pd.Series(self.rng.normal(loc=0.5, scale=0.35, size=n))
if dtype == 'l':
# log float (greater range)
return pd.Series(np.exp(self.rng.normal(loc=-4 / 2 + 4, scale=4, size=n)))
if dtype == 'v':
# log float (greater range)
sc = 5
return pd.Series(np.exp(self.rng.normal(loc=-sc**2 / 2 + 10, scale=sc, size=n)))
if dtype == 'i':
return pd.Series(self.rng.integers(-1e4, 1e6, size=n), dtype='int64')
if dtype == 'd':
start_date = TestDataFrameFactory.random_date_within_last_n_years(
10)
return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
if dtype == 'y':
return pd.Series(random.sample(range(1990, 2031), n))
if dtype == 't':
start_dt = datetime.now() - timedelta(days=365 * 2)
return pd.Series([
@@ -347,7 +379,11 @@ class TestDataFrameFactory:
for w, k in zip(level_value_lengths, level_choices)]
x = [[next(j) for j in r] for i in range(rows)]
names = random.sample(name_word_list, levels)
idx = pd.MultiIndex.from_tuples(
random.sample(x, rows), names=names).sort_values()
if levels == 1:
idx = pd.Index(
list(chain.from_iterable(random.sample(x, rows))), name=names[0]).sort_values()
else:
idx = pd.MultiIndex.from_tuples(
random.sample(x, rows), names=names).sort_values()
assert idx.is_unique
return idx
File diff suppressed because it is too large Load Diff
+188
View File
@@ -0,0 +1,188 @@
"""
Create and display svg files from tikz tex tables.
Good for testing.
From great2.blog
"""
from datetime import datetime
import pandas as pd
from pathlib import Path
import re
import yaml
from itertools import count
from subprocess import Popen, PIPE
from IPython.display import display, Markdown, SVG
from . hasher import txt_short_hash
class TikzProcessor():
    """
    Turn a TikZ blob embedded in text into a pdf and then an svg file.

    Output files are written to a ``tikz`` directory below ``base_path``
    and are named by a short hash of the input text.
    """

    # Full stand-alone preamble. Braces are doubled because the string is
    # meant to be passed through str.format.
    _tex_template_full = """\\documentclass[10pt, border=5mm]{{standalone}}
% needs lualatex - uncomment for Wiley fonts
%\\usepackage{{fontspec}}
%\\setmainfont{{Stix Two Text}}
%\\usepackage{{unicode-math}}
%\\setmathfont{{Stix Two Math}}
\\usepackage{{amsfonts}}
\\usepackage{{url}}
\\usepackage{{tikz}}
\\usepackage{{color}}
\\usetikzlibrary{{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}}
\\usetikzlibrary{{automata}}
\\usetikzlibrary{{fit}}
\\usetikzlibrary{{snakes}}
\\usetikzlibrary{{intersections}}
\\usetikzlibrary{{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}}
\\usetikzlibrary{{decorations.fractals,decorations.footprints}}
\\usetikzlibrary{{graphs}}
\\usetikzlibrary{{matrix}}
\\usetikzlibrary{{shapes.geometric}}
\\usetikzlibrary{{mindmap, shadows}}
\\usetikzlibrary{{backgrounds}}
\\usetikzlibrary{{cd}}
% really common macros
\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
\\def\\dfrac{{\\displaystyle\\frac}}
\\def\\dint{{\\displaystyle\\int}}
\\begin{{document}}
{tikz_begin}{tikz_code}{tikz_end}
\\end{{document}}
"""
    # --------------------------------------------
    # Body-only template: has no documentclass, so it relies on the
    # preamble being supplied by a precompiled format file.
    _tex_template = """
% really common macros
\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
\\def\\dfrac{{\\displaystyle\\frac}}
\\def\\dint{{\\displaystyle\\int}}
\\begin{{document}}
{tikz_begin}{tikz_code}{tikz_end}
\\end{{document}}
"""

    def split_tikz(self):
        """
        Split text to get the tikzpicture. Format is
        initial text up to the first begin tag, then groups of four:

        1. begin tag ``(1::4)``
        2. tikz code ``(2::4)``
        3. end tag ``(3::4)``
        4. non-related text ``(4::4)``

        :return: list of pieces; the tags are kept because the split
            pattern is a capturing group.
        """
        return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', self.txt)

    def __init__(self, txt, base_path='.', tex_engine='pdflatex'):
        """
        TikzProcessor (from TikzConverter): process a tex tikz text string into svg.

        The program

        * creates a pdf and svg from the tikz blob

        lualatex is more robust, but slower...
        pdflatex can't handle the fancy wiley fonts

        :param txt: text containing a tikzpicture or tikzcd environment.
        :param base_path: directory below which the ``tikz`` output
            directory is created.
        :param tex_engine: ``'pdflatex'`` (uses a precompiled format file)
            or anything else to run lualatex.
        """
        self.txt = txt
        self.tex_engine = tex_engine
        # directory for TeX and images
        self.base_path = Path(base_path).resolve()
        self.out_path = self.base_path / 'tikz'
        self.out_path.mkdir(exist_ok=True)
        self.file_path = self.out_path / txt_short_hash(txt)

    def process_tikz(self, verbose=False):
        """
        Process the tikz into pdf and svg.

        :param verbose: print the commands being run and keep the
            intermediate tex/aux/log/pdf files.
        :raises ValueError: if the text holds no complete tikz
            environment, or if the TeX run writes to stderr.
        :raises FileNotFoundError: if the pdflatex format file is missing.
        """
        # container contains a tikzpicture
        svg_path = self.file_path.with_suffix('.svg')
        tex_path = self.file_path.with_suffix('.tex')

        # make tex code for a stand-alone document
        parts = self.split_tikz()
        if len(parts) < 4:
            raise ValueError(
                'No complete tikzpicture or tikzcd environment found in text.')
        tikz_begin, tikz_code, tikz_end = parts[1:4]
        tex_code = self._tex_template.format(
            tikz_begin=tikz_begin, tikz_code=tikz_code, tikz_end=tikz_end)
        tex_path.write_text(tex_code, encoding='utf-8')
        print(
            f'TIKZ: created temp file = {tex_path.name}')

        pdf_file = tex_path.with_suffix('.pdf')
        print('TIKZ: Update pdf file')
        if self.tex_engine == 'pdflatex':
            # faster with template
            # TODO EVID hard coded template (looked up in the current
            # working directory)
            template_path = Path('tikz_format.fmt')
            if not template_path.exists():
                raise FileNotFoundError(
                    f'pdflatex format file not found: {template_path.resolve()}')
            template = str(template_path)
            command = ['pdflatex', f'--fmt={template}',
                       f'--output-directory={str(tex_path.parent.resolve())}',
                       str(tex_path.resolve())]
        else:
            # for STIX fonts, no template
            command = ['lualatex',
                       f'--output-directory={str(tex_path.parent.resolve())}',
                       str(tex_path.resolve())]
        if verbose:
            print(f'TIKZ: TeX Command={" ".join(command)}')
        TikzProcessor.run_command(command)
        # to recreate
        (tex_path.parent / 'make_tikz.bat').write_text(
            " ".join(command), encoding='utf-8')

        if verbose:
            print(
                'TIKZ: Creating svg file for Tikz (using new pdf2svg util)')
        # https://github.com/jalios/pdf2svg-windows
        command = [
            'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg',
            str(pdf_file.resolve()), str(svg_path.resolve())]
        # seems to return info on stderr?
        if verbose:
            print(f'PDF->SVG: {" ".join(command)}')
        TikzProcessor.run_command(command, flag=False)

        if not verbose:
            # tidy up; tolerate files the TeX run did not produce
            for ext in ('.tex', '.aux', '.log'):
                tex_path.with_suffix(ext).unlink(missing_ok=True)
            pdf_file.unlink(missing_ok=True)

    @staticmethod
    def run_command(command, flag=True):
        """
        Run a command and show results. Allows for weird xx behavior.

        :param command: argument list passed to ``Popen``.
        :param flag: if True, any output on stderr raises ``ValueError``;
            if False it is printed instead.
        :return: the process exit code.
        :raises ValueError: when ``flag`` is True and stderr is not empty.
        """
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            # communicate drains both pipes together; reading stdout to
            # EOF first can block forever once the stderr pipe fills
            line1, line2 = p.communicate()
            exit_code = p.returncode
        if line1:
            # only the tail: TeX logs are long
            print('\n' + line1[-250:])
        if line2:
            if flag:
                raise ValueError(line2)
            else:
                print(line2)
        return exit_code

    def display(self):
        """display in Jupyter Lab."""
        display(SVG(self.file_path.with_suffix('.svg')))
+133
View File
@@ -0,0 +1,133 @@
"""
Create and display SVG files from TikZ pictures embedded in LaTeX.
Good for testing. Outputs are cached by hash. PDF→SVG uses pdf2svg.
GPT re-write of my old great2.blog code.
"""
import re
from pathlib import Path
from subprocess import run, Popen, PIPE
from IPython.display import SVG, display
from .hasher import txt_short_hash
class TikzProcessor:
    """
    Compile a TikZ picture embedded in text to PDF and convert it to SVG.

    Files are written to a ``tikz`` directory below ``base_path`` and are
    named by a short hash of the input text. A pdflatex format file holding
    the preamble is built on first use and stored in the same directory.
    """

    # Location of the pdf2svg executable
    # (https://github.com/jalios/pdf2svg-windows). Override on the class or
    # an instance when it is installed elsewhere.
    pdf2svg_exe = 'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg'

    # Full TeX preamble to generate a .fmt if needed
    _tex_template_full = r"""\documentclass[10pt, border=5mm]{standalone}
\usepackage{amsfonts}
\usepackage{url}
\usepackage{tikz}
\usepackage{color}
\usetikzlibrary{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}
\usetikzlibrary{automata,fit,snakes,intersections}
\usetikzlibrary{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}
\usetikzlibrary{decorations.fractals,decorations.footprints}
\usetikzlibrary{graphs,matrix,shapes.geometric}
\usetikzlibrary{mindmap,shadows,backgrounds,cd}
\dump
"""

    # Minimal template to embed user tikz; braces are doubled because the
    # string is passed through str.format. It has no documentclass: the
    # preamble comes from the format file.
    _tex_template = r"""
\newcommand{{\grtspacer}}{{\vphantom{{lp}}}}
\def\dfrac{{\displaystyle\frac}}
\def\dint{{\displaystyle\int}}
\begin{{document}}
{tikz_begin}{tikz_code}{tikz_end}
\end{{document}}
"""

    def __init__(self, txt, base_path='.', tex_engine='pdflatex'):
        """
        Store the text and set up the output locations.

        :param txt: text containing a tikzpicture or tikzcd environment.
        :param base_path: directory below which the ``tikz`` output
            directory is created.
        :param tex_engine: stored on the instance.
            NOTE(review): process_tikz always runs pdflatex and does not
            read this value - confirm whether other engines are intended.
        """
        self.txt = txt
        self.tex_engine = tex_engine
        self.base_path = Path(base_path).resolve()
        self.out_path = self.base_path / 'tikz'
        # parents=True so a not-yet-existing base_path does not fail
        self.out_path.mkdir(parents=True, exist_ok=True)
        self.file_path = self.out_path / txt_short_hash(txt)
        self.format_file = self.out_path / 'tikz_format.fmt'

    def split_tikz(self):
        """
        Split text to extract the TikZ picture.

        :return: list of pieces: leading text, begin tag, tikz code, end
            tag, trailing text, ... The tags are kept because the split
            pattern is a capturing group.
        """
        return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', self.txt)

    def ensure_format_file(self):
        """Create format file for faster compilation if missing."""
        if self.format_file.exists():
            return
        print('building format file...')
        tmp = self.out_path / 'tikz_format.tex'
        tmp.write_text(self._tex_template_full, encoding='utf-8')
        # run inside out_path so the .fmt is written next to the tex file
        self.run_command([
            'pdflatex',
            '-ini',
            f'-jobname={self.format_file.stem}',
            '&pdflatex',
            tmp.name,
        ], raise_on_error=True, cwd=self.out_path)
        # tmp.unlink()
        (self.out_path / f'{self.format_file.stem}.log').unlink(missing_ok=True)
        print('building format file...success', self.format_file.resolve())

    def process_tikz(self, verbose=False):
        """
        Compile TikZ to PDF and convert to SVG.

        :param verbose: print the commands being run and keep the
            intermediate tex/aux/log/pdf files.
        :raises ValueError: if the text holds no complete tikz environment.
        :raises RuntimeError: if the TeX run writes to stderr.
        """
        parts = self.split_tikz()
        if len(parts) < 4:
            raise ValueError(
                'No complete tikzpicture or tikzcd environment found in text.')
        tikz_begin, tikz_code, tikz_end = parts[1:4]
        tex_code = self._tex_template.format(
            tikz_begin=tikz_begin,
            tikz_code=tikz_code,
            tikz_end=tikz_end
        )

        tex_path = self.file_path.with_suffix('.tex')
        tex_path.write_text(tex_code, encoding='utf-8')
        pdf_path = tex_path.with_suffix('.pdf')
        svg_path = tex_path.with_suffix('.svg')

        self.ensure_format_file()
        tex_cmd = [
            'pdflatex',
            f'--fmt={self.format_file.stem}',
            f'--output-directory={str(tex_path.parent)}',
            str(tex_path)
        ]
        if verbose:
            print("Running:", " ".join(tex_cmd))
        # The format is referenced by bare name, so run from the directory
        # it was built in; otherwise it is only found when the caller's
        # working directory happens to be out_path.
        self.run_command(tex_cmd, cwd=self.out_path)
        # to recreate (run from the tikz directory)
        (tex_path.parent / 'make_tikz.bat').write_text(" ".join(tex_cmd), encoding='utf-8')

        svg_cmd = [
            self.pdf2svg_exe,
            str(pdf_path),
            str(svg_path)
        ]
        if verbose:
            print("Running:", " ".join(svg_cmd))
        # pdf2svg reports information on stderr, so do not raise on it
        self.run_command(svg_cmd, raise_on_error=False)

        if not verbose:
            for ext in ('.tex', '.aux', '.log', '.pdf'):
                path = tex_path.with_suffix(ext)
                if path.exists():
                    path.unlink()

    def display(self):
        """Display the SVG in Jupyter."""
        display(SVG(self.file_path.with_suffix('.svg')))

    @staticmethod
    def run_command(command, raise_on_error=True, cwd=None):
        """
        Run command with subprocess and show output.

        :param command: argument list passed to ``Popen``.
        :param raise_on_error: if True, any output on stderr raises
            ``RuntimeError``; if False it is printed instead.
        :param cwd: working directory for the child process.
        :return: the process exit code.
        :raises RuntimeError: when ``raise_on_error`` is True and stderr is
            not empty.
        """
        with Popen(command, cwd=cwd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            stdout, stderr = p.communicate()
            exit_code = p.returncode
        if stdout:
            # only the tail: TeX logs are long
            print(stdout.strip()[-250:])
        if stderr:
            if raise_on_error:
                raise RuntimeError(stderr.strip())
            else:
                print(stderr.strip())
        return exit_code
+3
View File
@@ -30,6 +30,9 @@ classifiers = [
include = ["greater_tables"]
exclude = ["img", "tests", "docs"]
[tool.setuptools.package-data]
"greater_tables" = ["data/*.csv", "data/*.md"]
[tool.setuptools.dynamic]
version = { attr = "greater_tables.__version__" }