Working draft of 3.0.0

Added robust testdf
2026-06-27 16:15:38 +08:00 · 2025-06-14 13:25:26 +01:00
parent ec3ac66552
commit 863ee43b43
12 changed files with 6361 additions and 5825 deletions
@@ -0,0 +1,119 @@
+"""
+Find and process blobs of TeX.
+
+Change target directory to find other blobs.
+"""
+
+from pathlib import Path
+import re
+import subprocess
+
+import pandas as pd
+
+
+class TeXMacros():
+    """
+    Expand a fixed table of def-style TeX macros inside a piece of text.
+
+    The table lives in ``_macros``: one macro per line, each written as the
+    word def, the macro name and a brace-delimited replacement. Macros that
+    take arguments are not supported.
+
+    made out of PublisherBase in blog_tools.py
+    from great2.blog
+    """
+
+    _macros = r"""
+\def\AA{\mathcal{A}}
+\def\atan{\mathrm{atan}}
+\def\AVaR{\mathsf{AVaR}}
+\def\bbeta{\mathbf{\beta}}
+\def\bb{\mathbf b}
+\def\bm{\mathbf }
+\def\biTVaR{\mathsf{biTVaR}}
+\def\corr{\mathsf{Corr}}
+\def\cov{\mathsf{cov}}
+\def\cp{\mathsf{CP}}
+\def\CTE{\mathsf{CTE}}
+\def\CVaR{\mathsf{CVaR}}
+\def\dint{\displaystyle\int}
+\def\dsum{\displaystyle\sum}
+\def\ecirc{\accentset{\circ} e}
+\def\EPD{\mathsf{EPD}}
+\def\ES{\mathsf{ES}}
+\def\E{\mathsf{E}}
+\def\FFF{\mathscr{F}}
+\def\FF{\mathcal{F}}
+\def\HH{\mathbf{H}}
+\def\kpx{{{}_kp_x}}
+\def\MM{\mathcal{M}}
+\def\NN{\mathbb{N}}
+\def\nudge{2}
+\def\norm{}
+\def\OO{\mathscr{O}}
+\def\PPP{\mathscr{P}}
+\def\PP{\mathsf{P}}
+\def\Pr{\mathsf{Pr}}
+\def\QQ{\mathsf{Q}}
+\def\RR{\mathbb{R}}
+\def\SD{\mathsf{SD}}
+\def\TCE{\mathsf{TCE}}
+\def\TVaR{\mathsf{TVaR}}
+\def\Var{\mathsf{Var}}
+\def\var{\mathsf{var}}
+\def\VaR{\mathsf{VaR}}
+\def\WCE{\mathsf{WCE}}
+\def\ww{\mathbf{w}}
+\def\XXX{\mathcal{X}}
+\def\xx{\mathbf{x}}
+\def\XX{\mathbf{X}}
+\def\yy{\mathbf{y}}
+\def\ZZZ{\mathcal{Z}}
+\def\ZZ{\mathbb{Z}}
+"""
+
+    @staticmethod
+    def process_tex_macros(text):
+        """
+        Expand standard general.tex macros in the text.
+
+        Only whole control words are replaced: a macro name immediately
+        followed by another letter is a different TeX command and is left
+        untouched.
+
+        :param text: text containing TeX
+        :return: text with every known macro replaced by its definition
+        """
+        m, regex = TeXMacros.tex_to_dict(TeXMacros._macros.strip())
+        if not regex:
+            return text
+        # a TeX control word runs up to the first non-letter; without this
+        # lookahead \var would rewrite \varepsilon and \norm would eat \normalsize
+        pattern = f'(?:{regex})(?![A-Za-z])'
+        return re.sub(pattern, lambda x: m[x[0]], text)
+
+    @staticmethod
+    def tex_to_dict(text):
+        """
+        Convert text, a series of def-style macros (one per line), into a dictionary.
+
+        Blank lines are ignored.
+
+        :param text: newline separated macro definitions
+        :return: tuple ``(m, regex)``; ``m`` maps macro name to replacement and
+            ``regex`` is an alternation of the escaped names, longest first
+        """
+        lines = (ln.strip() for ln in text.split('\n'))
+        m = dict(TeXMacros.tex_splitter(ln) for ln in lines if ln)
+        # longest names first, so the alternation never stops at a macro that
+        # is a prefix of a longer one, whatever order the table is written in
+        names = sorted(m, key=len, reverse=True)
+        regex = '|'.join(re.escape(k) for k in names)
+        return m, regex
+
+    @staticmethod
+    def tex_splitter(x):
+        """
+        Split a single def-style tex macro into its name and replacement.
+
+        :param x: one macro definition; surrounding whitespace is ignored
+        :return: tuple ``(name, replacement)``, name includes the backslash
+        :raises ValueError: if x has no brace-delimited replacement
+        """
+        x = x.strip()
+        if x.startswith('\\def'):
+            # drop only the leading keyword, never text inside the replacement
+            x = x[len('\\def'):]
+        i = x.find('{')
+        if i < 0 or not x.endswith('}'):
+            raise ValueError(f'not a def-style macro: {x!r}')
+        return x[:i], x[i + 1:-1]
+
+def find_tex_snippeets(in_dir='\\S\\TELOS\\PIR\\docs',
+                       out_file='tex_list.csv'):
+    """
+    Ripgrep / TeX macro expand list of TeX snippets.
+
+    Runs ``rg`` over the ``*.md`` files below ``in_dir`` looking for inline
+    ``$...$`` expressions, expands the TeXMacros table in the matches, drops
+    duplicates, empty lines and anything containing ``\\PP`` or ``$$``, and
+    returns what is left as a one-column DataFrame.
+
+    The (misspelled) function name is kept as is so existing callers work.
+
+    Args:
+        in_dir: directory handed to ripgrep.
+        out_file: CSV file name, written next to this module; a falsy value
+            skips writing.
+
+    Returns:
+        DataFrame with a single column ``expr``, sorted so that repeated runs
+        over the same input give the same file.
+
+    Raises:
+        subprocess.CalledProcessError: ripgrep reported a real error.
+        FileNotFoundError: the ``rg`` executable is not on the path.
+    """
+    result = subprocess.run(
+        ['rg', '-N', '-o', '--no-filename', '-g', '*.md', r'\$.+?\$', in_dir],
+        capture_output=True,
+        text=True,
+        # the CSV is written as UTF-8, so decode the same way instead of
+        # relying on the locale codec
+        encoding='utf-8',
+        check=False,
+    )
+    # rg exits with 1 when nothing matched: a valid, empty answer. Any other
+    # non-zero status is a genuine failure.
+    if result.returncode not in (0, 1):
+        raise subprocess.CalledProcessError(
+            result.returncode, result.args,
+            output=result.stdout, stderr=result.stderr)
+    txt = TeXMacros.process_tex_macros(result.stdout)
+    # sorted: set iteration order changes from run to run
+    stext = sorted(i for i in set(txt.split('\n'))
+                   if i and '\\PP' not in i and '$$' not in i)
+    df = pd.DataFrame({'expr': stext})
+    if out_file:
+        p = Path(__file__).parent / out_file
+        print(p)
+        df.to_csv(p, encoding='utf-8')
+    return df
@@ -62,11 +62,11 @@ class GTConfigModel(BaseModel):
    table_vrule_width: int = Field(
        1, description="Width of vertical rule separating index from body"
    )
-    hrule_widths: Optional[tuple[int, int, int]] = Field(
-        (0, 0, 0), description="Tuple of three ints for horizontal rule widths (for multiindex use)"
+    hrule_widths: Optional[tuple[float, float, float]] = Field(
+        (0, 0, 0), description="Tuple of three floats for horizontal rule widths (for multiindex use)"
    )
-    vrule_widths: Optional[tuple[int, int, int]] = Field(
-        (0, 0, 0), description="Tuple of three ints for vertical rule widths (for multiindex columns)"
+    vrule_widths: Optional[tuple[float, float, float]] = Field(
+        (0, 0, 0), description="Tuple of three floats for vertical rule widths (for multiindex columns)"
    )

    sparsify: bool = Field(
@@ -261,7 +261,11 @@ class GT(object):
            base_config = GTConfigModel()

        # access through config
-        self.config = base_config.model_copy(update=overrides)
+        # update and validate; need to merge to avoid repeated args
+        merged = dict(base_config.model_dump(), **overrides)
+        self.config = GTConfigModel(**merged)
+        # no validation
+        # self.config = base_config.model_copy(update=overrides)

        # deal with alternative input modes for df: None, DataFrame, Series, markdown text table
        if df is None:
@@ -26,3 +26,11 @@ def df_short_hash(df, length=12):
    hash_str = base64.b32encode(hash_bytes).decode("utf-8").rstrip("=")  # Trim padding

    return f"T{hash_str[:length]}"  # Prefix with 'T' to ensure a valid ID
+
+
+def txt_short_hash(txt):
+    """Return a 13 character base32 id computed from the MD5 digest of ``txt``."""
+    digest = hashlib.md5(txt.encode('utf-8')).digest()
+    # 16 digest bytes give 26 base32 characters once the '=' padding is removed
+    encoded = base64.b32encode(digest).decode("utf-8").rstrip("=")
+    # keep every other character, starting with the first
+    return encoded[::2]
@@ -5,7 +5,8 @@ GPT from SJMM design.
 """

 from datetime import datetime, timedelta
-from itertools import cycle
+from importlib.resources import files
+from itertools import cycle, chain
 from math import prod
 from pathlib import Path
 from typing import Optional, Union
@@ -81,21 +82,39 @@ class TestDataFrameFactory:
        self._index_namer = cycle(nwl)

        # read words and create cycler
-        p = Path(__file__).parent / 'words-12.md'
-        assert p.exists()
-        txt = p.read_text(encoding='utf-8')
+        data_path = files('greater_tables').joinpath('data', 'words-12.md')
+        with data_path.open('r', encoding='utf-8') as f:
+            txt = f.read()
        word_list = txt.split('\n')
        temp = word_list[:]
        random.shuffle(temp)
        self._word_gen = cycle(temp)

        # read tex expressions and create cycler
-        tex_list = pd.read_csv(Path(__file__).parent /
-                               'tex_list.csv')['expr'].to_list()
+        data_path = files('greater_tables').joinpath('data', 'tex_list.csv')
+        with data_path.open('r', encoding='utf-8') as f:
+            tex_list = pd.read_csv(f, index_col=0)['expr'].to_list()
+        # trim down slightly
        tex_list = [i for i in tex_list if len(i) < 50]
        random.shuffle(tex_list)
        self._tex_gen = cycle(tex_list)

+        self.simple_namer = {
+            'd': 'date',
+            'f': 'float',
+            'h': 'hash',
+            'i': 'integer',
+            'l': 'large_float',
+            'm': 'yr-mo',
+            'p': 'path',
+            'r': 'ratio',
+            's': 'string',
+            't': 'time',
+            'v': 'extreme_float',
+            'x': 'tex',
+            'y': 'year',
+        }
+
        # lengths of index (word count) sampled from:
        self.index_value_lengths = [1]*10 + [2] * 4 + [3]

@@ -113,13 +132,14 @@ class TestDataFrameFactory:
            l   log float (greater range than float)
            m   year - month
            p   path (filename)
+            r   ratio (smaller floats, for percents)
            sx  string length x
            t   time
+            v   very large range float
            x   tex text - an equation
            y   year

-
-        Args:
+            Args:
            rows: Number of rows.
            columns: Column type spec (int for all float cols, or string type codes).
            index: Index level types (int for RangeIndex or string like 'ti').
@@ -168,7 +188,7 @@ class TestDataFrameFactory:
            ['d', 'f', 'i', 's3', 'l', 'h', 't', 'p'], size=self.rng.integers(3, 7))
        missing = round(float(self.rng.uniform(0, 0.15)), 2)
        index = ''.join(self.rng.choice(
-            ['t', 'd', 'i', 's2'], size=index_levels))
+            ['t', 'd', 'y', 'i', 's2'], size=index_levels))
        col_index = ''.join(self.rng.choice(
            ['s', 's2', 's2', 's3'], size=column_levels))
        return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)
@@ -182,18 +202,22 @@ class TestDataFrameFactory:
        else:
            col_types = self._parse_colspec(columns)
        # if col_index is an int then use all strings of that depth
-        if isinstance(col_index, int):
-            col_index_types = ['s'] * col_index
+        if col_index == 'simple':
+            col_idx = map(self.simple_namer.get, [i[0] for i in col_types])
+            col_idx = pd.Index(col_idx, name='simple')
        else:
-            col_index_types = self._parse_colspec(col_index)
+            if isinstance(col_index, int):
+                col_index_types = ['s'] * col_index
+            else:
+                col_index_types = self._parse_colspec(col_index)
+            col_idx = self._make_index(col_index_types, len(col_types))
        if isinstance(index, int):
            index = ['s'] * index
        else:
            index = self._parse_colspec(index)
-            print(index)
+            # print(index)
        # col names are a transposed index.
        df = pd.DataFrame(index=range(rows))
-        col_idx = self._make_index(col_index_types, len(col_types))
        for dt, c in zip(col_types, range(len(col_idx))):
            df[c] = self._generate_column(dt, rows)
        df.columns = col_idx
@@ -210,15 +234,23 @@ class TestDataFrameFactory:
            return pd.Series([" ".join(self.word() for i in range(max_words)) for j in range(n)])
        if dtype == 'f':
            return pd.Series(self.rng.normal(loc=100000, scale=250000, size=n))
+        if dtype == 'r':
+            return pd.Series(self.rng.normal(loc=0.5, scale=0.35, size=n))
        if dtype == 'l':
            # log float (greater range)
            return pd.Series(np.exp(self.rng.normal(loc=-4 / 2 + 4, scale=4, size=n)))
+        if dtype == 'v':
+            # log float (greater range)
+            sc = 5
+            return pd.Series(np.exp(self.rng.normal(loc=-sc**2 / 2 + 10, scale=sc, size=n)))
        if dtype == 'i':
            return pd.Series(self.rng.integers(-1e4, 1e6, size=n), dtype='int64')
        if dtype == 'd':
            start_date = TestDataFrameFactory.random_date_within_last_n_years(
                10)
            return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
+        if dtype == 'y':
+            return pd.Series(random.sample(range(1990, 2031), n))
        if dtype == 't':
            start_dt = datetime.now() - timedelta(days=365 * 2)
            return pd.Series([
@@ -347,7 +379,11 @@ class TestDataFrameFactory:
             for w, k in zip(level_value_lengths, level_choices)]
        x = [[next(j) for j in r] for i in range(rows)]
        names = random.sample(name_word_list, levels)
-        idx = pd.MultiIndex.from_tuples(
-            random.sample(x, rows), names=names).sort_values()
+        if levels == 1:
+            idx = pd.Index(
+                list(chain.from_iterable(random.sample(x, rows))), name=names[0]).sort_values()
+        else:
+            idx = pd.MultiIndex.from_tuples(
+                random.sample(x, rows), names=names).sort_values()
        assert idx.is_unique
        return idx
@@ -0,0 +1,188 @@
+"""
+Create and display svg files from tikz tex tables.
+
+Good for testing.
+
+From great2.blog
+"""
+
+from datetime import datetime
+import pandas as pd
+from pathlib import Path
+import re
+import yaml
+from itertools import count
+from subprocess import Popen, PIPE
+from IPython.display import display, Markdown, SVG
+
+from . hasher import txt_short_hash
+
+
+class TikzProcessor():
+    """
+    Compile the first TikZ picture found in a block of TeX into a pdf and an svg.
+
+    Files are written to a ``tikz`` directory below ``base_path`` and are named
+    by a short hash of the input text.
+    """
+
+    # complete stand-alone document; a str.format template, hence doubled braces
+    _tex_template_full = """\\documentclass[10pt, border=5mm]{{standalone}}
+
+% needs lualatex - uncomment for Wiley fonts
+%\\usepackage{{fontspec}}
+%\\setmainfont{{Stix Two Text}}
+%\\usepackage{{unicode-math}}
+%\\setmathfont{{Stix Two Math}}
+
+\\usepackage{{amsfonts}}
+\\usepackage{{url}}
+\\usepackage{{tikz}}
+\\usepackage{{color}}
+\\usetikzlibrary{{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}}
+\\usetikzlibrary{{automata}}
+\\usetikzlibrary{{fit}}
+\\usetikzlibrary{{snakes}}
+\\usetikzlibrary{{intersections}}
+\\usetikzlibrary{{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}}
+\\usetikzlibrary{{decorations.fractals,decorations.footprints}}
+\\usetikzlibrary{{graphs}}
+\\usetikzlibrary{{matrix}}
+\\usetikzlibrary{{shapes.geometric}}
+\\usetikzlibrary{{mindmap, shadows}}
+\\usetikzlibrary{{backgrounds}}
+\\usetikzlibrary{{cd}}
+
+% really common macros
+\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
+
+\\def\\dfrac{{\\displaystyle\\frac}}
+\\def\\dint{{\\displaystyle\\int}}
+
+\\begin{{document}}
+
+{tikz_begin}{tikz_code}{tikz_end}
+
+\\end{{document}}
+"""
+    # --------------------------------------------
+    # document body only: there is no preamble here, so it is usable only with
+    # the precompiled format passed to pdflatex
+    _tex_template = """
+% really common macros
+\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
+
+\\def\\dfrac{{\\displaystyle\\frac}}
+\\def\\dint{{\\displaystyle\\int}}
+
+\\begin{{document}}
+
+{tikz_begin}{tikz_code}{tikz_end}
+
+\\end{{document}}
+"""
+
+    def split_tikz(self):
+        """
+        Split text to get the tikzpicture. Format is
+
+        initial text, then groups of four:
+
+        1. begin tag ``(1::4)``
+        2. tikz code ``(2::4)``
+        3. end tag   ``(3::4)``
+        4. non-related text ``(4::4)``
+
+        :return: list of strings; the begin and end tags are kept as items
+        """
+        return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', self.txt)
+
+    def __init__(self, txt, base_path='.', tex_engine='pdflatex'):
+        """
+        TikzProcessor (from TikzConverter): process a tex tikz text string into svg.
+        The program
+
+        * creates a pdf and svg from the tikz blob
+
+        lualatex is more robust, but slower...
+        pdflatex can't handle the fancy wiley fonts
+
+        :param txt: TeX containing a tikzpicture or tikzcd environment
+        :param base_path: directory under which the ``tikz`` output directory is created
+        :param tex_engine: ``'pdflatex'`` uses the precompiled format; any other
+            value runs lualatex on the full stand-alone document
+        """
+        self.txt = txt
+        self.tex_engine = tex_engine
+        # directory for TeX and images
+        self.base_path = Path(base_path).resolve()
+        self.out_path = self.base_path / 'tikz'
+        self.out_path.mkdir(parents=True, exist_ok=True)
+        self.file_path = self.out_path / txt_short_hash(txt)
+
+    def process_tikz(self, verbose=False):
+        """
+        Process the tikz into pdf and svg
+
+        :param verbose: print the commands and keep the intermediate files
+        :raises ValueError: txt has no complete tikz environment, or the TeX
+            run wrote to stderr
+        :raises FileNotFoundError: pdflatex engine and no ``tikz_format.fmt``
+            in the current directory
+        """
+        parts = self.split_tikz()
+        if len(parts) < 4:
+            raise ValueError(
+                'no complete tikzpicture or tikzcd environment found in txt')
+        tikz_begin, tikz_code, tikz_end = parts[1:4]
+
+        svg_path = self.file_path.with_suffix('.svg')
+        tex_path = self.file_path.with_suffix('.tex')
+        pdf_file = tex_path.with_suffix('.pdf')
+
+        # only the pdflatex route loads a format holding the preamble; every
+        # other engine must be given the full stand-alone document
+        use_format = self.tex_engine == 'pdflatex'
+        template = self._tex_template if use_format else self._tex_template_full
+        tex_code = template.format(
+            tikz_begin=tikz_begin, tikz_code=tikz_code, tikz_end=tikz_end)
+        tex_path.write_text(tex_code, encoding='utf-8')
+        print(
+            f'TIKZ: created temp file = {tex_path.name}')
+        print('TIKZ: Update pdf file')
+        if use_format:
+            # faster with template
+            # TODO EVID hard coded template
+            template_path = Path('tikz_format.fmt')
+            if not template_path.exists():
+                raise FileNotFoundError(
+                    f'format file {template_path} not found in {Path.cwd()}')
+            command = ['pdflatex', f'--fmt={template_path}',
+                       f'--output-directory={str(tex_path.parent.resolve())}',
+                       str(tex_path.resolve())]
+        else:
+            # for STIX fonts, no template
+            command = ['lualatex',
+                       f'--output-directory={str(tex_path.parent.resolve())}',
+                       str(tex_path.resolve())]
+        if verbose:
+            print(f'TIKZ: TeX Command={" ".join(command)}')
+        TikzProcessor.run_command(command)
+        # to recreate
+        (tex_path.parent /
+         'make_tikz.bat').write_text(" ".join(command), encoding='utf-8')
+        if verbose:
+            print(
+                'TIKZ: Creating svg file for Tikz (using new pdf2svg util)')
+        # https://github.com/jalios/pdf2svg-windows
+        command = [
+            'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg',
+            str(pdf_file.resolve()), str(svg_path.resolve())]
+        # seems to return info on stderr?
+        if verbose:
+            print(f'PDF->SVG: {" ".join(command)}')
+        TikzProcessor.run_command(command, flag=False)
+        if not verbose:
+            # tidy up; a failed run may not have produced every file
+            for ext in ('.tex', '.aux', '.log', '.pdf'):
+                tex_path.with_suffix(ext).unlink(missing_ok=True)
+
+    @staticmethod
+    def run_command(command, flag=True):
+        """
+        Run a command and show results. Allows for weird xx behavior
+
+        Prints the last 250 characters of stdout, if any.
+
+        :param command: argument list passed to Popen
+        :param flag: if True, anything written to stderr raises ValueError;
+            if False stderr is printed instead
+        :return: exit code of the process
+        """
+        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
+            # communicate drains both pipes together; reading stdout to the
+            # end before touching stderr can block once the stderr pipe fills
+            line1, line2 = p.communicate()
+        exit_code = p.returncode
+        if line1:
+            print('\n' + line1[-250:])
+        if line2:
+            if flag:
+                raise ValueError(line2)
+            else:
+                print(line2)
+        return exit_code
+
+    def display(self):
+        """display in Jupyter Lab."""
+        display(SVG(self.file_path.with_suffix('.svg')))
@@ -0,0 +1,133 @@
+"""
+Create and display SVG files from TikZ pictures embedded in LaTeX.
+
+Good for testing. Outputs are cached by hash. PDF→SVG uses pdf2svg.
+
+GPT re-write of my old great2.blog code.
+"""
+
+import re
+from pathlib import Path
+from subprocess import run, Popen, PIPE
+from IPython.display import SVG, display
+
+from .hasher import txt_short_hash
+
+
+class TikzProcessor:
+    """
+    Compile the first TikZ picture found in a block of TeX to PDF, then SVG.
+
+    Files are written to a ``tikz`` directory below ``base_path`` and are named
+    by a short hash of the input text. With the pdflatex engine a precompiled
+    format file is built on first use and kept in the same directory.
+    """
+
+    # Full TeX preamble to generate a .fmt if needed
+    _tex_template_full = r"""\documentclass[10pt, border=5mm]{standalone}
+\usepackage{amsfonts}
+\usepackage{url}
+\usepackage{tikz}
+\usepackage{color}
+\usetikzlibrary{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}
+\usetikzlibrary{automata,fit,snakes,intersections}
+\usetikzlibrary{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}
+\usetikzlibrary{decorations.fractals,decorations.footprints}
+\usetikzlibrary{graphs,matrix,shapes.geometric}
+\usetikzlibrary{mindmap,shadows,backgrounds,cd}
+\dump
+"""
+
+    # Minimal template to embed user tikz (str.format template: doubled braces)
+    _tex_template = r"""
+\newcommand{{\grtspacer}}{{\vphantom{{lp}}}}
+\def\dfrac{{\displaystyle\frac}}
+\def\dint{{\displaystyle\int}}
+\begin{{document}}
+{tikz_begin}{tikz_code}{tikz_end}
+\end{{document}}
+"""
+
+
+    def __init__(self, txt, base_path='.', tex_engine='pdflatex'):
+        """
+        Store the input and create the output directory.
+
+        Args:
+            txt: TeX containing a tikzpicture or tikzcd environment.
+            base_path: directory under which ``tikz`` is created.
+            tex_engine: ``'pdflatex'`` compiles against the precompiled
+                format; any other value is run as the TeX executable on a
+                full document.
+        """
+        self.txt = txt
+        self.tex_engine = tex_engine
+        self.base_path = Path(base_path).resolve()
+        self.out_path = self.base_path / 'tikz'
+        self.out_path.mkdir(parents=True, exist_ok=True)
+        self.file_path = self.out_path / txt_short_hash(txt)
+        self.format_file = self.out_path / 'tikz_format.fmt'
+
+    def split_tikz(self):
+        """
+        Split text to extract the TikZ picture.
+
+        Returns:
+            List of strings: leading text, then begin tag, TikZ code, end tag
+            and trailing text for each environment found.
+        """
+        return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', self.txt)
+
+    def ensure_format_file(self):
+        """Create format file for faster compilation if missing."""
+        if self.format_file.exists():
+            return
+        print('building format file...')
+        tmp = self.out_path / 'tikz_format.tex'
+        tmp.write_text(self._tex_template_full, encoding='utf-8')
+        # run inside out_path so the .fmt is created next to the other outputs
+        self.run_command([
+            'pdflatex',
+            '-ini',
+            f'-jobname={self.format_file.stem}',
+            '&pdflatex',
+            tmp.name,
+            ], raise_on_error=True, cwd=self.out_path)
+        # tikz_format.tex is left on disk; only the log is removed
+        (self.out_path / f'{self.format_file.stem}.log').unlink(missing_ok=True)
+        print('building format file...success', self.format_file.resolve())
+
+    def process_tikz(self, verbose=False):
+        """
+        Compile TikZ to PDF and convert to SVG.
+
+        Args:
+            verbose: print the commands and keep the intermediate files.
+
+        Raises:
+            ValueError: txt has no complete tikz environment.
+            RuntimeError: the TeX run wrote to stderr or exited non-zero.
+        """
+        parts = self.split_tikz()
+        if len(parts) < 4:
+            raise ValueError(
+                'no complete tikzpicture or tikzcd environment found in txt')
+        tikz_begin, tikz_code, tikz_end = parts[1:4]
+        tex_code = self._tex_template.format(
+            tikz_begin=tikz_begin,
+            tikz_code=tikz_code,
+            tikz_end=tikz_end
+        )
+
+        use_format = self.tex_engine == 'pdflatex'
+        if not use_format:
+            # no precompiled format for other engines: prepend the preamble,
+            # minus the \dump that only belongs in a format build
+            tex_code = self._tex_template_full.replace('\\dump\n', '') + tex_code
+
+        tex_path = self.file_path.with_suffix('.tex')
+        tex_path.write_text(tex_code, encoding='utf-8')
+        pdf_path = tex_path.with_suffix('.pdf')
+        svg_path = tex_path.with_suffix('.svg')
+
+        if use_format:
+            self.ensure_format_file()
+            tex_cmd = [
+                'pdflatex',
+                f'--fmt={self.format_file.stem}',
+                f'--output-directory={str(tex_path.parent)}',
+                str(tex_path)
+            ]
+        else:
+            tex_cmd = [
+                self.tex_engine,
+                f'--output-directory={str(tex_path.parent)}',
+                str(tex_path)
+            ]
+        if verbose:
+            print("Running:", " ".join(tex_cmd))
+        # the format is named without a directory, so TeX has to be started
+        # where the .fmt file lives
+        self.run_command(tex_cmd, cwd=self.out_path)
+
+        (tex_path.parent / 'make_tikz.bat').write_text(" ".join(tex_cmd), encoding='utf-8')
+
+        svg_cmd = [
+            'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg',
+            str(pdf_path),
+            str(svg_path)
+        ]
+        if verbose:
+            print("Running:", " ".join(svg_cmd))
+        # pdf2svg output on stderr is only printed, never raised
+        self.run_command(svg_cmd, raise_on_error=False)
+
+        if not verbose:
+            for ext in ('.tex', '.aux', '.log', '.pdf'):
+                tex_path.with_suffix(ext).unlink(missing_ok=True)
+
+    def display(self):
+        """Display the SVG in Jupyter."""
+        display(SVG(self.file_path.with_suffix('.svg')))
+
+    @staticmethod
+    def run_command(command, raise_on_error=True, cwd=None):
+        """
+        Run command with subprocess and show output.
+
+        Prints the last 250 characters of stdout, if any.
+
+        Args:
+            command: argument list passed to Popen.
+            raise_on_error: raise when the command writes to stderr or exits
+                with a non-zero status; when False stderr is printed instead.
+            cwd: working directory for the command.
+
+        Returns:
+            The exit status of the command.
+
+        Raises:
+            RuntimeError: the command failed and raise_on_error is True.
+        """
+        with Popen(command, cwd=cwd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
+            stdout, stderr = p.communicate()
+        if stdout:
+            print(stdout.strip()[-250:])
+        # TeX reports its errors on stdout, so the exit status has to be
+        # checked as well as stderr
+        if raise_on_error and (stderr or p.returncode != 0):
+            raise RuntimeError(
+                stderr.strip()
+                or f'{command[0]} exited with status {p.returncode}')
+        if stderr:
+            print(stderr.strip())
+        return p.returncode
@@ -30,6 +30,9 @@ classifiers = [
 include = ["greater_tables"]
 exclude = ["img", "tests", "docs"]

+[tool.setuptools.package-data]
+"greater_tables" = ["data/*.csv", "data/*.md"]
+
 [tool.setuptools.dynamic]
 version  = { attr = "greater_tables.__version__" }