start on v3.0

2026-06-27 17:30:44 +08:00 · 2025-06-12 23:20:16 +01:00
parent acff8a038e
commit c54ab11cf5
16 changed files with 4737 additions and 133 deletions
@@ -1,3 +1,5 @@
+
+
 Here's how you subclass.


@@ -1,3 +1,121 @@
+TODO: consolidate the various READMEs; this is the main one.
+
+# v 3.0 update
+
+* config files
+* unified col width and info dataframe
+* de-texing
+* CLI for config and for writing out a CSV, etc.
+* tests
+
+# TODO
+
+* Ratio cols with multi index columns
+* % in tex output - never allow comments?
+* center / left / right table output -> CSS
+* ?Option to hide index
+* Bring over the roll your own logger
+
+***
+
+# from GPT
+
+
+Absolutely—here’s a structured summary of everything we’ve covered, organized by topic.
+
+---
+
+## 🧱 Project Structure & Philosophy
+
+* Your `GreaterTables` class formats a `pandas.DataFrame` to **HTML, text, or LaTeX**.
+* The class is **immutable**: formatting is fixed at construction time, like a pure value object.
+* You avoid branchy, incremental APIs (like `ggplot`) and prefer creating fresh objects.
+* You wanted a way to handle growing config complexity, which led to a **YAML config + Pydantic schema** design.
+
+---
+
+## 📁 Project Layout
+
+```
+greater_tables_project/
+├── greater_tables/
+│   ├── __init__.py
+│   ├── gtconfig.py          ← config model + loader
+│   ├── gtcore.py            ← GreaterTables class
+│   └── defaults/
+│       └── config_template.yaml
+├── tests/
+├── pyproject.toml
+```
+
+* `GTConfigModel` = schema + default source of truth
+* `GTConfig` = singleton loader and validator
+* `config_template.yaml` = editable fallback + documentation base
+
+---
+
+## 🔧 Config Management
+
+* All defaults and types are declared in `GTConfigModel` (Pydantic).
+* Config is **loaded from YAML**, validated by `GTConfigModel`.
+* You can **generate** a valid config file from the model using `.model_dump() → YAML`.
+* Singleton pattern (`GTConfig.__new__`) caches the config at runtime.
+
+### Helpers
+
+* `GTConfig().get(overrides=...)` gives a safe, override-able config
+* `write_template(path)` writes a default config YAML for user to edit
+
+---
+
+## 🛠 Git Workflow (Solo Dev, Linear)
+
+* Use **tags** (`git tag v0.2.0`) to label stable versions
+* Use **`git reset --hard <tag>`** to roll back and discard later commits
+* Avoid branches entirely—keep a **single linear history**
+* Tags let you bounce around safely, with names instead of hashes
+* Releases on GitHub are tags + metadata, optional for publishing
+
+---
+
+## ⚙️ CLI Tool
+
+* Built with `click`, with subcommands:
+
+  * `gt render data.csv --format html`
+  * `gt write-template`
+* Reads any Pandas-supported file (`.csv`, `.feather`, `.pkl`, etc.)
+* Outputs to console or to file
+* Uses current config by default, or override with `--config path.yaml`
+
+---
+
+## 🧠 Design Principles You’re Following
+
+| Principle                    | Your Approach                               |
+| ---------------------------- | ------------------------------------------- |
+| Immutability                 | `GT(df, config)` is fixed once created      |
+| Separation of concerns       | `GTConfigModel` holds defaults/types        |
+| Config as code/documentation | `config_template.yaml` generated from model |
+| CLI-first mindset            | `click` used to expose functionality        |
+| Linear Git workflow          | Tags for rollback, no branches              |
+
+---
+
+Let me know if you want me to generate:
+
+* a Markdown doc for contributors
+* a `.bat` script to roll back to a tag
+* test scaffolding or release automation
+
+You're in great shape. Gum-level perfection achieved.
+
+
+
+***
+
+# OLD
+
 # Greater Tables

 Creating presentation quality tables from pandas dataframes is frustrating. It is hard to left-align text and right-align numbers using pandas `display` or `df.to_html`. The  `great_tables` package does a really nice job with pandas and polars dataframes but does not support indexes or TeX output. 
@@ -1,41 +0,0 @@
-REM USE doc-test instead!!
-echo use doc test...
-rem @echo off
-rem setlocal
-
-rem :: Define paths
-rem set REPO_URL=https://github.com/mynl/greater_tables_project
-rem set BUILD_DIR=C:\tmp\greater_tables_docs
-rem set VENV_DIR=%BUILD_DIR%\venv
-
-rem :: Remove existing directory if it exists
-rem if exist "%BUILD_DIR%" rd /s /q "%BUILD_DIR%"
-
-rem :: Clone the latest development repo
-rem git clone --depth 1 %REPO_URL% "%BUILD_DIR%"
-rem if %errorlevel% neq 0 exit /b %errorlevel%
-
-rem pushd "%BUILD_DIR%"
-
-rem :: Create virtual environment
-rem python -m venv "%VENV_DIR%"
-rem if %errorlevel% neq 0 exit /b %errorlevel%
-
-rem :: Activate virtual environment
-rem call "%VENV_DIR%\Scripts\activate"
-
-rem :: Upgrade pip and install dependencies from pyproject.toml
-rem python -m pip install --upgrade pip
-rem pip install --upgrade build setuptools
-rem pip install .
-rem pip install ".[doc]"  || pip install sphinx  # Ensure Sphinx is installed
-
-rem :: Build the documentation
-rem sphinx-build -b html docs docs/_build/html
-rem if %errorlevel% neq 0 exit /b %errorlevel%
-
-rem :: Deactivate virtual environment
-rem deactivate
-
-rem echo Documentation build complete: %BUILD_DIR%\docs\_build\html
-rem endlocal
@@ -1,6 +0,0 @@
-# TODO
-
-* Ratio cols with multi index columns 
-* % in tex output - never allow comments? 
-* center / left / right table output -> CSS
-* ?Option to hide index
@@ -1,6 +1,7 @@
-__version__ = '2.0.0'
+__version__ = '3.0.0'
 __project__ = 'greater_tables'
 __author__ = 'Stephen J Mildenhall'

-from . greater_tables import *
-from . utilities import *
+from . gtcore import GT
+# from . gtbreaks import Breakability
+
@@ -0,0 +1,50 @@
+import click
+import pandas as pd
+from pathlib import Path
+from .gtconfig import GTConfig, write_template
+from .gtcore import GreaterTables
+
+@click.group()
+def cli():
+    """Greater Tables CLI tool"""
+    pass
+
+@cli.command()
+@click.argument("input_file", type=click.Path(exists=True))
+@click.option("--output", "-o", type=click.Path(), help="Write rendered output to file")
+@click.option("--format", "-f", type=click.Choice(["html", "text", "latex"]), default="html")
+@click.option("--config", type=click.Path(), help="Path to a YAML config file")
+def render(input_file, output, format, config):
+    """Render a table from a data file."""
+    path = Path(input_file)
+    ext = path.suffix.lower()
+
+    if ext == ".csv":
+        df = pd.read_csv(path)
+    elif ext == ".feather":
+        df = pd.read_feather(path)
+    elif ext == ".pkl":
+        df = pd.read_pickle(path)
+    else:
+        raise click.UsageError(f"Unsupported extension: {ext}")
+
+    cfg = GTConfig(Path(config) if config else None).get()
+    gt = GreaterTables(df, config=cfg)
+
+    rendered = (
+        gt.render_html() if format == "html"
+        else gt.render_text() if format == "text"
+        else gt.render_latex()
+    )
+
+    if output:
+        Path(output).write_text(rendered, encoding="utf-8")
+    else:
+        print(rendered)
+
+@cli.command()
+@click.argument("path", type=click.Path(), default="config.yaml")
+def write_template(path):
+    """Write default config to the given path."""
+    write_template(Path(path))
+    click.echo(f"Config written to {path}")
@@ -0,0 +1,171 @@
+"""
+Configuration model and utilities for GreaterTables.
+
+Defines the `GTConfigModel` schema using Pydantic, which acts as the single
+source of truth for default values, validation, and structure of all table-rendering options.
+
+Also includes functions for writing editable config templates and loading from YAML.
+"""
+
+
+from pathlib import Path
+from typing import Optional, Union, Literal
+import yaml
+
+from pydantic import BaseModel, Field, ValidationError, ConfigDict
+import yaml
+
+
+class GTConfigModel(BaseModel):
+    """
+    Configuration model for GreaterTables.
+
+    This class defines all configurable options for controlling the formatting
+    and rendering of tables in HTML, text, and LaTeX outputs.
+
+    Each field has a default value and is validated using Pydantic. You can load
+    configuration from a YAML file or create it programmatically. Use this model
+    as the authoritative source of valid configuration fields.
+
+    Instances are frozen, so fields cannot be reassigned after construction.
+
+    :Usage:
+
+        >>> from greater_tables.gtconfig import GTConfigModel
+        >>> cfg = GTConfigModel(font_body=1.2, caption_align="left")
+
+    :see also: ``GTConfig`` for loading from YAML with overrides.
+              ``gt write-template`` CLI command to generate a default config file.
+    """
+    # immutable
+    model_config = ConfigDict(frozen=True)
+
+    # --- default format strings, one per value type ---
+    default_integer_str: str = Field(
+        "{x:,d}", description="Format f-string for integers. Example: '{x:,d}'"
+    )
+    default_float_str: str = Field(
+        "{x:,.3f}", description="Format f-string for floats. Example: '{x:,.3f}'"
+    )
+    default_date_str: str = Field(
+        "%Y-%m-%d", description="Format string for dates (no braces or 'x'). Example: '%Y-%m-%d'"
+    )
+    default_ratio_str: str = Field(
+        "{x:.1%}", description="Format f-string for ratios. Example: '{x:.1%}'"
+    )
+    default_formatter: Optional[str] = Field(
+        None, description="Optional fallback formatter f-string"
+    )
+
+    # --- table-wide float format and rule widths ---
+    table_float_format: Optional[str] = Field(
+        None, description="Float format string for the entire table; overrides column-specific formats"
+    )
+    table_hrule_width: int = Field(
+        1, description="Width of top, bottom, and header horizontal rules"
+    )
+    table_vrule_width: int = Field(
+        1, description="Width of vertical rule separating index from body"
+    )
+    hrule_widths: Optional[tuple[int, int, int]] = Field(
+        (0, 0, 0), description="Tuple of three ints for horizontal rule widths (for multiindex use)"
+    )
+    vrule_widths: Optional[tuple[int, int, int]] = Field(
+        (0, 0, 0), description="Tuple of three ints for vertical rule widths (for multiindex columns)"
+    )
+
+    # --- sparsification of index and column headers ---
+    sparsify: bool = Field(
+        True, description="If True, sparsify index columns (recommended)"
+    )
+    sparsify_columns: bool = Field(
+        True, description="If True, sparsify column headers using colspans"
+    )
+
+    # --- cell padding ---
+    spacing: str = Field(
+        "medium", description="Shorthand for cell padding. One of: 'tight', 'medium', 'wide'"
+    )
+    padding_trbl: Optional[tuple[int, int, int, int]] = Field(
+        None, description="Manual padding in the order (top, right, bottom, left)"
+    )
+
+    # --- scaling and fonts ---
+    tikz_scale: float = Field(
+        1.0, description="Scaling factor applied to LaTeX TikZ tables"
+    )
+    font_body: float = Field(
+        0.9, description="Font size for body text (in em units)"
+    )
+    font_head: float = Field(
+        1.0, description="Font size for header text (in em units)"
+    )
+    font_caption: float = Field(
+        1.1, description="Font size for caption text (in em units)"
+    )
+    font_bold_index: bool = Field(
+        False, description="If True, make index columns bold"
+    )
+
+    # --- engineering-format ("pef") thresholds ---
+    pef_precision: int = Field(
+        3, description="Precision for engineering format (digits after decimal)"
+    )
+    pef_lower: int = Field(
+        -3, description="Lower threshold: apply engineering format if abs(x) < 10**pef_lower"
+    )
+    pef_upper: int = Field(
+        6, description="Upper threshold: apply engineering format if abs(x) > 10**pef_upper"
+    )
+
+    # --- data handling and layout switches ---
+    cast_to_floats: bool = Field(
+        True, description="If True, cast non-integer, non-date columns to float where possible"
+    )
+    header_row: bool = Field(
+        True, description="If True, use the first row as header; False disables header row"
+    )
+    # tabs: Optional[Union[list[float], float, int]] = Field(
+    #     None, description="Column widths in characters or ems; None triggers auto-calculation"
+    # )
+    equal: bool = Field(
+        False, description="If True, force equal column widths (may be ignored if conflicting)"
+    )
+
+    caption_align: str = Field(
+        "center", description="Alignment of the caption text"
+    )
+    large_ok: bool = Field(
+        False, description="If True, allow full rendering of large tables without truncation"
+    )
+
+    max_str_length: int = Field(
+        -1, description="Maximum length for stringified objects (e.g. nested DataFrames); -1 = unlimited"
+    )
+
+    # --- table width control ---
+    max_table_width: int = Field(
+        200, description="Maximum table width for markdown/text output mode"
+    )
+    table_width_mode: Literal["explicit", "natural", "breakable", "minimum"] = Field(
+        "explicit",
+        description=(
+            "Mode for determining table width. "
+            "'explicit': fixed width using max_table_width; "
+            "'natural': each cell fits its full content; "
+            "'breakable': wrap breakable strings; "
+            "'minimum': also wraps dates or float-like cells"
+        )
+    )
+    table_width_header_adjust: float = Field(
+        0.1, description="Proportion of width allocated to headers to balance content width"
+    )
+    table_width_header_relax: float = Field(
+        10.0, description="Extra characters allowed per column heading to help header wrapping"
+    )
+    debug: bool = Field(False, description="Run in debug mode with more reporting, include internal ID in caption and use colored output lines")
+
+    def write_template(self, path: Path) -> None:
+        """
+        Write this instance's field values to ``path`` as YAML.
+
+        Fields are dumped in declaration order (``sort_keys=False``), missing
+        parent directories are created, and the file is written as UTF-8.
+
+        :param path: destination file; anything accepted by ``Path``.
+        """
+        path = Path(path)
+        yaml_str = yaml.dump(self.model_dump(), sort_keys=False)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(yaml_str, encoding="utf-8")
+
+
+def write_template(path: Path):
+    """Generate a clean default config file at the given path."""
+    path = Path(path)
+    cfg = GTConfigModel()
+    yaml_str = yaml.dump(cfg.model_dump(), sort_keys=False)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(yaml_str, encoding="utf-8")
@@ -1,9 +1,14 @@
-# -*- coding: utf-8 -*-
+"""
+Core rendering logic for GreaterTables.
+
+Defines the `GreaterTables` class, which formats and renders pandas DataFrames
+to HTML, plain text, or LaTeX output using a validated configuration model.
+
+This is the main entry point for rendering logic. See `gtconfig.py` for configuration schema.
+"""

-# table formatting again
 from collections import namedtuple
 from decimal import InvalidOperation
-from enum import IntEnum
 from io import StringIO
 from itertools import groupby
 import logging
@@ -22,6 +27,8 @@ from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
 from rich import box
 from rich.table import Table

+from . gtenums import Breakability
+from . gtformats import GT_Format, TableFormat
 from . hasher import df_short_hash

 # turn this fuck-fest off
@@ -52,51 +59,6 @@ logger.info(f'Logger Setup; {__name__} module recompiled.')
 # temp = None


-class Breakability(IntEnum):
-    """To track if a column should or should not be broken (wrapped)."""
-
-    NEVER = 0
-    DATE = 3
-    MAYBE = 5
-    ACCEPTABLE = 10
-
-
-# specify text mode
-Line = namedtuple('Line', ['begin', 'hline', 'sep', 'end', 'index_sep'])
-DataRow = namedtuple('DataRow', ['begin', 'sep', 'end', 'index_sep'])
-TableFormat = namedtuple('TableFormat', [
-    'lineabove',
-    'linebelowheader',
-    'linebetweenrows',
-    'linebelow',
-    'headerrow',
-    'datarow',
-    'padding',
-    'with_header_hide'
-])
-
-# generic text format
-GT_Format = TableFormat(
-    lineabove=Line('┍', '━', '┯', '┑', '┳'),
-    linebelowheader=Line('┝', '━', '┿', '┥', '╋'),
-    linebetweenrows=Line('├', '─', '┼', '┤', '╂'),
-    linebelow=Line('┕', '━', '┷', '┙', '┻'),
-    headerrow=DataRow('│', '│', '│', '┃'),
-    datarow=DataRow('│', '│', '│', '┃'),
-    padding=1,
-    with_header_hide=None
-)
-
-# GT_Format = TableFormat(
-#     lineabove=Line('\u250d', '\u2501', '\u252f', '\u2511', '\u2533'),
-#     linebelowheader=Line('\u251d', '\u2501', '\u253f', '\u2525', '\u254b'),
-#     linebetweenrows=Line('\u251c', '\u2500', '\u253c', '\u2524', '\u2502'),
-#     linebelow=Line('\u2515', '\u2501', '\u2537', '\u2519', '\u253b'),
-#     headerrow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
-#     datarow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
-#     padding=1,
-#     with_header_hide=None
-# )


 class GT(object):
@@ -274,41 +236,41 @@ class GT(object):
                 date_cols=None,
                 raw_cols=None,
                 show_index=True,
-                 default_integer_str='{x:,d}',
-                 default_float_str='{x:,.3f}',
-                 default_date_str='%Y-%m-%d',
-                 default_ratio_str='{x:.1%}',
-                 default_formatter=None,
-                 table_float_format=None,
-                 table_hrule_width=1,
-                 table_vrule_width=1,
-                 hrule_widths=None,
-                 vrule_widths=None,
-                 sparsify=True,             # index sparsification - almost certainly want this!
-                 sparsify_columns=True,     # column sparsification with colspans
-                 spacing='medium',          # tight, medium, wide
-                 padding_trbl=None,         # tuple of four ints for padding
-                 tikz_scale=1.0,
-                 font_body=0.9,
-                 font_head=1.0,
-                 font_caption=1.1,
-                 font_bold_index=False,
-                 pef_precision=3,
-                 pef_lower=-3,
-                 pef_upper=6,
-                 cast_to_floats=True,
-                 header_row=True,
-                 tabs=None,
-                 equal=False,
-                 caption_align='center',
-                 large_ok=False,
-                 max_str_length=-1,
-                 str_table_fmt='mixed_grid',
-                 table_width_mode='explicit',
-                 table_width_header_adjust=0.1,
-                 table_width_header_relax=10,
-                 max_table_width=200,
-                 debug=False):
+                 # --> config
+                 # default_integer_str='{x:,d}',
+                 # default_float_str='{x:,.3f}',
+                 # default_date_str='%Y-%m-%d',
+                 # default_ratio_str='{x:.1%}',
+                 # default_formatter=None,
+                 # table_float_format=None,
+                 # table_hrule_width=1,
+                 # table_vrule_width=1,
+                 # hrule_widths=None,
+                 # vrule_widths=None,
+                 # sparsify=True,             # index sparsification - almost certainly want this!
+                 # sparsify_columns=True,     # column sparsification with colspans
+                 # spacing='medium',          # tight, medium, wide
+                 # padding_trbl=None,         # tuple of four ints for padding
+                 # tikz_scale=1.0,
+                 # font_body=0.9,
+                 # font_head=1.0,
+                 # font_caption=1.1,
+                 # font_bold_index=False,
+                 # pef_precision=3,
+                 # pef_lower=-3,
+                 # pef_upper=6,
+                 # cast_to_floats=True,
+                 # header_row=True,
+                 # tabs=None,
+                 # equal=False,
+                 # caption_align='center',
+                 # large_ok=False,
+                 # max_str_length=-1,
+                 # str_table_fmt='mixed_grid',  # no longer used
+                 # table_width_mode='explicit',
+                 # table_width_header_adjust=0.1,
+                 # table_width_header_relax=10,
+                 ):

        # deal with alternative input modes
        if df is None:
@@ -0,0 +1,27 @@
+"""
+Defines greater_table enumerations.
+
+Defines Breakability enumeration.
+
+Lines should never be broken, be flagged as a date, may be broken, or are acceptable
+for breaks (text).
+
+"""
+from enum import IntEnum, Enum
+
+
+class Breakability(IntEnum):
+    """
+    To track if a column should or should not be broken (wrapped).
+
+    An ``IntEnum``, so members compare and sort as integers.
+    """
+
+    NEVER = 0  # should never be broken
+    DATE = 3  # flagged as a date
+    MAYBE = 5  # may be broken
+    ACCEPTABLE = 10  # acceptable for breaks (text)
+
+
+class  Alignment(Enum):
+    """Left, right, center horizontal alignment."""
+
+    LEFT = 'l'
+    CENTER = 'c'
+    RIGHT = 'r'
@@ -0,0 +1,82 @@
+"""
+Define text table formats.
+
+Based on model in tabulate.
+
+Tabulate table formats are defined in namedtuples found in tf:
+
+from tabulate import _table_formats as tf
+
+This is a dict and e.g.,::
+
+    tf['mixed_grid'] = TableFormat(
+        lineabove=Line(begin='┍', hline='━', sep='┯', end='┑'),
+        linebelowheader=Line(begin='┝', hline='━', sep='┿', end='┥'),
+        linebetweenrows=Line(begin='├', hline='─', sep='┼', end='┤'),
+        linebelow=Line(begin='┕', hline='━', sep='┷', end='┙'),
+        headerrow=DataRow(begin='│', sep='│', end='│'),
+        datarow=DataRow(begin='│', sep='│', end='│'),
+        padding=1,
+        with_header_hide=None)
+
+    tf.keys() = dict_keys(['simple', 'plain', 'grid', 'simple_grid',
+    'rounded_grid', 'heavy_grid', 'mixed_grid', 'double_grid',
+    'fancy_grid', 'outline', 'simple_outline', 'rounded_outline',
+    'heavy_outline', 'mixed_outline', 'double_outline', 'fancy_outline',
+    'github', 'pipe', 'orgtbl', 'jira', 'presto', 'pretty', 'psql', 'rst',
+    'mediawiki', 'moinmoin', 'youtrack', 'html', 'unsafehtml', 'latex',
+    'latex_raw', 'latex_booktabs', 'latex_longtable', 'tsv', 'textile',
+    'asciidoc'])
+
+Parameters:
+    df: pandas.DataFrame
+        The data to display. Should have index reset, but specify index_levels.
+    data_col_widths: list[int]
+        List of visible widths (excluding padding) for each column.
+    data_col_aligns: list[str]
+        Alignment specifiers per column: 'left', 'center', or 'right'.
+    index_levels: int
+        Number of columns at the start considered index columns (split visually).
+    fmt: TableFormat
+        Box-drawing configuration (defaults to myFormat).
+"""
+
+from collections import namedtuple
+
+
+# specify text mode
+Line = namedtuple('Line', ['begin', 'hline', 'sep', 'end', 'index_sep'])
+DataRow = namedtuple('DataRow', ['begin', 'sep', 'end', 'index_sep'])
+TableFormat = namedtuple('TableFormat', [
+    'lineabove',
+    'linebelowheader',
+    'linebetweenrows',
+    'linebelow',
+    'headerrow',
+    'datarow',
+    'padding',
+    'with_header_hide'
+])
+
+# generic text format
+GT_Format = TableFormat(
+    lineabove=Line('┍', '━', '┯', '┑', '┳'),
+    linebelowheader=Line('┝', '━', '┿', '┥', '╋'),
+    linebetweenrows=Line('├', '─', '┼', '┤', '╂'),
+    linebelow=Line('┕', '━', '┷', '┙', '┻'),
+    headerrow=DataRow('│', '│', '│', '┃'),
+    datarow=DataRow('│', '│', '│', '┃'),
+    padding=1,
+    with_header_hide=None
+)
+
+# GT_Format = TableFormat(
+#     lineabove=Line('\u250d', '\u2501', '\u252f', '\u2511', '\u2533'),
+#     linebelowheader=Line('\u251d', '\u2501', '\u253f', '\u2525', '\u254b'),
+#     linebetweenrows=Line('\u251c', '\u2500', '\u253c', '\u2524', '\u2502'),
+#     linebelow=Line('\u2515', '\u2501', '\u2537', '\u2519', '\u253b'),
+#     headerrow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
+#     datarow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
+#     padding=1,
+#     with_header_hide=None
+# )
@@ -0,0 +1,171 @@
+"""
+Make fake dataframes for testing.
+
+GPT from SJMM design.
+"""
+
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Optional, Union
+from datetime import datetime, timedelta
+import hashlib
+import re
+
+
+import numpy as np
+import pandas as pd
+from faker import Faker
+
+
+@dataclass
+class TestDataFrameFactory:
+    """
+    Factory for generating small synthetic pandas DataFrames for testing.
+
+    Attributes:
+        colname_words: Optional list of strings to use for column names.
+        default_word_count: Max number of words for string columns (default 3).
+        seed: Optional random seed. If None, one is generated.
+    """
+    colname_words: Optional[list[str]] = None
+    default_word_count: int = 3
+    seed: Optional[int] = None
+    _last_args: dict = field(default_factory=dict, init=False)
+
+    def __post_init__(self):
+        self.faker = Faker()
+        self.seed = int(self.seed if self.seed is not None else np.random.SeedSequence().entropy)
+        self.rng = np.random.default_rng(self.seed)
+
+    def make(self, rows: int, columns: Union[int, str], index: Union[int, str] = 0,
+             col_index: Union[int, str] = 0, missing: float = 0.0) -> pd.DataFrame:
+        """
+        Generate a test DataFrame with the given specification.
+
+        Args:
+            rows: Number of rows.
+            columns: Column type spec (int for all float cols, or string type codes).
+            index: Index level types (int for RangeIndex or string like 'ti').
+            col_index: Column index levels (same format as `index`).
+            missing: Proportion of missing data in each column.
+
+        Returns:
+            DataFrame
+        """
+        self._last_args = dict(rows=rows, columns=columns, index=index, col_index=col_index, missing=missing)
+        return self._generate(**self._last_args)
+
+    def another(self, new_seed: bool = True) -> pd.DataFrame:
+        """
+        Generate another DataFrame with the last parameters.
+
+        Args:
+            new_seed: If True, re-randomize the generator seed.
+
+        Returns:
+            DataFrame
+        """
+        if new_seed:
+            self.seed = int(np.random.SeedSequence().entropy)
+            self.rng = np.random.default_rng(self.seed)
+        return self._generate(**self._last_args)
+
+    def random(self, index_levels: int = 1, column_levels: int = 1) -> pd.DataFrame:
+        """
+        Generate a DataFrame with randomly chosen settings.
+
+        Args:
+            index_levels: Number of index levels to use.
+            column_levels: Number of column MultiIndex levels.
+
+        Returns:
+            DataFrame
+        """
+        rows = self.rng.integers(10, 50)
+        col_types = self.rng.choice(['d', 'f', 'i', 's1', 's3', 's7', 'h', 't', 'p'], size=self.rng.integers(3, 7))
+        missing = round(float(self.rng.uniform(0, 0.15)), 2)
+        index = ''.join(self.rng.choice(['t', 'd', 'i', 's2'], size=index_levels))
+        col_index = ''.join(self.rng.choice(['s', 'i', 'd'], size=column_levels))
+        return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)
+
+    def _parse_colspec(self, spec: str) -> list[str]:
+        return re.findall(r's\d+|[a-z]', spec)
+
+
+    def _generate(self, rows: int, columns: Union[int, str], index: Union[int, str],
+                  col_index: Union[int, str], missing: float) -> pd.DataFrame:
+        if isinstance(columns, int):
+            col_types = ['s3'] * columns
+        else:
+            col_types = self._parse_colspec(columns)
+
+        colnames = self._make_column_names(len(col_types))
+        data = {
+            name: self._generate_column(dt, rows) for name, dt in zip(colnames, col_types)
+        }
+        df = pd.DataFrame(data)
+        df.index = self._make_index(index, rows, "i")
+        df.columns = self._make_index(col_index, len(df.columns), "c") if isinstance(col_index, str) else df.columns
+        df = self._insert_missing(df, missing)
+        return df
+
+    def _make_column_names(self, n: int) -> list[str]:
+        if self.colname_words:
+            pool = self.colname_words
+        else:
+            pool = [self.faker.word() for _ in range(n * 2)]
+        names = []
+        used = set()
+        for word in pool:
+            if len(names) >= n:
+                break
+            if word not in used:
+                names.append(word)
+                used.add(word)
+        while len(names) < n:
+            names.append(f"col_{len(names)}")
+        return names
+
+    def _generate_column(self, dtype: str, n: int) -> pd.Series:
+        if dtype.startswith('s'):
+            max_words = int(dtype[1:]) if len(dtype) > 1 else self.default_word_count
+            return pd.Series([" ".join(self.faker.words(self.rng.integers(1, max_words + 1))) for _ in range(n)])
+        if dtype == 'f':
+            return pd.Series(self.rng.normal(loc=100, scale=25, size=n))
+        if dtype == 'i':
+            return pd.Series(self.rng.integers(1e9, 1e12, size=n), dtype='int64')
+        if dtype == 'd':
+            start_date = self.faker.date_between(start_date='-10y', end_date='today')
+            return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
+        if dtype == 't':
+            start_dt = datetime.now() - timedelta(days=365 * 2)
+            return pd.Series([start_dt + timedelta(minutes=int(self.rng.integers(0, 2 * 365 * 24 * 60))) for _ in range(n)])
+        if dtype == 'h':
+            return pd.Series([
+                hashlib.blake2b(f"val{i}".encode(), digest_size=32).hexdigest()
+                for i in range(n)
+            ])
+        if dtype == 'p':
+            return pd.Series([str(Path(f"/data/{self.faker.word()}/{i}.dat")) for i in range(n)])
+        raise ValueError(f"Unknown dtype: {dtype}")
+
+    def _make_index(self, desc: Union[int, str], n: int, label_prefix: str) -> pd.Index:
+        if isinstance(desc, int):
+            return pd.RangeIndex(n, name=f"{label_prefix}0")
+        levels = []
+        names = []
+        for j, dt in enumerate(desc):
+            s = self._generate_column(dt, n)
+            levels.append(s)
+            names.append(f"{label_prefix}{j}")
+        return pd.MultiIndex.from_arrays(levels, names=names)
+
+    def _insert_missing(self, df: pd.DataFrame, prop: float) -> pd.DataFrame:
+        if prop <= 0:
+            return df
+        n_rows = df.shape[0]
+        for col in df.columns:
+            n_missing = max(1, int(np.floor(prop * n_rows)))
+            missing_indices = self.rng.choice(n_rows, size=n_missing, replace=False)
+            df.iloc[missing_indices, df.columns.get_loc(col)] = np.nan
+        return df
@@ -52,3 +52,7 @@ dev = [
    "sphinx-toggleprompt",
    "sphinx-multitoc-numbering"
 ]
+
+
+[project.scripts]
+gt = "greater_tables.cli:cli"