mirror of
https://github.com/wassname/greater_tables_project.git
synced 2026-06-27 17:30:44 +08:00
start on v3.0
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
|
||||
|
||||
Here's how you subclass.
|
||||
|
||||
|
||||
|
||||
@@ -1,3 +1,121 @@
|
||||
sort out the variety of readmes... this is the main one
|
||||
|
||||
# v 3.0 update
|
||||
|
||||
* config files
|
||||
* unified col width and info dataframe
|
||||
* de-texing
|
||||
* cli for config and writeout a csv etc.
|
||||
* tests
|
||||
|
||||
# TODO
|
||||
|
||||
* Ratio cols with multi index columns
|
||||
* % in tex output - never allow comments?
|
||||
* center / left / right table output -> CSS
|
||||
* ?Option to hide index
|
||||
* Bring over the roll your own logger
|
||||
|
||||
***
|
||||
|
||||
# from GPT
|
||||
|
||||
|
||||
Absolutely—here’s a structured summary of everything we’ve covered, organized by topic.
|
||||
|
||||
---
|
||||
|
||||
## 🧱 Project Structure & Philosophy
|
||||
|
||||
* Your `GreaterTables` class formats a `pandas.DataFrame` to **HTML, text, or LaTeX**.
|
||||
* The class is **immutable**: formatting is fixed at construction time, like a pure value object.
|
||||
* You avoid branchy, incremental APIs (like `ggplot`) and prefer creating fresh objects.
|
||||
* You wanted a way to handle growing config complexity, which led to a **YAML config + Pydantic schema** design.
|
||||
|
||||
---
|
||||
|
||||
## 📁 Project Layout
|
||||
|
||||
```
|
||||
greater_tables_project/
|
||||
├── greater_tables/
|
||||
│ ├── __init__.py
|
||||
│ ├── gtconfig.py ← config model + loader
|
||||
│ ├── gtcore.py ← GreaterTables class
|
||||
│ └── defaults/
|
||||
│ └── config_template.yaml
|
||||
├── tests/
|
||||
├── pyproject.toml
|
||||
```
|
||||
|
||||
* `GTConfigModel` = schema + default source of truth
|
||||
* `GTConfig` = singleton loader and validator
|
||||
* `config_template.yaml` = editable fallback + documentation base
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Config Management
|
||||
|
||||
* All defaults and types are declared in `GTConfigModel` (Pydantic).
|
||||
* Config is **loaded from YAML**, validated by `GTConfigModel`.
|
||||
* You can **generate** a valid config file from the model using `.model_dump() → YAML`.
|
||||
* Singleton pattern (`GTConfig.__new__`) caches the config at runtime.
|
||||
|
||||
### Helpers
|
||||
|
||||
* `GTConfig().get(overrides=...)` gives a safe, override-able config
|
||||
* `write_template(path)` writes a default config YAML for user to edit
|
||||
|
||||
---
|
||||
|
||||
## 🛠 Git Workflow (Solo Dev, Linear)
|
||||
|
||||
* Use **tags** (`git tag v0.2.0`) to label stable versions
|
||||
* Use **`git reset --hard <tag>`** to roll back and discard later commits
|
||||
* Avoid branches entirely—keep a **single linear history**
|
||||
* Tags let you bounce around safely, with names instead of hashes
|
||||
* Releases on GitHub are tags + metadata, optional for publishing
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ CLI Tool
|
||||
|
||||
* Built with `click`, with subcommands:
|
||||
|
||||
* `gt render data.csv --format html`
|
||||
* `gt write-template`
|
||||
* Reads any Pandas-supported file (`.csv`, `.feather`, `.pkl`, etc.)
|
||||
* Outputs to console or to file
|
||||
* Uses current config by default, or override with `--config path.yaml`
|
||||
|
||||
---
|
||||
|
||||
## 🧠 Design Principles You’re Following
|
||||
|
||||
| Principle | Your Approach |
|
||||
| ---------------------------- | ------------------------------------------- |
|
||||
| Immutability | `GT(df, config)` is fixed once created |
|
||||
| Separation of concerns | `GTConfigModel` holds defaults/types |
|
||||
| Config as code/documentation | `config_template.yaml` generated from model |
|
||||
| CLI-first mindset | `click` used to expose functionality |
|
||||
| Linear Git workflow | Tags for rollback, no branches |
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to generate:
|
||||
|
||||
* a Markdown doc for contributors
|
||||
* a `.bat` script to roll back to a tag
|
||||
* test scaffolding or release automation
|
||||
|
||||
You're in great shape. Gum-level perfection achieved.
|
||||
|
||||
|
||||
|
||||
***
|
||||
|
||||
# OLD
|
||||
|
||||
# Greater Tables
|
||||
|
||||
Creating presentation quality tables from pandas dataframes is frustrating. It is hard to left-align text and right-align numbers using pandas `display` or `df.to_html`. The `great_tables` package does a really nice job with pandas and polars dataframes but does not support indexes or TeX output.
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
REM USE doc-test instead!!
|
||||
echo use doc test...
|
||||
rem @echo off
|
||||
rem setlocal
|
||||
|
||||
rem :: Define paths
|
||||
rem set REPO_URL=https://github.com/mynl/greater_tables_project
|
||||
rem set BUILD_DIR=C:\tmp\greater_tables_docs
|
||||
rem set VENV_DIR=%BUILD_DIR%\venv
|
||||
|
||||
rem :: Remove existing directory if it exists
|
||||
rem if exist "%BUILD_DIR%" rd /s /q "%BUILD_DIR%"
|
||||
|
||||
rem :: Clone the latest development repo
|
||||
rem git clone --depth 1 %REPO_URL% "%BUILD_DIR%"
|
||||
rem if %errorlevel% neq 0 exit /b %errorlevel%
|
||||
|
||||
rem pushd "%BUILD_DIR%"
|
||||
|
||||
rem :: Create virtual environment
|
||||
rem python -m venv "%VENV_DIR%"
|
||||
rem if %errorlevel% neq 0 exit /b %errorlevel%
|
||||
|
||||
rem :: Activate virtual environment
|
||||
rem call "%VENV_DIR%\Scripts\activate"
|
||||
|
||||
rem :: Upgrade pip and install dependencies from pyproject.toml
|
||||
rem python -m pip install --upgrade pip
|
||||
rem pip install --upgrade build setuptools
|
||||
rem pip install .
|
||||
rem pip install ".[doc]" || pip install sphinx # Ensure Sphinx is installed
|
||||
|
||||
rem :: Build the documentation
|
||||
rem sphinx-build -b html docs docs/_build/html
|
||||
rem if %errorlevel% neq 0 exit /b %errorlevel%
|
||||
|
||||
rem :: Deactivate virtual environment
|
||||
rem deactivate
|
||||
|
||||
rem echo Documentation build complete: %BUILD_DIR%\docs\_build\html
|
||||
rem endlocal
|
||||
@@ -1,6 +0,0 @@
|
||||
# TODO
|
||||
|
||||
* Ratio cols with multi index columns
|
||||
* % in tex output - never allow comments?
|
||||
* center / left / right table output -> CSS
|
||||
* ?Option to hide index
|
||||
@@ -1,6 +1,7 @@
|
||||
__version__ = '2.0.0'
|
||||
__version__ = '3.0.0'
|
||||
__project__ = 'greater_tables'
|
||||
__author__ = 'Stephen J Mildenhall'
|
||||
|
||||
from . greater_tables import *
|
||||
from . utilities import *
|
||||
from . gtcore import GT
|
||||
# from . gtbreaks import Breakability
|
||||
|
||||
|
||||
@@ -0,0 +1,50 @@
|
||||
import click
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from .gtconfig import GTConfig, write_template
|
||||
from .gtcore import GreaterTables
|
||||
|
||||
@click.group()
def cli():
    """Greater Tables CLI tool"""
    # Group callback only: it does no work itself. Subcommands register
    # themselves on this group through ``@cli.command()``.
|
||||
|
||||
@cli.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option("--output", "-o", type=click.Path(), help="Write rendered output to file")
@click.option("--format", "-f", type=click.Choice(["html", "text", "latex"]), default="html")
@click.option("--config", type=click.Path(), help="Path to a YAML config file")
def render(input_file, output, format, config):
    """Render a table from a data file."""
    # NOTE: the docstring above is the command's --help text, so it is kept short.
    # The parameter name ``format`` shadows the builtin, but click derives it
    # from the ``--format`` option, so it is part of the command's interface.
    path = Path(input_file)
    ext = path.suffix.lower()

    # Map each supported file extension to the pandas reader that loads it.
    # SECURITY: unpickling can execute arbitrary code; only pass .pkl files
    # that come from a trusted source.
    readers = {
        ".csv": pd.read_csv,
        ".feather": pd.read_feather,
        ".pkl": pd.read_pickle,
    }
    reader = readers.get(ext)
    if reader is None:
        raise click.UsageError(f"Unsupported extension: {ext}")
    df = reader(path)

    # No --config given: pass None so GTConfig decides which configuration to use.
    cfg = GTConfig(Path(config) if config else None).get()
    gt = GreaterTables(df, config=cfg)

    # click.Choice has already restricted ``format`` to html/text/latex.
    if format == "html":
        rendered = gt.render_html()
    elif format == "text":
        rendered = gt.render_text()
    else:
        rendered = gt.render_latex()

    if output:
        Path(output).write_text(rendered, encoding="utf-8")
    else:
        # click.echo (rather than print) matches the other command in this
        # module and copes better with non-UTF-8 consoles and closed pipes.
        click.echo(rendered)
|
||||
|
||||
@cli.command()
@click.argument("path", type=click.Path(), default="config.yaml")
def write_template(path):
    """Write default config to the given path."""
    # This command function is itself called ``write_template``. Once the
    # decorator has run, the module-level name refers to the click Command and
    # no longer to the helper imported from gtconfig, so calling
    # ``write_template(...)`` here would invoke the command again instead of
    # writing the file. Import the helper under a distinct local alias.
    from .gtconfig import write_template as _write_config_template

    _write_config_template(Path(path))
    click.echo(f"Config written to {path}")
|
||||
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Configuration model and utilities for GreaterTables.
|
||||
|
||||
Defines the `GTConfigModel` schema using Pydantic, which acts as the single
|
||||
source of truth for default values, validation, and structure of all table-rendering options.
|
||||
|
||||
Also includes functions for writing editable config templates and loading from YAML.
|
||||
"""
|
||||
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Literal
|
||||
import yaml
|
||||
|
||||
from pydantic import BaseModel, Field, ValidationError, ConfigDict
|
||||
import yaml
|
||||
|
||||
|
||||
class GTConfigModel(BaseModel):
    """
    Configuration model for GreaterTables.

    Declares every configurable option controlling the formatting and
    rendering of tables in HTML, text, and LaTeX outputs.

    Each field has a default value and is validated using Pydantic. Instances
    are frozen, so a configuration cannot be changed after it is created. You
    can load configuration from a YAML file or create it programmatically.
    Use this model as the authoritative source of valid configuration fields.

    :Usage:

        >>> from greater_tables.gtconfig import GTConfigModel
        >>> cfg = GTConfigModel(font_body=1.2, caption_align="left")

    :see also: ``GTConfig`` for loading from YAML with overrides.
               ``gt write-template`` CLI command to generate a default config file.
    """

    # immutable
    model_config = ConfigDict(frozen=True)

    # ---- default format strings ----
    default_integer_str: str = Field(
        "{x:,d}", description="Format f-string for integers. Example: '{x:,d}'"
    )
    default_float_str: str = Field(
        "{x:,.3f}", description="Format f-string for floats. Example: '{x:,.3f}'"
    )
    default_date_str: str = Field(
        "%Y-%m-%d", description="Format string for dates (no braces or 'x'). Example: '%Y-%m-%d'"
    )
    default_ratio_str: str = Field(
        "{x:.1%}", description="Format f-string for ratios. Example: '{x:.1%}'"
    )
    default_formatter: Optional[str] = Field(
        None, description="Optional fallback formatter f-string"
    )

    # ---- table-wide format and rules ----
    table_float_format: Optional[str] = Field(
        None, description="Float format string for the entire table; overrides column-specific formats"
    )
    table_hrule_width: int = Field(
        1, description="Width of top, bottom, and header horizontal rules"
    )
    table_vrule_width: int = Field(
        1, description="Width of vertical rule separating index from body"
    )
    hrule_widths: Optional[tuple[int, int, int]] = Field(
        (0, 0, 0), description="Tuple of three ints for horizontal rule widths (for multiindex use)"
    )
    vrule_widths: Optional[tuple[int, int, int]] = Field(
        (0, 0, 0), description="Tuple of three ints for vertical rule widths (for multiindex columns)"
    )

    # ---- sparsification ----
    sparsify: bool = Field(
        True, description="If True, sparsify index columns (recommended)"
    )
    sparsify_columns: bool = Field(
        True, description="If True, sparsify column headers using colspans"
    )

    # ---- spacing ----
    spacing: str = Field(
        "medium", description="Shorthand for cell padding. One of: 'tight', 'medium', 'wide'"
    )
    padding_trbl: Optional[tuple[int, int, int, int]] = Field(
        None, description="Manual padding in the order (top, right, bottom, left)"
    )

    # ---- scaling and fonts ----
    tikz_scale: float = Field(
        1.0, description="Scaling factor applied to LaTeX TikZ tables"
    )
    font_body: float = Field(
        0.9, description="Font size for body text (in em units)"
    )
    font_head: float = Field(
        1.0, description="Font size for header text (in em units)"
    )
    font_caption: float = Field(
        1.1, description="Font size for caption text (in em units)"
    )
    font_bold_index: bool = Field(
        False, description="If True, make index columns bold"
    )

    # ---- engineering format ----
    pef_precision: int = Field(
        3, description="Precision for engineering format (digits after decimal)"
    )
    pef_lower: int = Field(
        -3, description="Lower threshold: apply engineering format if abs(x) < 10**pef_lower"
    )
    pef_upper: int = Field(
        6, description="Upper threshold: apply engineering format if abs(x) > 10**pef_upper"
    )

    # ---- data handling and layout ----
    cast_to_floats: bool = Field(
        True, description="If True, cast non-integer, non-date columns to float where possible"
    )
    header_row: bool = Field(
        True, description="If True, use the first row as header; False disables header row"
    )
    # tabs: Optional[Union[list[float], float, int]] = Field(
    #     None, description="Column widths in characters or ems; None triggers auto-calculation"
    # )
    equal: bool = Field(
        False, description="If True, force equal column widths (may be ignored if conflicting)"
    )

    caption_align: str = Field(
        "center", description="Alignment of the caption text"
    )
    large_ok: bool = Field(
        False, description="If True, allow full rendering of large tables without truncation"
    )

    max_str_length: int = Field(
        -1, description="Maximum length for stringified objects (e.g. nested DataFrames); -1 = unlimited"
    )

    # ---- table width ----
    max_table_width: int = Field(
        200, description="Maximum table width for markdown/text output mode"
    )
    table_width_mode: Literal["explicit", "natural", "breakable", "minimum"] = Field(
        "explicit",
        description=(
            "Mode for determining table width. "
            "'explicit': fixed width using max_table_width; "
            "'natural': each cell fits its full content; "
            "'breakable': wrap breakable strings; "
            "'minimum': also wraps dates or float-like cells"
        )
    )
    table_width_header_adjust: float = Field(
        0.1, description="Proportion of width allocated to headers to balance content width"
    )
    table_width_header_relax: float = Field(
        10.0, description="Extra characters allowed per column heading to help header wrapping"
    )
    debug: bool = Field(False, description="Run in debug mode with more reporting, include internal ID in caption and use colored output lines")

    def write_template(self, path: Path):
        """
        Write this configuration's current values to a YAML file.

        The values written are those of this instance, which equal the
        defaults only when the instance was created without overrides.

        :param path: destination file; missing parent directories are created.
        """
        path = Path(path)
        # safe_dump writes tuple-valued fields (e.g. hrule_widths) as plain YAML
        # lists. yaml.dump would tag them ``!!python/tuple``, which is noise in
        # a file meant for hand editing and is rejected by yaml.safe_load.
        yaml_str = yaml.safe_dump(self.model_dump(), sort_keys=False)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(yaml_str, encoding="utf-8")
|
||||
|
||||
|
||||
def write_template(path: Path):
    """
    Generate a clean default config file at the given path.

    Dumps a freshly constructed ``GTConfigModel`` (all defaults) as YAML.

    :param path: destination file; missing parent directories are created.
    """
    path = Path(path)
    cfg = GTConfigModel()
    # safe_dump writes tuple-valued defaults as plain YAML lists. yaml.dump
    # would tag them ``!!python/tuple``, which yaml.safe_load cannot read back.
    yaml_str = yaml.safe_dump(cfg.model_dump(), sort_keys=False)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(yaml_str, encoding="utf-8")
|
||||
@@ -1,9 +1,14 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Core rendering logic for GreaterTables.
|
||||
|
||||
Defines the `GreaterTables` class, which formats and renders pandas DataFrames
|
||||
to HTML, plain text, or LaTeX output using a validated configuration model.
|
||||
|
||||
This is the main entry point for rendering logic. See `gtconfig.py` for configuration schema.
|
||||
"""
|
||||
|
||||
# table formatting again
|
||||
from collections import namedtuple
|
||||
from decimal import InvalidOperation
|
||||
from enum import IntEnum
|
||||
from io import StringIO
|
||||
from itertools import groupby
|
||||
import logging
|
||||
@@ -22,6 +27,8 @@ from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, \
|
||||
from rich import box
|
||||
from rich.table import Table
|
||||
|
||||
from . gtenums import Breakability
|
||||
from . gtformats import GT_Format, TableFormat
|
||||
from . hasher import df_short_hash
|
||||
|
||||
# turn this fuck-fest off
|
||||
@@ -52,51 +59,6 @@ logger.info(f'Logger Setup; {__name__} module recompiled.')
|
||||
# temp = None
|
||||
|
||||
|
||||
class Breakability(IntEnum):
|
||||
"""To track if a column should or should not be broken (wrapped)."""
|
||||
|
||||
NEVER = 0
|
||||
DATE = 3
|
||||
MAYBE = 5
|
||||
ACCEPTABLE = 10
|
||||
|
||||
|
||||
# specify text mode
|
||||
Line = namedtuple('Line', ['begin', 'hline', 'sep', 'end', 'index_sep'])
|
||||
DataRow = namedtuple('DataRow', ['begin', 'sep', 'end', 'index_sep'])
|
||||
TableFormat = namedtuple('TableFormat', [
|
||||
'lineabove',
|
||||
'linebelowheader',
|
||||
'linebetweenrows',
|
||||
'linebelow',
|
||||
'headerrow',
|
||||
'datarow',
|
||||
'padding',
|
||||
'with_header_hide'
|
||||
])
|
||||
|
||||
# generic text format
|
||||
GT_Format = TableFormat(
|
||||
lineabove=Line('┍', '━', '┯', '┑', '┳'),
|
||||
linebelowheader=Line('┝', '━', '┿', '┥', '╋'),
|
||||
linebetweenrows=Line('├', '─', '┼', '┤', '╂'),
|
||||
linebelow=Line('┕', '━', '┷', '┙', '┻'),
|
||||
headerrow=DataRow('│', '│', '│', '┃'),
|
||||
datarow=DataRow('│', '│', '│', '┃'),
|
||||
padding=1,
|
||||
with_header_hide=None
|
||||
)
|
||||
|
||||
# GT_Format = TableFormat(
|
||||
# lineabove=Line('\u250d', '\u2501', '\u252f', '\u2511', '\u2533'),
|
||||
# linebelowheader=Line('\u251d', '\u2501', '\u253f', '\u2525', '\u254b'),
|
||||
# linebetweenrows=Line('\u251c', '\u2500', '\u253c', '\u2524', '\u2502'),
|
||||
# linebelow=Line('\u2515', '\u2501', '\u2537', '\u2519', '\u253b'),
|
||||
# headerrow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
|
||||
# datarow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
|
||||
# padding=1,
|
||||
# with_header_hide=None
|
||||
# )
|
||||
|
||||
|
||||
class GT(object):
|
||||
@@ -274,41 +236,41 @@ class GT(object):
|
||||
date_cols=None,
|
||||
raw_cols=None,
|
||||
show_index=True,
|
||||
default_integer_str='{x:,d}',
|
||||
default_float_str='{x:,.3f}',
|
||||
default_date_str='%Y-%m-%d',
|
||||
default_ratio_str='{x:.1%}',
|
||||
default_formatter=None,
|
||||
table_float_format=None,
|
||||
table_hrule_width=1,
|
||||
table_vrule_width=1,
|
||||
hrule_widths=None,
|
||||
vrule_widths=None,
|
||||
sparsify=True, # index sparsification - almost certainly want this!
|
||||
sparsify_columns=True, # column sparsification with colspans
|
||||
spacing='medium', # tight, medium, wide
|
||||
padding_trbl=None, # tuple of four ints for padding
|
||||
tikz_scale=1.0,
|
||||
font_body=0.9,
|
||||
font_head=1.0,
|
||||
font_caption=1.1,
|
||||
font_bold_index=False,
|
||||
pef_precision=3,
|
||||
pef_lower=-3,
|
||||
pef_upper=6,
|
||||
cast_to_floats=True,
|
||||
header_row=True,
|
||||
tabs=None,
|
||||
equal=False,
|
||||
caption_align='center',
|
||||
large_ok=False,
|
||||
max_str_length=-1,
|
||||
str_table_fmt='mixed_grid',
|
||||
table_width_mode='explicit',
|
||||
table_width_header_adjust=0.1,
|
||||
table_width_header_relax=10,
|
||||
max_table_width=200,
|
||||
debug=False):
|
||||
# --> config
|
||||
# default_integer_str='{x:,d}',
|
||||
# default_float_str='{x:,.3f}',
|
||||
# default_date_str='%Y-%m-%d',
|
||||
# default_ratio_str='{x:.1%}',
|
||||
# default_formatter=None,
|
||||
# table_float_format=None,
|
||||
# table_hrule_width=1,
|
||||
# table_vrule_width=1,
|
||||
# hrule_widths=None,
|
||||
# vrule_widths=None,
|
||||
# sparsify=True, # index sparsification - almost certainly want this!
|
||||
# sparsify_columns=True, # column sparsification with colspans
|
||||
# spacing='medium', # tight, medium, wide
|
||||
# padding_trbl=None, # tuple of four ints for padding
|
||||
# tikz_scale=1.0,
|
||||
# font_body=0.9,
|
||||
# font_head=1.0,
|
||||
# font_caption=1.1,
|
||||
# font_bold_index=False,
|
||||
# pef_precision=3,
|
||||
# pef_lower=-3,
|
||||
# pef_upper=6,
|
||||
# cast_to_floats=True,
|
||||
# header_row=True,
|
||||
# tabs=None,
|
||||
# equal=False,
|
||||
# caption_align='center',
|
||||
# large_ok=False,
|
||||
# max_str_length=-1,
|
||||
# str_table_fmt='mixed_grid', # no longer used
|
||||
# table_width_mode='explicit',
|
||||
# table_width_header_adjust=0.1,
|
||||
# table_width_header_relax=10,
|
||||
):
|
||||
|
||||
# deal with alternative input modes
|
||||
if df is None:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
Defines greater_tables enumerations.
|
||||
|
||||
Defines Breakability enumeration.
|
||||
|
||||
Columns are never broken, are flagged as a date, may be broken, or are acceptable
|
||||
for breaks (text).
|
||||
|
||||
"""
|
||||
from enum import IntEnum, Enum
|
||||
|
||||
|
||||
class Breakability(IntEnum):
    """Integer-valued flag recording whether a column may be broken (wrapped)."""

    # Members are listed in increasing numeric order.
    NEVER = 0        # never break
    DATE = 3         # flagged as a date
    MAYBE = 5        # may be broken
    ACCEPTABLE = 10  # acceptable to break (text)
|
||||
|
||||
|
||||
class Alignment(Enum):
    """Horizontal alignment choices; each member's value is a one-letter code."""

    LEFT = 'l'
    CENTER = 'c'
    RIGHT = 'r'
|
||||
@@ -0,0 +1,82 @@
|
||||
"""
|
||||
Define text table formats.
|
||||
|
||||
Based on model in tabulate.
|
||||
|
||||
Tabulate table formats are defined in namedtuples found in tf:
|
||||
|
||||
from tabulate import _table_formats as tf
|
||||
|
||||
This is a dict and e.g.,::
|
||||
|
||||
tf['mixed_grid'] = TableFormat(
|
||||
lineabove=Line(begin='┍', hline='━', sep='┯', end='┑'),
|
||||
linebelowheader=Line(begin='┝', hline='━', sep='┿', end='┥'),
|
||||
linebetweenrows=Line(begin='├', hline='─', sep='┼', end='┤'),
|
||||
linebelow=Line(begin='┕', hline='━', sep='┷', end='┙'),
|
||||
headerrow=DataRow(begin='│', sep='│', end='│'),
|
||||
datarow=DataRow(begin='│', sep='│', end='│'),
|
||||
padding=1,
|
||||
with_header_hide=None)
|
||||
|
||||
tf.keys() = dict_keys(['simple', 'plain', 'grid', 'simple_grid',
|
||||
'rounded_grid', 'heavy_grid', 'mixed_grid', 'double_grid',
|
||||
'fancy_grid', 'outline', 'simple_outline', 'rounded_outline',
|
||||
'heavy_outline', 'mixed_outline', 'double_outline', 'fancy_outline',
|
||||
'github', 'pipe', 'orgtbl', 'jira', 'presto', 'pretty', 'psql', 'rst',
|
||||
'mediawiki', 'moinmoin', 'youtrack', 'html', 'unsafehtml', 'latex',
|
||||
'latex_raw', 'latex_booktabs', 'latex_longtable', 'tsv', 'textile',
|
||||
'asciidoc'])
|
||||
|
||||
Parameters:
|
||||
df: pandas.DataFrame
|
||||
The data to display. Should have index reset, but specify index_levels.
|
||||
data_col_widths: list[int]
|
||||
List of visible widths (excluding padding) for each column.
|
||||
data_col_aligns: list[str]
|
||||
Alignment specifiers per column: 'left', 'center', or 'right'.
|
||||
index_levels: int
|
||||
Number of columns at the start considered index columns (split visually).
|
||||
fmt: TableFormat
|
||||
Box-drawing configuration (defaults to GT_Format).
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
# Text-mode building blocks. These follow the tabulate layout described in the
# module docstring, with one extra ``index_sep`` glyph on every rule and row
# (presumably drawn between the index columns and the body - confirm in the
# renderer).
Line = namedtuple('Line', 'begin hline sep end index_sep')
DataRow = namedtuple('DataRow', 'begin sep end index_sep')
TableFormat = namedtuple(
    'TableFormat',
    'lineabove linebelowheader linebetweenrows linebelow '
    'headerrow datarow padding with_header_hide',
)

# generic text format
GT_Format = TableFormat(
    lineabove=Line(begin='┍', hline='━', sep='┯', end='┑', index_sep='┳'),
    linebelowheader=Line(begin='┝', hline='━', sep='┿', end='┥', index_sep='╋'),
    linebetweenrows=Line(begin='├', hline='─', sep='┼', end='┤', index_sep='╂'),
    linebelow=Line(begin='┕', hline='━', sep='┷', end='┙', index_sep='┻'),
    headerrow=DataRow(begin='│', sep='│', end='│', index_sep='┃'),
    datarow=DataRow(begin='│', sep='│', end='│', index_sep='┃'),
    padding=1,
    with_header_hide=None,
)
|
||||
|
||||
# GT_Format = TableFormat(
|
||||
# lineabove=Line('\u250d', '\u2501', '\u252f', '\u2511', '\u2533'),
|
||||
# linebelowheader=Line('\u251d', '\u2501', '\u253f', '\u2525', '\u254b'),
|
||||
# linebetweenrows=Line('\u251c', '\u2500', '\u253c', '\u2524', '\u2502'),
|
||||
# linebelow=Line('\u2515', '\u2501', '\u2537', '\u2519', '\u253b'),
|
||||
# headerrow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
|
||||
# datarow=DataRow('\u2502', '\u2502', '\u2502', '\u2503'),
|
||||
# padding=1,
|
||||
# with_header_hide=None
|
||||
# )
|
||||
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Make fake dataframes for testing.
|
||||
|
||||
GPT from SJMM design.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
from datetime import datetime, timedelta
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from faker import Faker
|
||||
|
||||
|
||||
@dataclass
class TestDataFrameFactory:
    """
    Factory for generating small synthetic pandas DataFrames for testing.

    Column and index contents are described by one-letter type codes:

    * ``s`` / ``sN`` - strings of 1 to N random words (N defaults to
      ``default_word_count``)
    * ``f`` - normally distributed floats
    * ``i`` - large integers
    * ``d`` - consecutive daily dates
    * ``t`` - random datetimes within the two years before now
    * ``h`` - hex digest strings
    * ``p`` - path-like strings

    Attributes:
        colname_words: Optional list of strings to use for column names.
        default_word_count: Max number of words for string columns (default 3).
        seed: Optional random seed. If None, one is generated. The seed drives
            both the numpy generator and the Faker instance.
    """

    # The class name starts with "Test"; tell pytest not to try to collect it.
    __test__ = False

    colname_words: Optional[list[str]] = None
    default_word_count: int = 3
    seed: Optional[int] = None
    _last_args: dict = field(default_factory=dict, init=False)

    def __post_init__(self):
        self.faker = Faker()
        self.seed = int(self.seed if self.seed is not None else np.random.SeedSequence().entropy)
        self._reseed()

    def _reseed(self):
        """Reset the numpy generator and the Faker instance from ``self.seed``."""
        self.rng = np.random.default_rng(self.seed)
        # Without this, every word-based column would ignore the seed.
        self.faker.seed_instance(self.seed)

    def make(self, rows: int, columns: Union[int, str], index: Union[int, str] = 0,
             col_index: Union[int, str] = 0, missing: float = 0.0) -> pd.DataFrame:
        """
        Generate a test DataFrame with the given specification.

        Args:
            rows: Number of rows.
            columns: Column type spec: a string of type codes, or an int for
                that many ``s3`` string columns.
            index: Index level types (int for RangeIndex or string like 'ti').
            col_index: Column index levels (same format as `index`); an int
                keeps the generated column names.
            missing: Proportion of missing data in each column.

        Returns:
            DataFrame
        """
        self._last_args = dict(rows=rows, columns=columns, index=index, col_index=col_index, missing=missing)
        return self._generate(**self._last_args)

    def another(self, new_seed: bool = True) -> pd.DataFrame:
        """
        Generate another DataFrame with the last parameters.

        Args:
            new_seed: If True, re-randomize the generator seed; otherwise the
                generators simply continue from their current state.

        Returns:
            DataFrame
        """
        if new_seed:
            self.seed = int(np.random.SeedSequence().entropy)
            self._reseed()
        return self._generate(**self._last_args)

    def random(self, index_levels: int = 1, column_levels: int = 1) -> pd.DataFrame:
        """
        Generate a DataFrame with randomly chosen settings.

        Args:
            index_levels: Number of index levels to use.
            column_levels: Number of column MultiIndex levels.

        Returns:
            DataFrame
        """
        rows = int(self.rng.integers(10, 50))
        n_cols = int(self.rng.integers(3, 7))
        col_types = self.rng.choice(['d', 'f', 'i', 's1', 's3', 's7', 'h', 't', 'p'], size=n_cols)
        missing = round(float(self.rng.uniform(0, 0.15)), 2)
        index = ''.join(self.rng.choice(['t', 'd', 'i', 's2'], size=index_levels))
        col_index = ''.join(self.rng.choice(['s', 'i', 'd'], size=column_levels))
        return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)

    def _parse_colspec(self, spec: str) -> list[str]:
        """Split a spec such as ``'s3fid'`` into type codes (``['s3', 'f', 'i', 'd']``)."""
        return re.findall(r's\d+|[a-z]', spec)

    def _generate(self, rows: int, columns: Union[int, str], index: Union[int, str],
                  col_index: Union[int, str], missing: float) -> pd.DataFrame:
        """Build the frame: data columns, then index, column index, missing values."""
        if isinstance(columns, int):
            col_types = ['s3'] * columns
        else:
            col_types = self._parse_colspec(columns)

        colnames = self._make_column_names(len(col_types))
        data = {
            name: self._generate_column(dt, rows) for name, dt in zip(colnames, col_types)
        }
        df = pd.DataFrame(data)
        df.index = self._make_index(index, rows, "i")
        # Only a non-empty string spec replaces the generated column names.
        if isinstance(col_index, str) and col_index:
            df.columns = self._make_index(col_index, len(df.columns), "c")
        df = self._insert_missing(df, missing)
        return df

    def _make_column_names(self, n: int) -> list[str]:
        """Return ``n`` unique column names, padding with ``col_<k>`` if the pool runs dry."""
        if self.colname_words:
            pool = self.colname_words
        else:
            # Draw twice as many words as needed to leave room for duplicates.
            pool = [self.faker.word() for _ in range(n * 2)]
        names = []
        used = set()
        for word in pool:
            if len(names) >= n:
                break
            if word not in used:
                names.append(word)
                used.add(word)
        while len(names) < n:
            names.append(f"col_{len(names)}")
        return names

    def _generate_column(self, dtype: str, n: int) -> pd.Series:
        """
        Generate one column of ``n`` values for the given type code.

        Raises:
            ValueError: if ``dtype`` is not a recognized type code.
        """
        if dtype.startswith('s'):
            max_words = int(dtype[1:]) if len(dtype) > 1 else self.default_word_count
            # int(): hand Faker a plain Python int rather than a numpy scalar.
            return pd.Series([
                " ".join(self.faker.words(int(self.rng.integers(1, max_words + 1))))
                for _ in range(n)
            ])
        if dtype == 'f':
            return pd.Series(self.rng.normal(loc=100, scale=25, size=n))
        if dtype == 'i':
            return pd.Series(self.rng.integers(10**9, 10**12, size=n), dtype='int64')
        if dtype == 'd':
            start_date = self.faker.date_between(start_date='-10y', end_date='today')
            return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
        if dtype == 't':
            start_dt = datetime.now() - timedelta(days=365 * 2)
            return pd.Series([
                start_dt + timedelta(minutes=int(self.rng.integers(0, 2 * 365 * 24 * 60)))
                for _ in range(n)
            ])
        if dtype == 'h':
            # Digests depend only on the row number, so they are the same on every call.
            return pd.Series([
                hashlib.blake2b(f"val{i}".encode(), digest_size=32).hexdigest()
                for i in range(n)
            ])
        if dtype == 'p':
            return pd.Series([str(Path(f"/data/{self.faker.word()}/{i}.dat")) for i in range(n)])
        raise ValueError(f"Unknown dtype: {dtype}")

    def _make_index(self, desc: Union[int, str], n: int, label_prefix: str) -> pd.Index:
        """
        Build an index of length ``n``.

        An int ``desc`` (or a string containing no type codes) gives a
        RangeIndex; otherwise each type code in ``desc`` becomes one level of
        a MultiIndex named ``<label_prefix><level number>``.
        """
        if isinstance(desc, int):
            return pd.RangeIndex(n, name=f"{label_prefix}0")
        # Parse into type codes rather than iterating characters, so that
        # multi-character codes such as 's2' stay intact.
        codes = self._parse_colspec(desc)
        if not codes:
            return pd.RangeIndex(n, name=f"{label_prefix}0")
        levels = [self._generate_column(code, n) for code in codes]
        names = [f"{label_prefix}{j}" for j in range(len(codes))]
        return pd.MultiIndex.from_arrays(levels, names=names)

    def _insert_missing(self, df: pd.DataFrame, prop: float) -> pd.DataFrame:
        """
        Blank out a proportion ``prop`` of the rows in every column, in place.

        At least one and at most all rows are blanked per column; nothing is
        done when ``prop`` is not positive or the frame has no rows. Integer
        columns are converted to float64 so they can hold NaN.
        """
        n_rows = df.shape[0]
        if prop <= 0 or n_rows == 0:
            return df
        n_missing = min(n_rows, max(1, int(np.floor(prop * n_rows))))
        # Address columns by position: label lookup is ambiguous when column
        # labels repeat.
        for pos in range(df.shape[1]):
            column = df.iloc[:, pos]
            if pd.api.types.is_integer_dtype(column.dtype):
                # Writing NaN into an int column relies on implicit upcasting,
                # which pandas has deprecated; convert explicitly first.
                df.isetitem(pos, column.astype("float64"))
            missing_rows = self.rng.choice(n_rows, size=n_missing, replace=False)
            df.iloc[missing_rows, pos] = np.nan
        return df
|
||||
@@ -52,3 +52,7 @@ dev = [
|
||||
"sphinx-toggleprompt",
|
||||
"sphinx-multitoc-numbering"
|
||||
]
|
||||
|
||||
|
||||
[project.scripts]
|
||||
gt = "greater_tables.cli:cli"
|
||||
|
||||
+1295
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user