mirror of
https://github.com/wassname/greater_tables_project.git
synced 2026-06-27 15:00:20 +08:00
4.0 beta; new Fabricator (not plumbed in, pre-delete)
This commit is contained in:
@@ -7,25 +7,22 @@
|
||||
|
||||
## Greater Tables
|
||||
|
||||
Creating presentation quality tables is difficult. `greater_tables` provides
|
||||
a flexible way to create consistent tables in HTML, LaTeX (PDF), and terminal
|
||||
text outputs from Pandas dataframes.
|
||||
It has many options but sensible defaults. It is designed
|
||||
for use in Jupyter Lab and Quarto and will seamlessly return the correct format
|
||||
for each output type. The basic usage is simply:
|
||||
**Greater Tables** is a Python tool for producing high-quality, static display tables—intended for use in journal articles, books, formal reports, and printed financial statements. It turns your pandas DataFrame into a clean, black-and-white table—ready for print, PDF, or web. It produces consistent, typographically sound output in HTML, LaTeX (via TikZ), and plain text.
|
||||
|
||||
It’s opinionated but flexible, with many options and sensible defaults. Designed for use in **Jupyter Lab**, **Quarto**, and scripting environments, it auto-detects the output format and renders accordingly. Display tables are small and focused—the end result of your analysis, after selecting rows and columns, ordering, and labeling. Greater Tables helps you get those raw materials onto the page, cleanly and consistently.
|
||||
|
||||
```python
|
||||
from greater_tables import GT
|
||||
# ...create dataframe df...
|
||||
from greater_tables import GT
|
||||
GT(df)
|
||||
```
|
||||
|
||||
or `display(GT(df))` if called within a Jupyter or Quarto code block. Once created, `GT(df)` is immutable; to change options, re-create it. Presentation tables are small! They fit on one or two pages and, while `GT` does a lot of work to determine formatting options, it still runs very quickly.
|
||||
Or use `display(GT(df))` in notebooks and Quarto documents. Once created, a `GT(df)` object is immutable; re-create it to apply new options. Arguments can be passed directly or loaded from a YAML config file—validated using `pydantic`.
|
||||
|
||||
`greater_tables` provides similar functionality to pandas `to_html`, `to_latex` and `to_markdown`
|
||||
methods, without relying on them, and improves them in various ways. LaTeX output uses Tikz tables for very tight control over layout and grid lines. Arguments can be passed directly or set via a YAML configuration file. Validation is handled by `pydantic`.
|
||||
Greater Tables offers similar functionality to `pandas.DataFrame.to_html`, `to_latex`, and `to_markdown`, but with tighter control, better defaults, and no reliance on `pandas` internals. The LaTeX backend uses TikZ for precise control over layout and grid lines.
|
||||
|
||||
The package is tailored to more austere, black-and-white tables: no sparklines, colors or background shading. Tables can include a simple caption, but not more elaborate headers and footers.
|
||||
This is a tool for serious tables—no sparklines, colors, or shading. Just your data, rendered cleanly.
|
||||
|
||||
Also included: **Fabricator**, a flexible test DataFrame generator—specify row count, index and column hierarchies, data types, missing values, and more.
|
||||
|
||||
|
||||
## Installation
|
||||
@@ -101,77 +98,6 @@ reusable, extensible actuarial tools) in VBA and Python since the late
|
||||
1990s, and call all my macro packages *GREAT*.
|
||||
|
||||
|
||||
## History
|
||||
|
||||
3.3.0
|
||||
-------
|
||||
* Added `tikz_` series of options to config: column and row separation,
|
||||
container_env (for e.g., sidewaystable), hrule and vrule indices.
|
||||
|
||||
3.2.0
|
||||
-------
|
||||
* Added more tex snippets!
|
||||
* Refactored tikz and column width behavior
|
||||
|
||||
3.1.0
|
||||
-------
|
||||
* adjustments for auto format
|
||||
* rearranged gtcore order of methods
|
||||
|
||||
3.0.0
|
||||
-------
|
||||
|
||||
* config files / pydantic config input
|
||||
* unified col width and info dataframe
|
||||
* de-texing
|
||||
* cli for config and writeout a csv etc.
|
||||
|
||||
* testdf suite
|
||||
* Automated TeX to SVG
|
||||
|
||||
2.0.0
|
||||
------
|
||||
|
||||
* **v2.0.0** solid release old-style, all-argument GT
|
||||
* Better column widths
|
||||
* Custom text output
|
||||
* Rich table output
|
||||
|
||||
1.1.1
|
||||
-------
|
||||
* Added logo, updated docs.
|
||||
|
||||
1.1.0
|
||||
------
|
||||
|
||||
* Added ``formatters`` argument to pass in column-specific formatters by name, each given as a number (``n`` converts to ``{x:.nf}``), a format string, or a function
|
||||
* Added ``tabs`` argument to provide column widths
|
||||
* Added ``equal`` argument to provide hint that column widths should all be equal
|
||||
* Added ``caption_align='center'`` argument to set the caption alignment
|
||||
* Added ``large_ok=False`` argument, if ``False`` providing a dataframe with more than 100 rows throws an error. This function is expensive and is designed for small frames.
|
||||
|
||||
1.0.0
|
||||
------
|
||||
|
||||
* Allow input via list of lists, or markdown table
|
||||
* Specify overall float format for whole table
|
||||
* Specify column alignment with 'llrc' style string
|
||||
* ``show_index`` option
|
||||
* Added more tests
|
||||
* Docs updated
|
||||
* Set tabs for width; use of width in HTML format.
|
||||
|
||||
0.6.0
|
||||
------
|
||||
|
||||
* Initial release
|
||||
|
||||
Early development
|
||||
-------------------
|
||||
|
||||
* 0.1.0 - 0.5.0: Early development
|
||||
* tikz code from great.pres_manager
|
||||
|
||||
|
||||
## 📁 Project Layout
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ greater\_tables.gtformats module
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
greater\_tables.hasher module
|
||||
greater\_tables.gthasher module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: greater_tables.hasher
|
||||
@@ -60,18 +60,18 @@ greater\_tables.hasher module
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
greater\_tables.testdf module
|
||||
-----------------------------
|
||||
greater\_tables.gtfabrications module
|
||||
----------------------------------------
|
||||
|
||||
.. automodule:: greater_tables.testdf
|
||||
.. automodule:: greater_tables.gtfabrications
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
greater\_tables.tex\_svg module
|
||||
greater\_tables.gtetcher module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: greater_tables.tex_svg
|
||||
.. automodule:: greater_tables.gtetcher
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
Greater Tables: Purpose and Process
|
||||
=====================================
|
||||
|
||||
1. Introduction and Purpose
|
||||
-------------------------------
|
||||
|
||||
.. raw analysis selected/curated display
|
||||
|
||||
**Greater Tables** is a Python-based tool for producing high-quality, static display tables—intended for use in journal articles, books, formal reports, and printed financial statements. Its focus is on clarity, precision, and typographic consistency in **black-and-white, presentation-grade tables**. If you need visual embellishments like colors, sparklines, or interactive filters, you're looking for a different kind of tool. Greater Tables is about making numbers clear—not making them move.
|
||||
|
||||
The goal is to bridge the final step between analysis and publication: turning structured data into a table that looks right on the page. That means getting the **spacing, alignment, wrapping, and formatting** right so the reader can focus on the content—not fight with the layout.
|
||||
|
||||
It’s important to emphasize that Greater Tables is **not** a charting library or an exploratory data analysis tool. Those serve different purposes. Charts help readers understand patterns and trends—often at the cost of precision. Tables are for **exact values**, **structured comparison**, and **decisions**.
|
||||
|
||||
Greater Tables is built for **static, print-ready output**. This is not a dashboarding tool, and it does not attempt to support interactivity. If you're preparing data for decision-makers, regulators, or publication, Greater Tables helps you present that data with clarity, consistency, and dignity. It turns structured data into final-form tables: faithful to the analysis, respectful of the reader.
|
||||
|
||||
.. admonition:: Charts vs. Tables
|
||||
|
||||
Tables and charts serve different cognitive functions.
|
||||
|
||||
* **Charts** are tools for exploration, discovery, and communication of patterns. They help identify trends and relationships, especially where precision is secondary.
|
||||
* **Tables** are tools for reference, validation, and decision-making. They are used where exact values matter and where the structure is already known.
|
||||
|
||||
You might use a chart to explain seasonal trends to a client. You use a table to calculate bonuses, approve budgets, or submit results to a regulator.
|
||||
|
||||
Greater Tables is for that second case: high-precision, high-trust, high-clarity tabular presentation.
|
||||
|
||||
2. Anatomy of a Table
|
||||
---------------------------
|
||||
|
||||
A well-formed table has three structural components:
|
||||
|
||||
1. **The data**—a rectangular array of values, typically numeric or categorical.
|
||||
2. **Row labels**—one or more leading columns that organize the data by grouping rows, often hierarchically (e.g., country → region → city).
|
||||
3. **Column labels**—a corresponding structure that organizes the columns, such as year → quarter → month.
|
||||
|
||||
Additional elements such as a **title**, **caption**, or **footnotes** may accompany the table but are not considered part of the data itself. Greater Tables encourages separating descriptive or contextual information into captions, preserving the purity of the table structure.
|
||||
|
||||
While most software merges row labels into the data table and treats column labels as headers, Greater Tables gives both equal attention. Each can carry meaningful hierarchy. The distinction between rows and columns is often arbitrary—in many tables, either axis could be transposed without changing the semantics. Layout and print constraints usually determine which becomes which.
|
||||
|
||||
.. admonition:: From Raw Data to Presentation Table
|
||||
|
||||
Before using Greater Tables, the dataset goes through several stages of preparation:
|
||||
|
||||
1. **Raw data**—often includes many columns, internal naming, and structure optimized for computation.
|
||||
2. **Analysis dataset**—cleaned, subsetted, and possibly aggregated; new variables may be computed here.
|
||||
3. **Derived elements**—secondary metrics like change over time, percentages of subtotal, or indexed values. These don't reflect direct observations but are built from them.
|
||||
4. **Pre-presentation table**—a tidy, well-labeled subset of columns, in the desired order and naming scheme.
|
||||
5. **Presentation**—Greater Tables takes this prepped table and handles only layout and typography.
|
||||
|
||||
.. admonition:: Tidy Data
|
||||
|
||||
Greater Tables assumes your table follows the tidy data model:
|
||||
|
||||
* Each column is a variable.
|
||||
* Each row is an observation.
|
||||
* Each cell contains one value.
|
||||
|
||||
Hierarchical data (e.g., region → city, or year → month) is expressed via multiple columns or MultiIndexes. This structure is always **flat in form**, even if hierarchical in meaning.
|
||||
|
||||
.. admonition:: Pandas Index vs. SQL Index
|
||||
|
||||
In pandas, the *Index* is a labeling tool used to identify rows and columns for selection, alignment, and display. It is semantic. In SQL, an *index* is a performance structure used for speeding up lookups. Same word, very different purpose.
|
||||
|
||||
3. Labeling, Formatting, and Layout
|
||||
------------------------------------
|
||||
|
||||
Once a tidy, pre-presentation table is prepared, Greater Tables applies formatting decisions:
|
||||
|
||||
* **Column widths** are calculated from both the data values and the label lengths.
|
||||
* **Alignment** is conventional: text is left-aligned, numbers right-aligned, and dates centered.
|
||||
* **Text wrapping** can be applied, ideally with ragged-right edges and minimal hyphenation.
|
||||
* **Number formatting** supports thousands separators, consistent decimal places, and suppression of floating-point artifacts.
|
||||
* **Monospaced fonts** are recommended for numbers to make differences in magnitude visually apparent.
|
||||
* **Semantic formatting** is encouraged: margins as a percentage of sales, changes over time, or indexed values.
|
||||
|
||||
Greater Tables does **not** modify the structure of the data. It does not sort, filter, rename, or pivot. It simply takes the table and **renders it with typographic precision**, making good decisions about space, alignment, and clarity.
|
||||
|
||||
.. admonition:: Meta-Rows and Derived Columns
|
||||
|
||||
In final tables, it’s common to introduce elements that are not raw observations:
|
||||
|
||||
* **Subtotals** and **grand totals**: rows that summarize other rows.
|
||||
* **Percentage-of-total** or **difference-from-baseline**: columns derived from multiple observations.
|
||||
* **Headers or separators**: rows that serve as group titles.
|
||||
|
||||
These are structural enhancements introduced during presentation, not part of the tidy data itself. Greater Tables renders them as-is, assuming you’ve added them deliberately to improve clarity.
|
||||
|
||||
|
||||
.. admonition:: Time, State, and Change
|
||||
|
||||
Time is a special axis.
|
||||
|
||||
Most datasets report either:
|
||||
|
||||
* **States** at a point in time (e.g., balance, inventory, weight), or
|
||||
* **Changes** over a period (e.g., revenue, flow, weight loss).
|
||||
|
||||
These correspond to **point-in-time** vs. **over-time** measurements. Both can be stored in tidy format, but mixing them requires care. Their time labels mean different things.
|
||||
|
||||
Other axes (e.g., elevation vs. change in elevation, treatment A vs. B, document version X vs. Y) follow similar logic, but time remains the most natural and universal case. Derived quantities like differences or growth rates reflect *relationships* across time, not independent observations. They must be labeled accordingly.
|
||||
|
||||
The analogy to **balance sheet vs. income statement** is instructive:
|
||||
|
||||
* A balance sheet gives a snapshot at a single date.
|
||||
* An income statement measures change between two dates.
|
||||
* Both are valid, but they require different structures—and cannot be naively combined.
|
||||
|
||||
4. Output Consistency Across Formats
|
||||
----------------------------------------
|
||||
|
||||
Greater Tables produces tables in three formats:
|
||||
|
||||
* **Text** (for console or plain-text rendering),
|
||||
* **HTML** (for web or rich email output),
|
||||
* **TeX/PDF** (for inclusion in LaTeX documents).
|
||||
|
||||
Each output is designed to preserve the structure, alignment, and formatting choices defined by the table's metadata. The layout engine adapts to the output medium but **never alters the underlying table**. This ensures visual consistency across formats.
|
||||
|
||||
Each backend respects:
|
||||
|
||||
* Column widths and wrapping constraints,
|
||||
* Font styles (monospaced for numbers),
|
||||
* Multi-level headers and index structure,
|
||||
* Alignment and spacing.
|
||||
|
||||
This guarantees that the same table, rendered in multiple formats, carries the same logic, appearance, and communicative power.
|
||||
+20
-32
@@ -8,38 +8,32 @@ Welcome to greater_tables's documentation!
|
||||
|
||||
|image1| |image2| |image3|
|
||||
|
||||
Greater Tables
|
||||
--------------
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Contents:
|
||||
|
||||
Creating presentation quality tables is difficult. ``greater_tables``
|
||||
provides a flexible way to create consistent tables in HTML, LaTeX
|
||||
(PDF), and terminal text outputs from Pandas dataframes. It has many
|
||||
options but sensible defaults. It is designed for use in Jupyter Lab and
|
||||
Quarto and will seamlessly return the correct format for each output
|
||||
type. The basic usage is simply:
|
||||
homepage
|
||||
versions
|
||||
greater_tables
|
||||
greater_tables.data
|
||||
|
||||
.. code:: python
|
||||
|
||||
from greater_tables import GT
|
||||
# ...create dataframe df...
|
||||
GT(df)
|
||||
**Greater Tables** is a Python tool for producing high-quality, static display tables—intended for use in journal articles, books, formal reports, and printed financial statements. It turns your pandas DataFrame into a clean, black-and-white table—ready for print, PDF, or web. It produces consistent, typographically sound output in HTML, LaTeX (via TikZ), and plain text.
|
||||
|
||||
or ``display(GT(df))`` if called within a Jupyter or Quarto code block.
|
||||
Once created ``GT(df)`` is immutable; to change options re-create.
|
||||
Presentation tables are small! They fit on one or two pages and, while
|
||||
``GT`` does a lot of work to determine formatting options, it still runs
|
||||
very quickly.
|
||||
It’s opinionated but flexible, with many options and sensible defaults. Designed for use in **Jupyter Lab**, **Quarto**, and scripting environments, it auto-detects the output format and renders accordingly. Display tables are small and focused—the end result of your analysis, after selecting rows and columns, ordering, and labeling. Greater Tables helps you get those raw materials onto the page, cleanly and consistently.
|
||||
|
||||
``greater_tables`` provides similar functionality to pandas ``to_html``,
|
||||
``to_latex`` and ``to_markdown`` methods, without relying on them, and
|
||||
improves them in various ways. LaTeX output uses Tikz tables for very
|
||||
tight control over layout and grid lines. Arguments can be passed
|
||||
directly or set via a YAML configuration file. Validation is handled by
|
||||
``pydantic``.
|
||||
```python
|
||||
from greater_tables import GT
|
||||
GT(df)
|
||||
```
|
||||
|
||||
The package is tailored to more austere, black-and-white tables: no
|
||||
sparklines, colors or background shading. Tables can include a simple
|
||||
caption, but not more elaborate headers and footers.
|
||||
Or use ``display(GT(df))`` in notebooks and Quarto documents. Once created, a ``GT(df)`` object is immutable; re-create it to apply new options. Arguments can be passed directly or loaded from a YAML config file—validated using ``pydantic``.
|
||||
|
||||
Greater Tables offers similar functionality to ``pandas.DataFrame.to_html``, ``to_latex``, and ``to_markdown``, but with tighter control, better defaults, and no reliance on ``pandas`` internals. The LaTeX backend uses TikZ for precise control over layout and grid lines.
|
||||
|
||||
This is a tool for serious tables—no sparklines, colors, or shading. Just your data, rendered cleanly.
|
||||
|
||||
Also included: **Fabricator**, a flexible test DataFrame generator—specify row count, index and column hierarchies, data types, missing values, and more.
|
||||
|
||||
Installation
|
||||
------------
|
||||
@@ -126,12 +120,6 @@ reusable, extensible actuarial tools) in VBA and Python since the late
|
||||
1990s, and call all my macro packages *GREAT*.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
greater_tables
|
||||
greater_tables.data
|
||||
|
||||
|
||||
Indices and tables
|
||||
|
||||
@@ -0,0 +1,78 @@
|
||||
Versions
|
||||
==========
|
||||
|
||||
4.0.0
|
||||
-------
|
||||
* Moved constants out of magic strings and into config
|
||||
* Renamed: gtetcher and `Etcher` class, gtconfig and `Configurator`, gtfabrications and `Fabricator`
|
||||
* Changed argument names in `Fabricator` to align more closely with dataframe and data terminology
|
||||
* Structuring docs
|
||||
|
||||
3.3.0
|
||||
-------
|
||||
* Added `tikz_` series of options to config: column and row separation,
|
||||
container_env (for e.g., sidewaystable), hrule and vrule indices.
|
||||
|
||||
3.2.0
|
||||
-------
|
||||
* Added more tex snippets!
|
||||
* Refactored tikz and column width behavior
|
||||
|
||||
3.1.0
|
||||
-------
|
||||
* adjustments for auto format
|
||||
* rearranged gtcore order of methods
|
||||
|
||||
3.0.0
|
||||
-------
|
||||
|
||||
* config files / pydantic config input
|
||||
* unified col width and info dataframe
|
||||
* de-texing
|
||||
* cli for config and writeout a csv etc.
|
||||
|
||||
* testdf suite
|
||||
* Automated TeX to SVG
|
||||
|
||||
2.0.0
|
||||
------
|
||||
|
||||
* **v2.0.0** solid release old-style, all-argument GT
|
||||
* Better column widths
|
||||
* Custom text output
|
||||
* Rich table output
|
||||
|
||||
1.1.1
|
||||
-------
|
||||
* Added logo, updated docs.
|
||||
|
||||
1.1.0
|
||||
------
|
||||
|
||||
* Added ``formatters`` argument to pass in column-specific formatters by name, each given as a number (``n`` converts to ``{x:.nf}``), a format string, or a function
|
||||
* Added ``tabs`` argument to provide column widths
|
||||
* Added ``equal`` argument to provide hint that column widths should all be equal
|
||||
* Added ``caption_align='center'`` argument to set the caption alignment
|
||||
* Added ``large_ok=False`` argument, if ``False`` providing a dataframe with more than 100 rows throws an error. This function is expensive and is designed for small frames.
|
||||
|
||||
1.0.0
|
||||
------
|
||||
|
||||
* Allow input via list of lists, or markdown table
|
||||
* Specify overall float format for whole table
|
||||
* Specify column alignment with 'llrc' style string
|
||||
* ``show_index`` option
|
||||
* Added more tests
|
||||
* Docs updated
|
||||
* Set tabs for width; use of width in HTML format.
|
||||
|
||||
0.6.0
|
||||
------
|
||||
|
||||
* Initial release
|
||||
|
||||
Early development
|
||||
-------------------
|
||||
|
||||
* 0.1.0 - 0.5.0: Early development
|
||||
* tikz code from great.pres_manager
|
||||
@@ -1,7 +1,10 @@
|
||||
__version__ = '3.3.0'
|
||||
__version__ = '4.0.0'
|
||||
__project__ = 'greater_tables'
|
||||
__author__ = 'Stephen J Mildenhall'
|
||||
|
||||
from . gtcore import GT
|
||||
from . gtfabrications import Fabricator
|
||||
from . gtetcher import Etcher
|
||||
|
||||
# from . gtbreaks import Breakability
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import click
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from .gtconfig import GTConfigModel, write_template
|
||||
from .gtconfig import Configurator, write_template
|
||||
from .gtcore import GT
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ def render(input_file, output, format, config):
|
||||
else:
|
||||
raise click.UsageError(f"Unsupported extension: {ext}")
|
||||
|
||||
cfg = GTConfigModel(Path(config) if config else None).get()
|
||||
cfg = Configurator(Path(config) if config else None).get()
|
||||
gt = GT(df, config=cfg)
|
||||
|
||||
rendered = (
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
Configuration model and utilities for GreaterTables.
|
||||
|
||||
Defines the `GTConfigModel` schema using Pydantic, which acts as the single
|
||||
Defines the `Configurator` schema using Pydantic, which acts as the single
|
||||
source of truth for default values, validation, and structure of all table-rendering options.
|
||||
|
||||
Also includes functions for writing editable config templates and loading from YAML.
|
||||
@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, ValidationError, ConfigDict
|
||||
import yaml
|
||||
|
||||
|
||||
class GTConfigModel(BaseModel):
|
||||
class Configurator(BaseModel):
|
||||
"""
|
||||
Configuration model for GreaterTables.
|
||||
|
||||
@@ -29,8 +29,8 @@ class GTConfigModel(BaseModel):
|
||||
|
||||
:Usage:
|
||||
|
||||
>>> from greater_tables.gtconfig import GTConfigModel
|
||||
>>> cfg = GTConfigModel(font_size="1.2em", caption_align="left")
|
||||
>>> from greater_tables.gtconfig import Configurator
|
||||
>>> cfg = Configurator(font_size="1.2em", caption_align="left")
|
||||
|
||||
:see also: ``GTConfig`` for loading from YAML with overrides.
|
||||
``gt write-template`` CLI command to generate a default config file.
|
||||
@@ -197,7 +197,7 @@ class GTConfigModel(BaseModel):
|
||||
def write_template(path: Path):
|
||||
"""Generate a clean default config file at the given path."""
|
||||
path = Path(path)
|
||||
cfg = GTConfigModel()
|
||||
cfg = Configurator()
|
||||
yaml_str = yaml.dump(cfg.model_dump(), sort_keys=False)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(yaml_str, encoding="utf-8")
|
||||
|
||||
@@ -34,9 +34,9 @@ from IPython.display import display, SVG
|
||||
|
||||
from . gtenums import Breakability, Alignment
|
||||
from . gtformats import GT_Format, TableFormat, Line, DataRow
|
||||
from . gtconfig import GTConfigModel
|
||||
from . hasher import df_short_hash
|
||||
from . tex_svg import TikzProcessor
|
||||
from . gtconfig import Configurator
|
||||
from . gthasher import df_short_hash
|
||||
from . gtetcher import Etcher
|
||||
|
||||
# opt in to pandas' future behavior: disable silent downcasting
|
||||
pd.set_option('future.no_silent_downcasting', True)
|
||||
@@ -242,7 +242,7 @@ class GT(object):
|
||||
raw_cols=None,
|
||||
show_index=True,
|
||||
#
|
||||
config: GTConfigModel | None = None,
|
||||
config: Configurator | None = None,
|
||||
config_path: Path | None = None,
|
||||
**overrides,
|
||||
):
|
||||
@@ -255,18 +255,18 @@ class GT(object):
|
||||
elif config_path:
|
||||
try:
|
||||
raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
|
||||
base_config = GTConfigModel.model_validate(raw)
|
||||
base_config = Configurator.model_validate(raw)
|
||||
except (ValidationError, OSError) as e:
|
||||
raise ValueError(
|
||||
f"Failed to load config from {config_path}") from e
|
||||
else:
|
||||
base_config = GTConfigModel()
|
||||
base_config = Configurator()
|
||||
|
||||
# access through config
|
||||
# update and validate; need to merge to avoid repeated args
|
||||
# merged = dict(base_config.model_dump(), **overrides)
|
||||
merged = base_config.model_dump() | overrides
|
||||
self.config = GTConfigModel(**merged)
|
||||
self.config = Configurator(**merged)
|
||||
# no validation
|
||||
# self.config = base_config.model_copy(update=overrides)
|
||||
|
||||
@@ -2812,7 +2812,7 @@ class GT(object):
|
||||
|
||||
def make_svg(self):
|
||||
"""Render tikz into svg text."""
|
||||
tz = TikzProcessor(self._repr_latex_(),
|
||||
tz = Etcher(self._repr_latex_(),
|
||||
file_name=self.df_id, debug=self.config.debug)
|
||||
p = tz.file_path.with_suffix('.svg')
|
||||
if not p.exists():
|
||||
|
||||
@@ -11,10 +11,10 @@ from pathlib import Path
|
||||
from subprocess import run, Popen, PIPE
|
||||
from IPython.display import SVG, display
|
||||
|
||||
from .hasher import txt_short_hash
|
||||
from .gthasher import txt_short_hash
|
||||
|
||||
|
||||
class TikzProcessor:
|
||||
class Etcher:
|
||||
"""Create PDF and SVG files from Tikz blocks."""
|
||||
# Full TeX preamble to generate a .fmt if needed
|
||||
_tex_template_full = r"""\documentclass[10pt, border=5mm]{standalone}
|
||||
@@ -66,7 +66,7 @@ class TikzProcessor:
|
||||
"""Create format file for faster compilation if missing."""
|
||||
if self.format_file.exists():
|
||||
return
|
||||
print('TikzProcessor: building TeX format fmt file...', end ='')
|
||||
print('Etcher: building TeX format fmt file...', end ='')
|
||||
tmp = self.out_path / 'tikz_format.tex'
|
||||
tmp.write_text(self._tex_template_full, encoding='utf-8')
|
||||
cmd = [
|
||||
@@ -0,0 +1,417 @@
|
||||
"""
|
||||
Fabricate dataframes for testing.
|
||||
"""
|
||||
|
||||
from collections import deque
|
||||
from datetime import datetime, timedelta
|
||||
from importlib.resources import files
|
||||
from itertools import cycle, chain, count, zip_longest, product, islice
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
import hashlib
|
||||
import random
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from IPython.display import display
|
||||
|
||||
|
||||
class Fabricator:
|
||||
"""
|
||||
Fabricate dataframes.
|
||||
"""
|
||||
|
||||
metric_roots = ['absorption', 'acceleration', 'account', 'activation', 'adjustment', 'allocation', 'amplitude', 'approval', 'asset', 'atom', 'attrition', 'balance', 'band', 'binding', 'cancellation', 'capacitance', 'capital', 'cashflow', 'category', 'cell', 'charge', 'claim', 'commission', 'compound', 'concentration', 'conductivity', 'constraint', 'consumption', 'conversion', 'correlation', 'cost', 'count', 'coverage', 'credit', 'current', 'debt', 'decay', 'decibel', 'deductible', 'deficit', 'deflator', 'demand', 'density', 'development', 'diffusion', 'discount', 'distribution', 'dividend', 'dose', 'duration', 'earnings', 'efficiency', 'elasticity', 'employment', 'energy', 'entropy', 'enzyme', 'estimate', 'excess', 'exhaustion', 'expense', 'exposure', 'failure', 'field', 'flux', 'force', 'frequency', 'funding', 'gdp', 'gene', 'gradient', 'growth', 'half_life', 'incidence', 'income', 'index', 'indicator', 'inequality', 'inflation', 'inhibition', 'input', 'intensity', 'investment', 'kurtosis', 'lapse', 'layer', 'leverage', 'liability', 'limit', 'loss', 'luminosity', 'margin', 'mass', 'molecule', 'momentum', 'mortality', 'neutron', 'noise', 'operating', 'output', 'penalty', 'photon', 'policy', 'portfolio', 'potential', 'power', 'preference', 'premium', 'pressure', 'price', 'productivity', 'profit', 'protein', 'proton', 'provision', 'radiation', 'rate', 'ratio', 'reaction', 'recovery', 'reflection', 'refraction', 'renewal', 'reserve', 'residual', 'resistance', 'return', 'revenue', 'risk', 'sample', 'savings', 'scenario', 'score', 'sector', 'settlement', 'severity', 'shock', 'shortfall', 'signal', 'skewness', 'spread', 'strain', 'stress', 'subsidy', 'supply', 'tail', 'tariff', 'tax', 'temperature', 'tension', 'term', 'threshold', 'trade', 'trend', 'turbulence', 'unemployment', 'uptake', 'utility', 'utilization', 'valuation', 'variance', 'velocity', 'viscosity', 'volatility', 'voltage', 'volume', 'wage', 'wavelength', 'wealth', 'weight', 'yield']
|
||||
|
||||
metric_suffix = ["", "rate", "score", "amount", "index", "ratio", "factor", "value"]
|
||||
|
||||
def __init__(self, seed: Optional[int] = None):
|
||||
"""
|
||||
Fabricate small synthetic pandas DataFrames for testing.
|
||||
|
||||
Attributes:
|
||||
seed: Optional random seed. If None, one is generated.
|
||||
"""
|
||||
self._last_args = {}
|
||||
self.seed = int(
|
||||
seed if seed is not None else np.random.SeedSequence().entropy)
|
||||
|
||||
# rng
|
||||
self.rng = np.random.default_rng(self.seed)
|
||||
|
||||
# word list for names of index levels
|
||||
nwl = self.metric_roots[:]
|
||||
self.rng.shuffle(nwl)
|
||||
self._metric_namer = cycle(nwl)
|
||||
|
||||
# read words and create cycler
|
||||
data_path = files('greater_tables').joinpath('data', 'words-12.md')
|
||||
with data_path.open('r', encoding='utf-8') as f:
|
||||
txt = f.read()
|
||||
word_list = txt.split('\n')
|
||||
temp = word_list[:]
|
||||
self.rng.shuffle(temp)
|
||||
self._word_gen = cycle(temp)
|
||||
|
||||
# read tex expressions and create cycler
|
||||
data_path = files('greater_tables').joinpath('data', 'tex_list.csv')
|
||||
with data_path.open('r', encoding='utf-8') as f:
|
||||
tex_list = pd.read_csv(f, index_col=0)['expr'].to_list()
|
||||
# trim down slightly
|
||||
pat = re.compile(r'(?<!\\)\b[a-z]{4,}\b')
|
||||
tex_list = [i for i in tex_list if not pat.search(i) and len(i)<=50]
|
||||
self.rng.shuffle(tex_list)
|
||||
self._tex_gen = cycle(tex_list)
|
||||
|
||||
self.simple_namer = {
|
||||
'd': 'date',
|
||||
'f': 'float',
|
||||
'h': 'hash',
|
||||
'i': 'integer',
|
||||
'l': 'large_float',
|
||||
'm': 'yr-mo',
|
||||
'p': 'path',
|
||||
'r': 'ratio',
|
||||
's': 'string',
|
||||
't': 'time',
|
||||
'v': 'extreme_float',
|
||||
'x': 'tex',
|
||||
'y': 'year',
|
||||
}
|
||||
|
||||
# lengths of index (word count) sampled from:
|
||||
self.index_value_lengths = [1]*10 + [2] * 4 + [3]
|
||||
self.cache = deque(maxlen=10)
|
||||
|
||||
@staticmethod
def roll_columns(df, levels=-1):
    """
    "Roll" the column MultiIndex round by levels and sort the columns.

    The default (-1) makes the top level the bottom one and moves the
    rest up. The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame, normally with MultiIndex columns.
        levels: shift passed to ``np.roll`` over the level numbers.

    Returns:
        A new DataFrame with reordered column levels, sorted on axis 1.
        For single-level columns there is nothing to roll, so the frame
        is only sorted.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df.sort_index(axis=1)
    order = np.roll(range(df.columns.nlevels), levels).tolist()
    # shallow copy: re-labelling it must not re-label the caller's frame
    rolled = df.copy(deep=False)
    rolled.columns = df.columns.reorder_levels(order)
    return rolled.sort_index(axis=1)
|
||||
|
||||
def uber(self, rows, data_spec, *, index_levels=1, index_names=None, column_groups=1, column_levels=1, column_names=None, decorate=False, simplify=True, oversample=1):
    """
    Fabricate a dataframe.

    Data types (codes used in ``data_spec``, one code per metric)

        d   date
        f   float
        h   hash
        i   integer
        l   log float (greater range than float)
        m   year - month
        p   path (filename)
        r   ratio (smaller floats, for percents)
        sx  string length x
        t   time
        v   very large range float
        x   tex text - an equation
        y   year

    NOTE(review): ``m`` is listed here but ``_generate_column`` has no
    branch for it, so it currently raises ``ValueError``.

    metrics = number of codes in ``data_spec``;
    total num cols = metrics x column_groups

    Args:
        rows: number of rows.
        data_spec: string of type codes, e.g. ``'fis3'``.
        index_levels: number of (string valued) row index levels.
        index_names: optional level names, length ``index_levels``;
            defaults to ``i_0, i_1, ...``.
        column_groups: number of column groups; each group gets one
            column per metric.
        column_levels: number of column levels above the ``metric`` level.
        column_names: optional names for those levels, length
            ``column_levels``; defaults to ``c_0, c_1, ...``.
        decorate: accepted and passed through; not otherwise used here.
        simplify: if True, drop index/column levels holding one value.
        oversample: if > 1, fabricate ``oversample * rows`` rows and keep
            the first ``rows``.

    Returns:
        DataFrame

    Raises:
        AssertionError: if the level counts and name lists disagree.
    """
    # validate args
    assert column_levels <= column_groups, 'Column levels must be <= groups'
    assert index_names is None or len(index_names) == index_levels, 'Index names must have length index_levels'
    assert column_names is None or len(column_names) == column_levels, 'Column names must have length column_levels'

    if oversample > 1:
        # Recurse with the *unparsed* spec: the signature takes exactly
        # two positionals (rows, data_spec) and the callee parses the spec
        # itself. simplify is forwarded so the caller's choice is honoured.
        df = self.uber(oversample * rows, data_spec, index_levels=index_levels,
                       index_names=index_names, column_groups=column_groups, column_levels=column_levels,
                       column_names=column_names, decorate=decorate, simplify=simplify, oversample=1)
        return df.iloc[:rows, :]

    # figure data_spec and hence (important) number of metrics
    data_spec = self._parse_colspec(data_spec)
    metrics = len(data_spec)

    inames = index_names or [f'i_{i}' for i in range(index_levels)]
    index_pfp = self.primes_for_product(rows, index_levels)
    index_product = product(*(self._generate_column('s', v) for v in index_pfp))
    # islice takes the first rows combinations without building them all
    index = pd.MultiIndex.from_tuples(islice(index_product, rows), names=inames)

    # create with col groups and drop later if needed
    # list(...) so a tuple of column_names can be extended too
    cnames = list(column_names or [f'c_{i}' for i in range(column_levels)]) + ['metric']
    columns_pfp = self.primes_for_product(column_groups, column_levels)
    cgroup_product = product(*(self._generate_column('s', v) for v in columns_pfp))
    # take first column_groups entries - islice works without creating the full iterable
    cgroup_product = islice(cgroup_product, column_groups)
    # add metrics
    metric_names = [self.metric_name for _ in range(metrics)]
    cgroup_product = product(cgroup_product, metric_names)
    # flatten
    cgroup_product = [(*x, y) for x, y in cgroup_product]
    columns = pd.MultiIndex.from_tuples(cgroup_product, names=cnames)

    # create empty df
    df = pd.DataFrame(index=index, columns=columns)

    # NOTE(review): df.shape[1] is groups x metrics whereas prod(columns_pfp)
    # only bounds the number of groups - confirm this is the intended check.
    if df.shape[1] != prod(columns_pfp):
        print("Incomplete column...won't unstack")
        print(df.shape[1], prod(columns_pfp ))

    # fill in the data, data_spec x column_groups groups
    for c, dt in zip(df.columns, data_spec * column_groups):
        df[c] = self._generate_column(dt, rows).values

    if simplify:
        df = self.drop_singleton_levels(df)

    return df
|
||||
|
||||
@staticmethod
def drop_singleton_levels(df):
    """
    Drop MultiIndex levels (rows and columns) that hold a single value.

    If every level of an axis is a singleton the last level is kept,
    because pandas refuses to remove all levels of an index. The input
    frame is not modified.

    Args:
        df: DataFrame to simplify.

    Returns:
        A new DataFrame (sharing data with ``df``) with the simplified
        index and columns. Axes that are not a MultiIndex are unchanged.
    """
    def _prune(idx):
        # plain Index objects have no levels to drop
        if not isinstance(idx, pd.MultiIndex):
            return idx
        singles = [i for i, lvl in enumerate(idx.levels) if len(lvl) == 1]
        if len(singles) == idx.nlevels:
            # all levels are singletons: keep the innermost one
            singles = singles[:-1]
        return idx.droplevel(singles) if singles else idx

    out = df.copy(deep=False)
    out.index = _prune(df.index)
    out.columns = _prune(df.columns)
    return out
|
||||
|
||||
# def make(self, rows: int, columns: Union[int, str], index: Union[int, str] = 0,
|
||||
# col_index: Union[int, str] = 0, missing: float = 0.0) -> pd.DataFrame:
|
||||
# """
|
||||
# Generate a test DataFrame with the given specification.
|
||||
|
||||
|
||||
# Args:
|
||||
# rows: Number of rows.
|
||||
# columns: Column type spec (int for all float cols, or string type codes).
|
||||
# index: Index level types (int for RangeIndex or string like 'ti').
|
||||
# col_index: Column index levels (same format as `index`).
|
||||
# missing: Proportion of missing data in each column.
|
||||
|
||||
# Returns:
|
||||
# DataFrame
|
||||
# """
|
||||
# self._last_args = dict(rows=rows, columns=columns,
|
||||
# index=index, col_index=col_index, missing=missing)
|
||||
# return self._generate(**self._last_args)
|
||||
|
||||
# def another(self) -> pd.DataFrame:
|
||||
# """
|
||||
# Generate another DataFrame with the last parameters.
|
||||
|
||||
# Returns:
|
||||
# DataFrame
|
||||
# """
|
||||
# return self._generate(**self._last_args)
|
||||
|
||||
# def random(self, index_levels: int = 0, column_levels: int = 0, omit: str = 'p') -> pd.DataFrame:
|
||||
# """
|
||||
# Generate a DataFrame with randomly chosen settings.
|
||||
|
||||
# Args:
|
||||
# index_levels: Number of index levels to use.
|
||||
# column_levels: Number of column MultiIndex levels.
|
||||
# omit: omit column datatypes in omit
|
||||
# Returns:
|
||||
# DataFrame
|
||||
# """
|
||||
# if index_levels == 0:
|
||||
# index_levels = int(self.choice([1, 2, 3], p=(.6, .3, .1)))
|
||||
# if column_levels == 0:
|
||||
# column_levels = int(self.choice([1, 2, 3], p=(.5, .3, .2)))
|
||||
# rows = self.rng.integers(5 * index_levels, 10 * index_levels)
|
||||
# valid_types = [i for i in ['d', 'f', 'i', 's3', 'l', 'h', 't', 'p', 'x', 'r', 'y'] if i not in omit]
|
||||
# col_types = self.rng.choice(
|
||||
# valid_types, size=self.rng.integers(3, 7))
|
||||
# missing = round(float(self.rng.uniform(0, 0.15)), 2)
|
||||
# index = ''.join(self.rng.choice(
|
||||
# ['t', 'd', 'y', 'i', 's2'], size=index_levels))
|
||||
# col_index = ''.join(self.rng.choice(
|
||||
# ['s', 's2', 's2', 's3'], size=column_levels))
|
||||
# return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)
|
||||
|
||||
def _parse_colspec(self, spec: str) -> list[str]:
    """Split a compact type spec into codes: ``s`` plus digits, or one lowercase letter."""
    code_pattern = re.compile(r's\d+|[a-z]')
    return code_pattern.findall(spec)
|
||||
|
||||
# def _generate_column_ex(self, dtype: str, n: int, d: int = 0, r: int = 0) -> pd.Series:
|
||||
# """
|
||||
# Generate a sample of n values from d distinct values of dtype repeated r times
|
||||
|
||||
# If d == 0 generate unique values.
|
||||
# """
|
||||
# if d == 0:
|
||||
# return self._generate_column(dtype, n)
|
||||
# assert d > 0
|
||||
# base = self._generate_column(dtype, d)
|
||||
# return pd.Series(np.repeat(base.values, r)[:n])
|
||||
|
||||
def _generate_column(self, dtype: str, n: int) -> pd.Series:
    """
    Generate a sample of n values of dtype.

    All randomness comes from ``self.rng`` so that a seeded instance
    produces the same draws. Date ('d') and time ('t') columns are still
    anchored on the current clock time.

    Args:
        dtype: type code: ``s`` or ``s<k>`` (k words per value), ``f``,
            ``r``, ``l``, ``v``, ``i``, ``d``, ``y``, ``t``, ``h``, ``p``
            or ``x``.
        n: number of values.

    Returns:
        pd.Series of length n.

    Raises:
        ValueError: for an unknown type code.
    """
    if dtype.startswith('s'):
        max_words = int(dtype[1:]) if len(dtype) > 1 else 1
        return pd.Series([" ".join(self.word() for _ in range(max_words)) for _ in range(n)])
    if dtype == 'f':
        return pd.Series(self.rng.normal(loc=100000, scale=250000, size=n))
    if dtype == 'r':
        return pd.Series(self.rng.normal(loc=0.5, scale=0.35, size=n))
    if dtype == 'l':
        # log float (greater range)
        scale = 10.
        return pd.Series(np.exp(self.rng.normal(loc=-scale**2 / 2 + 15, scale=scale, size=n)))
    if dtype == 'v':
        # log float (greater range)
        sc = 5
        return pd.Series(np.exp(self.rng.normal(loc=-sc**2 / 2 + 10, scale=sc, size=n)))
    if dtype == 'i':
        return pd.Series(self.rng.integers(-1e4, 1e6, size=n), dtype='int64')
    if dtype == 'd':
        # consecutive days starting 0..3650 days before today; the offset
        # is drawn from self.rng rather than the unseeded global random
        days_back = int(self.rng.integers(0, 10 * 365 + 1))
        start_date = pd.Timestamp(datetime.today() - timedelta(days=days_back))
        return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
    if dtype == 'y':
        years = np.arange(1990, 2031)
        # distinct years while the pool lasts; beyond that repeats are
        # unavoidable, so sample with replacement instead of failing
        return pd.Series(self.rng.choice(years, size=n, replace=n > len(years)))
    if dtype == 't':
        start_dt = datetime.now() - timedelta(days=365 * 2)
        return pd.Series([
            start_dt +
            timedelta(minutes=int(self.rng.integers(0, 2 * 365 * 24 * 60)))
            for _ in range(n)
        ])
    if dtype == 'h':
        return pd.Series([
            hashlib.blake2b(f"val{i}".encode(), digest_size=32).hexdigest()
            for i in range(n)
        ])
    if dtype == 'p':
        return pd.Series([str(Path(f"/data/{self.word()}/{i}.dat")) for i in range(n)])
    if dtype == 'x':
        # tex
        return pd.Series([self.tex() for _ in range(n)])
    raise ValueError(f"Unknown dtype: {dtype}")
|
||||
|
||||
# def _make_index(self, desc: Union[int, str, list[str]], n: int) -> pd.Index:
|
||||
# if isinstance(desc, int):
|
||||
# return pd.RangeIndex(n, name=self.index_name())
|
||||
# if isinstance(desc, str):
|
||||
# desc = self._parse_colspec(desc)
|
||||
# if len(desc) == 1:
|
||||
# if desc[0] == 'i':
|
||||
# return pd.RangeIndex(n, name=self.index_name())
|
||||
# elif desc[0] in ('d', 't', 'x', 'y'):
|
||||
# vals = self._generate_column(desc[0], n)
|
||||
# return pd.Index(vals, name=self.index_name())
|
||||
# elif not all(i[0] == 's' for i in desc):
|
||||
# raise ValueError(
|
||||
# f'Inadmissible index spec: only string, int, and date types allowed, not {desc}.')
|
||||
# level_value_lengths = [1 if len(i) == 1 else int(i[1:]) for i in desc]
|
||||
# return self.make_index(rows=n, levels=len(desc), level_value_lengths=level_value_lengths,
|
||||
# p0=1, padding=2)
|
||||
|
||||
@property
def metric_name(self):
    """Return the next one-word metric name; each access advances the cycle."""
    namer = self._metric_namer
    return next(namer)
|
||||
|
||||
def word(self):
    """Return the next word from the shuffled word cycle (repeats eventually)."""
    source = self._word_gen
    return next(source)
|
||||
|
||||
def tex(self):
    """Return the next TeX snippet from the shuffled cycle."""
    source = self._tex_gen
    return next(source)
|
||||
|
||||
@staticmethod
def random_date_within_last_n_years(n: int) -> pd.Timestamp:
    """
    Return the current timestamp moved back a random whole number of days.

    The number of days is drawn with ``random.randint`` from 0 to
    ``n * 365`` inclusive (global ``random`` state, not ``self.rng``).
    """
    anchor = datetime.today()
    offset = timedelta(days=random.randint(0, n * 365))
    return pd.Timestamp(anchor - offset)
|
||||
|
||||
def _insert_missing(self, df: pd.DataFrame, prop: float) -> pd.DataFrame:
    """
    Insert missing values into dataframe (in place) and return it.

    For ``prop > 0`` every column gets ``floor(prop * rows)`` NaNs, at
    least one and at most one per row, at row positions drawn
    independently per column from ``self.rng``.

    Args:
        df: frame to modify.
        prop: proportion of rows to blank in each column; values <= 0
            leave the frame untouched.

    Returns:
        The same DataFrame object.
    """
    n_rows = df.shape[0]
    if prop <= 0 or n_rows == 0:
        # nothing requested, or no rows to blank out
        return df
    # clamp so that prop > 1 cannot ask for more rows than exist
    n_missing = min(n_rows, max(1, int(np.floor(prop * n_rows))))
    # address columns by position: unambiguous even with duplicate labels
    for col_pos in range(df.shape[1]):
        missing_indices = self.rng.choice(
            n_rows, size=n_missing, replace=False)
        df.iloc[missing_indices, col_pos] = np.nan
    return df
|
||||
|
||||
@staticmethod
def _is_prime(p: int) -> bool:
    """Return True if p is prime, by trial division with odd divisors up to sqrt(p)."""
    if p == 2:
        return True
    if p < 2 or p % 2 == 0:
        return False
    limit = int(p**0.5) + 1
    return all(p % divisor != 0 for divisor in range(3, limit, 2))
|
||||
|
||||
@staticmethod
def _next_prime(p: int) -> int:
    """Return the smallest prime strictly greater than p (2 for any p < 2)."""
    if p < 2:
        return 2
    # first odd number above p
    candidate = p + 1 if p % 2 == 0 else p + 2
    while not Fabricator._is_prime(candidate):
        candidate += 2
    return candidate
|
||||
|
||||
def primes_for_product(self, n: int, v: int, shuffle: bool = False) -> list[int]:
    """
    Return a list of v distinct primes whose product is >= n.

    The primes are the number of distinct values to generate per level
    so that v levels can label at least n rows (or column groups).

    Args:
        n: required minimum product.
        v: number of factors (levels).
        shuffle: if True, shuffle the result with ``self.rng``;
            otherwise it is in ascending order.

    Returns:
        List of v ints. For ``n == 1`` the answer is ``[1] * v``: one
        value per level, so callers still get one entry per level.
    """
    if n == 1:
        # still want it to work for n = 1
        return [1] * v
    # start the search near the v-th root of n
    p0 = max(1, int(n ** (1 / (v))))
    primes = []
    p = Fabricator._next_prime(max(p0 - 1, 1))
    while len(primes) < v:
        primes.append(p)
        p = Fabricator._next_prime(p)

    while prod(primes := sorted(primes)) < n:
        # increase one level until product is high enough
        p = Fabricator._next_prime(primes[-1])
        primes[-1] = p
    # shuffle order? really hierarchical order will go smallest to largest...
    # but for rules other orders may be of interest?
    if shuffle:
        self.rng.shuffle(primes)
    return primes
|
||||
|
||||
# def make_index(self, rows: int, levels: int,
|
||||
# level_value_lengths: Union[list[int], None] = None,
|
||||
# p0: int = 1,
|
||||
# padding: int = 2):
|
||||
# """
|
||||
# Make an Index with unique values, rows x len(level_value_lengths) cols.
|
||||
|
||||
# level_value_lengths shows how many words long each value should be.
|
||||
# padding = over-sample by padding and select sample.
|
||||
# """
|
||||
# if level_value_lengths is None:
|
||||
# level_value_lengths = random.sample(
|
||||
# self.index_value_lengths, levels)
|
||||
# else:
|
||||
# assert levels == len(
|
||||
# level_value_lengths), 'levels must equal len(level_value_lengths)'
|
||||
# level_choices = self.primes_for_product(rows * padding, levels, p0=p0)
|
||||
# r = [cycle([' '.join([self.word() for _ in range(w)]) for _ in range(k)])
|
||||
# for w, k in zip(level_value_lengths, level_choices)]
|
||||
# x = [[next(j) for j in r] for i in range(rows)]
|
||||
# names = random.sample(name_word_list, levels)
|
||||
# if levels == 1:
|
||||
# idx = pd.Index(
|
||||
# list(chain.from_iterable(random.sample(x, rows))), name=names[0]).sort_values()
|
||||
# else:
|
||||
# idx = pd.MultiIndex.from_tuples(
|
||||
# random.sample(x, rows), names=names).sort_values()
|
||||
# assert idx.is_unique
|
||||
# return idx
|
||||
@@ -1,404 +0,0 @@
|
||||
"""
|
||||
Make fake dataframes for testing.
|
||||
|
||||
GPT from SJMM design.
|
||||
"""
|
||||
|
||||
from collections import deque
|
||||
from datetime import datetime, timedelta
|
||||
from importlib.resources import files
|
||||
from itertools import cycle, chain
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
import hashlib
|
||||
import random
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# candidate names for index levels
name_word_list = [
    "account", "address", "amount", "balance", "category",
    "client", "combined ratio", "comment", "currency", "description",
    "duration", "email", "entry", "estimate", "extension",
    "failure", "filename", "identifier", "location", "loss ratio",
    "note", "operation", "premium", "processing", "project",
    "reference", "remark", "status", "supplier", "timestamp",
    "transaction", "type", "user", "expense ratio", "loss date",
]
|
||||
|
||||
|
||||
class TestDataFrameFactory:
    """
    Create super-dooper test dataframes.

    Data values are drawn from the seeded ``self.rng``; the shuffles of
    the word lists, index level names, years and start dates use the
    global ``random`` module, so a seed alone does not pin those.
    """

    def __init__(self, seed: Optional[int] = None):
        """
        Factory for generating small synthetic pandas DataFrames for testing.

        Args:
            seed: Optional random seed. If None, one is generated.
        """
        self._last_args = {}
        self.seed = int(
            seed if seed is not None else np.random.SeedSequence().entropy)

        # rng
        self.rng = np.random.default_rng(self.seed)

        # word list for names of index levels
        nwl = name_word_list[:]
        random.shuffle(nwl)
        self._index_namer = cycle(nwl)

        # read words and create cycler
        data_path = files('greater_tables').joinpath('data', 'words-12.md')
        with data_path.open('r', encoding='utf-8') as f:
            txt = f.read()
        word_list = txt.split('\n')
        temp = word_list[:]
        random.shuffle(temp)
        self._word_gen = cycle(temp)

        # read tex expressions and create cycler
        data_path = files('greater_tables').joinpath('data', 'tex_list.csv')
        with data_path.open('r', encoding='utf-8') as f:
            tex_list = pd.read_csv(f, index_col=0)['expr'].to_list()
        # trim down slightly
        pat = re.compile(r'(?<!\\)\b[a-z]{4,}\b')
        tex_list = [i for i in tex_list if not pat.search(i) and len(i) <= 50]
        random.shuffle(tex_list)
        self._tex_gen = cycle(tex_list)

        # readable column name per type code, used by col_index='simple'
        self.simple_namer = {
            'd': 'date',
            'f': 'float',
            'h': 'hash',
            'i': 'integer',
            'l': 'large_float',
            'm': 'yr-mo',
            'p': 'path',
            'r': 'ratio',
            's': 'string',
            't': 'time',
            'v': 'extreme_float',
            'x': 'tex',
            'y': 'year',
        }

        # lengths of index (word count) sampled from:
        self.index_value_lengths = [1] * 10 + [2] * 4 + [3]
        # most recent frames, newest first
        self.cache = deque(maxlen=10)

    # def cache(self, n=0):
    #     """Get nth item ago from cache, default = 0, latest."""
    #     if n < len(self._cache):
    #         return self._cache[n]
    #     else:
    #         print(f'Cache only contains {len(self._cache)} < {n} items.')

    def make(self, rows: int, columns: Union[int, str], index: Union[int, str] = 0,
             col_index: Union[int, str] = 0, missing: float = 0.0) -> pd.DataFrame:
        """
        Generate a test DataFrame with the given specification.

        Data types

            d   date
            f   float
            h   hash
            i   integer
            l   log float (greater range than float)
            m   year - month
            p   path (filename)
            r   ratio (smaller floats, for percents)
            sx  string length x
            t   time
            v   very large range float
            x   tex text - an equation
            y   year

        Args:
            rows: Number of rows.
            columns: Column type spec (int for that many randomly typed
                columns, or string type codes).
            index: Index level types (int k for k string levels, 0 for a
                RangeIndex, or a string spec).
            col_index: Column index levels (same format as `index`), or
                'simple' to name each column after its type.
            missing: Proportion of missing data in each column.

        Returns:
            DataFrame
        """
        self._last_args = dict(rows=rows, columns=columns,
                               index=index, col_index=col_index, missing=missing)
        return self._generate(**self._last_args)

    def another(self, new_seed: bool = True) -> pd.DataFrame:
        """
        Generate another DataFrame with the last parameters.

        Args:
            new_seed: If True, re-randomize the generator seed.

        Returns:
            DataFrame
        """
        if new_seed:
            self.seed = int(np.random.SeedSequence().entropy)
            self.rng = np.random.default_rng(self.seed)
        return self._generate(**self._last_args)

    def random(self, index_levels: int = 0, column_levels: int = 0, omit: str = 'p') -> pd.DataFrame:
        """
        Generate a DataFrame with randomly chosen settings.

        Args:
            index_levels: Number of index levels to use (0 = pick at random).
            column_levels: Number of column MultiIndex levels (0 = pick at random).
            omit: omit column datatypes in omit

        Returns:
            DataFrame
        """
        if index_levels == 0:
            index_levels = random.choice([1, 1, 1, 1, 1, 2, 2, 3])
        if column_levels == 0:
            column_levels = random.choice([1, 1, 1, 1, 1, 2, 2, 3])
        rows = self.rng.integers(5 * index_levels, 10 * index_levels)
        valid_types = [i for i in ['d', 'f', 'i', 's3', 'l', 'h', 't', 'p', 'x', 'r', 'y']
                       if i not in omit]
        col_types = self.rng.choice(
            valid_types, size=self.rng.integers(3, 7))
        missing = round(float(self.rng.uniform(0, 0.15)), 2)
        index = ''.join(self.rng.choice(
            ['t', 'd', 'y', 'i', 's2'], size=index_levels))
        col_index = ''.join(self.rng.choice(
            ['s', 's2', 's2', 's3'], size=column_levels))
        return self.make(rows=rows, columns=''.join(col_types), index=index, col_index=col_index, missing=missing)

    def _generate(self, rows: int, columns: Union[int, str], index: Union[int, str],
                  col_index: Union[int, str], missing: float) -> pd.DataFrame:
        """Build the frame described by the arguments of :meth:`make` and cache it."""
        # if columns is an int then make up types
        if isinstance(columns, int):
            col_types = self.rng.choice(
                ['d', 't', 'f', 'l', 'i', 's1', 's3', 's9', 'h', 'p', 'x'], size=columns)
        else:
            col_types = self._parse_colspec(columns)
        # if col_index is an int then use all strings of that depth
        if col_index == 'simple':
            col_idx = map(self.simple_namer.get, [i[0] for i in col_types])
            col_idx = pd.Index(col_idx, name='simple')
        else:
            if isinstance(col_index, int):
                col_index_types = ['s'] * col_index
            else:
                col_index_types = self._parse_colspec(col_index)
            col_idx = self._make_index(col_index_types, len(col_types))
        if isinstance(index, int):
            index = ['s'] * index
        else:
            index = self._parse_colspec(index)
        # col names are a transposed index.
        df = pd.DataFrame(index=range(rows))
        for dt, c in zip(col_types, range(len(col_idx))):
            df[c] = self._generate_column(dt, rows)
        df.columns = col_idx
        df.index = self._make_index(index, rows)
        df = self._insert_missing(df, missing)
        self.cache.appendleft(df)
        return df

    def _parse_colspec(self, spec: str) -> list[str]:
        """Split a type spec into codes: ``s`` plus digits, or one lowercase letter."""
        return re.findall(r's\d+|[a-z]', spec)

    def _generate_column(self, dtype: str, n: int) -> pd.Series:
        """Return a Series of n values of the given type code; ValueError if unknown."""
        if dtype.startswith('s'):
            max_words = int(dtype[1:]) if len(dtype) > 1 else 1
            return pd.Series([" ".join(self.word() for i in range(max_words)) for j in range(n)])
        if dtype == 'f':
            return pd.Series(self.rng.normal(loc=100000, scale=250000, size=n))
        if dtype == 'r':
            return pd.Series(self.rng.normal(loc=0.5, scale=0.35, size=n))
        if dtype == 'l':
            # log float (greater range)
            scale = 10.
            return pd.Series(np.exp(self.rng.normal(loc=-scale**2 / 2 + 15, scale=scale, size=n)))
        if dtype == 'v':
            # log float (greater range)
            sc = 5
            return pd.Series(np.exp(self.rng.normal(loc=-sc**2 / 2 + 10, scale=sc, size=n)))
        if dtype == 'i':
            return pd.Series(self.rng.integers(-1e4, 1e6, size=n), dtype='int64')
        if dtype == 'd':
            start_date = TestDataFrameFactory.random_date_within_last_n_years(
                10)
            return pd.Series(pd.date_range(start=start_date, periods=n, freq='D'))
        if dtype == 'y':
            return pd.Series(random.sample(range(1990, 2031), n))
        if dtype == 't':
            start_dt = datetime.now() - timedelta(days=365 * 2)
            return pd.Series([
                start_dt +
                timedelta(minutes=int(self.rng.integers(0, 2 * 365 * 24 * 60)))
                for _ in range(n)
            ])
        if dtype == 'h':
            return pd.Series([
                hashlib.blake2b(f"val{i}".encode(), digest_size=32).hexdigest()
                for i in range(n)
            ])
        if dtype == 'p':
            return pd.Series([str(Path(f"/data/{self.word()}/{i}.dat")) for i in range(n)])
        if dtype == 'x':
            # tex
            return pd.Series([self.tex() for i in range(n)])
        raise ValueError(f"Unknown dtype: {dtype}")

    def _make_index(self, desc: Union[int, str, list[str]], n: int) -> pd.Index:
        """
        Build an index of length n from a level description.

        An int, an empty description, or the single code ``i`` gives a
        RangeIndex; a single ``d``/``t``/``x``/``y`` code gives an Index of
        generated values; otherwise string levels are built by make_index.

        Raises:
            ValueError: for a multi-level description containing a
                non-string code.
        """
        if isinstance(desc, int):
            return pd.RangeIndex(n, name=self.index_name())
        if isinstance(desc, str):
            desc = self._parse_colspec(desc)
        if len(desc) == 0:
            # the make() defaults (index=0 / col_index=0) arrive here as an
            # empty level list; without this guard make_index is asked for
            # zero levels and primes_for_product fails with IndexError
            return pd.RangeIndex(n, name=self.index_name())
        if len(desc) == 1:
            if desc[0] == 'i':
                return pd.RangeIndex(n, name=self.index_name())
            elif desc[0] in ('d', 't', 'x', 'y'):
                vals = self._generate_column(desc[0], n)
                return pd.Index(vals, name=self.index_name())
        elif not all(i[0] == 's' for i in desc):
            raise ValueError(
                f'Inadmissible index spec: only string, int, and date types allowed, not {desc}.')
        level_value_lengths = [1 if len(i) == 1 else int(i[1:]) for i in desc]
        return self.make_index(rows=n, levels=len(desc), level_value_lengths=level_value_lengths,
                               p0=1, padding=2)

    def index_name(self):
        """Return a one-word index name."""
        return next(self._index_namer)

    def word(self):
        """Return a random word (cycles eventually)."""
        return next(self._word_gen)

    def tex(self):
        """Return a blob of TeX."""
        return next(self._tex_gen)

    @staticmethod
    def random_date_within_last_n_years(n: int) -> pd.Timestamp:
        """Return now minus a random number of days in [0, 365 * n] (global ``random``)."""
        today = datetime.today()
        days = random.randint(0, n * 365)
        return pd.Timestamp(today - timedelta(days=days))

    def _insert_missing(self, df: pd.DataFrame, prop: float) -> pd.DataFrame:
        """
        Insert missing values into dataframe (in place) and return it.

        Each column gets floor(prop * rows) NaNs, at least one and at most
        one per row; nothing is done for prop <= 0 or an empty frame.
        """
        n_rows = df.shape[0]
        if prop <= 0 or n_rows == 0:
            return df
        # clamp so prop > 1 cannot request more rows than exist
        n_missing = min(n_rows, max(1, int(np.floor(prop * n_rows))))
        # positional column access stays unambiguous with duplicate labels
        for col_pos in range(df.shape[1]):
            missing_indices = self.rng.choice(
                n_rows, size=n_missing, replace=False)
            df.iloc[missing_indices, col_pos] = np.nan
        return df

    @staticmethod
    def _is_prime(p: int) -> bool:
        """Return True if p is prime (trial division by odd numbers up to sqrt(p))."""
        if p < 2:
            return False
        if p == 2:
            return True
        if p % 2 == 0:
            return False
        for i in range(3, int(p**0.5) + 1, 2):
            if p % i == 0:
                return False
        return True

    @staticmethod
    def _next_prime(p: int) -> int:
        """Return the smallest prime strictly greater than p (2 for p < 2)."""
        if p < 2:
            return 2
        p += 1 if p % 2 == 0 else 2  # ensure odd start > p
        while True:
            if TestDataFrameFactory._is_prime(p):
                return p
            p += 2

    @staticmethod
    def primes_for_product(n: int, v: int, p0: int) -> list[int]:
        """Return a list of v distinct primes all >= p0 whose product is >= n, in shuffled order."""
        primes = []
        p = TestDataFrameFactory._next_prime(max(p0 - 1, 1))
        while len(primes) < v:
            primes.append(p)
            p = TestDataFrameFactory._next_prime(p)

        while prod(primes := sorted(primes)) < n:
            # increase one level until product is high enough
            p = TestDataFrameFactory._next_prime(primes[-1])
            primes[-1] = p
        # shuffle order
        random.shuffle(primes)
        return primes

    def make_index(self, rows: int, levels: int,
                   level_value_lengths: Union[list[int], None] = None,
                   p0: int = 1,
                   padding: int = 2):
        """
        Make an Index with unique values, rows x len(level_value_lengths) cols.

        level_value_lengths shows how many words long each value should be.
        padding = over-sample by padding and select sample.
        """
        if level_value_lengths is None:
            level_value_lengths = random.sample(
                self.index_value_lengths, levels)
        else:
            assert levels == len(
                level_value_lengths), 'levels must equal len(level_value_lengths)'
        level_choices = self.primes_for_product(rows * padding, levels, p0=p0)
        # one cycler per level, each over a distinct prime number of values
        r = [cycle([' '.join([self.word() for _ in range(w)]) for _ in range(k)])
             for w, k in zip(level_value_lengths, level_choices)]
        # advance all level cyclers in lockstep, one row at a time
        x = [[next(j) for j in r] for i in range(rows)]
        names = random.sample(name_word_list, levels)
        if levels == 1:
            idx = pd.Index(
                list(chain.from_iterable(random.sample(x, rows))), name=names[0]).sort_values()
        else:
            idx = pd.MultiIndex.from_tuples(
                random.sample(x, rows), names=names).sort_values()
        assert idx.is_unique
        return idx
|
||||
@@ -44,6 +44,7 @@ version = { attr = "greater_tables.__version__" }
|
||||
dev = [
|
||||
"docutils<0.17",
|
||||
"jupyter-sphinx",
|
||||
"myst_parser",
|
||||
"nbsphinx",
|
||||
"pickleshare",
|
||||
"recommonmark>=0.7.1",
|
||||
|
||||
Reference in New Issue
Block a user