Added pyarrow support in GT (new and old versions) and in Fabricator

This commit is contained in:
Stephen Mildenhall
2025-12-19 10:51:12 +00:00
parent 3e208fe783
commit 3b5c2118f0
7 changed files with 2476 additions and 4864 deletions
+1
View File
@@ -169,3 +169,4 @@ docs/library.bib
# rust
gtfont/target
/.virtual_documents
+7
View File
@@ -11,9 +11,16 @@ Versions and Change Log
.. TODO
* self.padl and r / 12 in make html width adj s/b elsewhere
6.0.0
-----
* Support for pyarrow data types.
* Added pyarrow option to Fabricator to create pyarrow dataframes.
5.4.1 and 5.4.2
------------------
* Minor bug fixes; tikz method now works for empty dataframe.
* Non-integer width bug.
5.4
----
+1 -1
View File
@@ -1,6 +1,6 @@
__project__ = 'greater_tables'
__author__ = 'Stephen J Mildenhall'
__version__ = '5.4.2'
__version__ = '6.0.0'
from . core import GT
from . fabrications import *
+305 -920
View File
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+12 -7
View File
@@ -11,7 +11,6 @@ from math import prod
from pathlib import Path
from typing import Optional, Union, Literal
import hashlib
import random
import re
import numpy as np
@@ -93,7 +92,7 @@ class Fabricator:
metric_suffix = ["", "rate", "score", "amount",
"index", "ratio", "factor", "value"]
def __init__(self, decorate=False, seed: Optional[int] = None):
def __init__(self, decorate=False, pyarrow: bool = False, seed: Optional[int] = None):
"""
Fabricate small synthetic pandas DataFrames for testing.
@@ -105,6 +104,7 @@ class Fabricator:
self.seed = int(
seed if seed is not None else np.random.SeedSequence().entropy)
self.decorate = decorate
self.pyarrow = pyarrow
# rng
self.rng = np.random.default_rng(self.seed)
@@ -295,6 +295,9 @@ class Fabricator:
if simplify:
df = self.drop_singleton_levels(df)
if self.pyarrow:
df = df.convert_dtypes(dtype_backend='pyarrow')
self.cache.appendleft(df)
return df
@@ -366,7 +369,7 @@ class Fabricator:
if dtype == 'd':
return random_datetime_series(n, rng=self.rng)
if dtype == 'y':
return pd.Series(random.sample(range(1990, 2031), n))
return pd.Series(self.rng.integers(1990, 2031, n))
if dtype == 't':
start_dt = datetime.now() - timedelta(days=365 * 2)
return pd.Series([
@@ -411,7 +414,8 @@ class Fabricator:
@staticmethod
def random_date_within_last_n_years(
        n: int,
        rng: Optional[np.random.Generator] = None) -> pd.Timestamp:
    """
    Return a random timestamp between ``n * 365`` days ago and now.

    This is a static method, so there is no ``self`` (and hence no
    ``self.rng``) to draw from. Callers that need reproducible output
    pass their own generator, e.g. ``rng=fab.rng``; otherwise a fresh,
    unseeded generator is used.

    :param n: number of (365-day) years to look back; the day offset is
        drawn uniformly from ``0 .. n * 365`` inclusive.
    :param rng: optional NumPy random generator to draw the offset from.
    :return: ``pd.Timestamp`` equal to the current time minus the
        randomly drawn whole number of days.
    """
    if rng is None:
        rng = np.random.default_rng()
    today = datetime.today()
    # endpoint=True keeps the upper bound inclusive, matching the
    # random.randint(0, n * 365) call this replaces. Cast to a builtin
    # int because timedelta expects Python numbers, not NumPy scalars.
    days = int(rng.integers(0, n * 365, endpoint=True))
    return pd.Timestamp(today - timedelta(days=days))
def _insert_missing(self, df: pd.DataFrame, prop: float) -> pd.DataFrame:
@@ -473,10 +477,11 @@ class Fabricator:
return primes
def quick_fab(rows: int = 10, data_spec: str = 's3sfid', **kwargs):
def quick_fab(rows: int = 10, data_spec: str = 's3sfid', pyarrow=False, **kwargs):
"""One-stop quick fabrication of a random dataframe."""
fab = Fabricator()
return fab.make(rows, data_spec, **kwargs)
fab = Fabricator(pyarrow=pyarrow)
df = fab.make(rows, data_spec, **kwargs)
return df
rand_df = make_df = quick_df = quick_fab
+79 -3927
View File
File diff suppressed because it is too large Load Diff