Files
greater_tables_project/greater_tables/utilities.py
T

123 lines
4.6 KiB
Python

import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import HTML, display
from . greater_tables import GT
# __gt_global = GT()
# def qhtml(df, **kwargs):
# """Generic "quick display" function."""
# return HTML(__gt_global(df, **kwargs))
# def qd(df, **kwargs):
# """Generic "quick display" function."""
# if isinstance(df, pd.Series):
# if df.name is None:
# df.name = 'value'
# df = df.to_frame()
# return display(HTML(__gt_global(df, **kwargs)))
def create_three_level_multiindex(df):
    """
    Stack two randomly drawn levels on top of a DataFrame's column labels.

    Each existing column label gets a random "Level 1" label drawn from
    A/B/C and a random "Level 2" label drawn from X/Y/Z; the original label
    becomes the innermost level and keeps the original columns' name.

    Note that the columns of ``df`` are replaced in place, and the very same
    object is returned.

    Parameters:
    df (pd.DataFrame): Input DataFrame.

    Returns:
    pd.DataFrame: ``df`` itself, now with a three-level column MultiIndex.
    """
    width = len(df.columns)
    # Draw the outer level first, then the middle one, so that a seeded
    # generator yields the same labels on every run.
    outer_labels = np.random.choice(["A", "B", "C"], size=width)
    middle_labels = np.random.choice(["X", "Y", "Z"], size=width)

    # One (outer, middle, original) tuple per existing column.
    column_keys = list(zip(outer_labels, middle_labels, df.columns))
    level_names = ["Level 1", "Level 2", df.columns.name]

    df.columns = pd.MultiIndex.from_tuples(column_keys, names=level_names)
    return df
def test_df(date=False, mi_columns=True):
"""Make a test dataframe nr rows with multi index."""
nr = 10
words = 'Parliament organised a year-long programme of events called Parliament in the Making to celebrate the 800th anniversary of the sealing of Magna Carta on 15 June and the 750th anniversary of the first representative parliament on 20 January Events were coordinated with Parliament Week'
words = list(set(words.split(' ')))
w1 = ['Abel', 'Cain', 'Issac', 'Fred', 'George']
w2 = ['South', 'East', 'West', 'North']
w4 = ['A', 'B', 'C', 'D']
df = pd.DataFrame({'idx1': np.random.choice(w1, nr),
'idx2': np.random.choice(w2, nr),
'idx3': np.random.poisson(2, nr),
'floats': np.random.rand(nr) * 3000.,
'smaller': np.random.rand(nr) * 10 ** np.linspace(-3, 4, nr),
'larger': np.random.choice([-1., 0, 1.], nr) * np.random.rand(nr) * 10 ** np.linspace(3, 12, nr),
'ints': np.random.poisson(20, nr),
'powers': np.pi * 10. ** np.arange(-20, 26, 5),
'ratios': np.random.rand(nr) * 3. - 1.,
'string': [' '.join(np.random.choice(words, 4, replace=False)) for i in range(nr)],
# 'object': [np.random.poisson(2, nr) for i in range(nr)]
})
if date:
df['date'] = [dt.datetime.fromordinal(np.random.randint(dt.date(2020, 1, 1).toordinal(),
dt.date(2030, 1, 1).toordinal())) + dt.timedelta(seconds=np.random.randint(86400)) for _ in range(nr)]
df['date'] = pd.to_datetime(df['date'])
df.columns.name = 'Col name'
df = df.set_index(['idx1', 'idx2', 'idx3'])
if mi_columns:
df = create_three_level_multiindex(df)
# check unique and sort
df = df.loc[df.index[~df.index.duplicated()]]
assert np.all(~df.columns.duplicated()), 'Columns not all unique'
df = df.sort_index(axis=0).sort_index(axis=1)
return df
def make_test_dfs():
    """
    Build a dict of sample dataframes, keyed by a short description.

    The "basic" entries are a small deterministic numeric frame with various
    combinations of index and column names; the "time series" entries add a
    random ``date`` column and use it in the index; the "realistic" entries
    come from ``test_df`` with some index/column levels dropped.

    Returns:
    dict: Mapping of description -> pd.DataFrame.
    """
    base = pd.DataFrame({'x': range(10), 'y': np.arange(10, dtype=float)})
    base['x'] = base['x'] ** 2
    base['z'] = base['y'] ** .25

    def labelled(index_name=None, columns_name=None):
        # Fresh copy of the base frame with the requested axis names applied.
        frame = base.copy()
        if index_name is not None:
            frame.index.name = index_name
        if columns_name is not None:
            frame.columns.name = columns_name
        return frame

    # Frame carrying both names plus a random timestamp column: a random day
    # in [2020-01-01, 2030-01-01) plus a random second of the day.
    dated = labelled('idx name', 'col name')
    first_day = dt.date(2020, 1, 1).toordinal()
    last_day = dt.date(2030, 1, 1).toordinal()
    dated['date'] = [
        dt.datetime.fromordinal(np.random.randint(first_day, last_day))
        + dt.timedelta(seconds=np.random.randint(86400))
        for _ in range(len(dated))
    ]

    # The test_df calls are evaluated in the order written here.
    return {
        'basic': labelled(),
        'basic w idx name': labelled(index_name='idx name'),
        'basic w col name': labelled(columns_name='col name'),
        'basic w both names': labelled('idx name', 'col name'),
        'time series': dated.set_index('date').copy(),
        'time series and range': dated.set_index('date', append=True).copy(),
        'realistic': test_df(date=False, mi_columns=False),
        'realistic w date': test_df(date=True, mi_columns=False).droplevel(2, axis=0),
        'realistic mi': test_df(date=False, mi_columns=True).droplevel(2, axis=1),
        'realistic mi w date': test_df(date=True, mi_columns=True).droplevel(2, axis=0).droplevel(2, axis=1),
    }