Files
2019-08-05 16:40:04 -03:00

91 lines
3.0 KiB
Python

import logging
import hashlib
import pandas as pd
import pandas_market_calendars as mcal
from . import cboe
from .notifications import slack_notification
logger = logging.getLogger(__name__)
def file_hash_matches_data(file_path, data):
file_hash = file_md5(file_path)
data_md5 = hashlib.md5(data.encode()).hexdigest()
return file_hash == data_md5
def file_md5(file, chunk_size=4096):
md5 = hashlib.md5()
with open(file, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
md5.update(chunk)
return md5.hexdigest()
def validate_dates_in_month(symbol, date_range):
"""Compares `date_range` (month) with NYSE trading calendar.
Returns `True` if there are no missing days.
"""
# NYSE and CBOE have the same trading calendar
# https://www.nyse.com/markets/hours-calendars
# http://cfe.cboe.com/about-cfe/holiday-calendar
nyse = mcal.get_calendar("NYSE")
first_date = date_range[0]
period = pd.Period(year=first_date.year, month=first_date.month, freq="M")
trading_days = nyse.valid_days(start_date=period.start_time,
end_date=period.end_time)
# Remove timezone info
trading_days = trading_days.tz_convert(tz=None)
missing_days = trading_days.difference(date_range)
if not missing_days.empty:
logger.error("Error validating monthly dates. Missing: %s",
missing_days)
return missing_days.empty
def validate_historical_dates(symbol, date_range):
"""Compares `date_range` (any time range) with trading calendar.
Returns `True` if there are no missing days.
"""
nyse = mcal.get_calendar("NYSE")
start_date = date_range.min()
end_date = date_range.max()
trading_days = nyse.valid_days(start_date=start_date, end_date=end_date)
# Remove timezone info
trading_days = trading_days.tz_convert(tz=None)
date_range = date_range.dt.tz_convert(tz=None)
missing_days = trading_days.difference(date_range)
if not missing_days.empty:
logger.error("Error validating historical dates. Missing: %s",
missing_days)
return missing_days.empty
def validate_columns(expected, received):
"""Verify that the `received` columns scraped are equal to `expected`"""
expected = sorted(expected)
received = sorted(received)
valid = expected == received
if not valid:
expected_cols = ", ".join(expected)
received_cols = ", ".join(received)
msg = """Columns expected differ from those received.
Expected: {}
Received: {}""".format(expected_cols, received_cols)
logger.error(msg)
slack_notification(msg, __name__)
return valid
def validate_aggregate_file(aggregate_file, daily_files):
"""Compares `aggregate_file` with the data from `daily_files`."""
aggregate_df = pd.read_csv(aggregate_file)
recreated_df = cboe.concatenate_files(daily_files)
return aggregate_df.equals(recreated_df)