BLD: improved the bundle test suite and related adjustments

This commit is contained in:
Frederic Fortier
2018-01-18 17:09:37 -05:00
parent 563fc433d5
commit 51126fd7ae
11 changed files with 409 additions and 326 deletions
+21 -3
View File
@@ -22,8 +22,10 @@ from catalyst.exchange.exchange_errors import InvalidHistoryFrequencyError, \
UnsupportedHistoryFrequencyError
from catalyst.exchange.exchange_execution import ExchangeLimitOrder
from catalyst.exchange.utils.exchange_utils import mixin_market_params, \
from_ms_timestamp, get_epoch, get_exchange_folder, get_catalyst_symbol, \
get_exchange_folder, get_catalyst_symbol, \
get_exchange_auth
from exchange.utils.datetime_utils import from_ms_timestamp, get_epoch, \
get_periods_range
from catalyst.finance.order import Order, ORDER_STATUS
from catalyst.finance.transaction import Transaction
@@ -399,7 +401,7 @@ class CCXT(Exchange):
timeframe, source='ccxt', raise_error=raise_error
)
def get_candles(self, freq, assets, bar_count=None, start_dt=None,
def get_candles(self, freq, assets, bar_count=1, start_dt=None,
end_dt=None):
is_single = (isinstance(assets, TradingPair))
if is_single:
@@ -416,9 +418,25 @@ class CCXT(Exchange):
freqs=freqs,
)
if start_dt is not None and end_dt is not None:
raise ValueError(
'Please provide either start_dt or end_dt, not both.'
)
elif end_dt is not None:
dt_range = get_periods_range(
end_dt=end_dt,
periods=bar_count,
freq=freq,
)
# skip the left bound of the range since the open range is
# on the right bound
start_dt = dt_range[1]
ms = None
if start_dt is not None:
delta = start_dt - get_epoch()
if end_dt is not None:
delta = start_dt - get_epoch()
ms = int(delta.total_seconds()) * 1000
candles = dict()
+5 -6
View File
@@ -13,10 +13,10 @@ from catalyst.exchange.exchange_errors import MismatchingBaseCurrencies, \
PricingDataNotLoadedError, \
NoDataAvailableOnExchange, NoValueForField, LastCandleTooEarlyError, \
TickerNotFoundError, NotEnoughCashError
from catalyst.exchange.utils.bundle_utils import get_start_dt, \
get_delta, get_periods, get_periods_range
from exchange.utils.datetime_utils import get_delta, get_periods_range, \
get_periods, get_start_dt, get_frequency
from catalyst.exchange.utils.exchange_utils import get_exchange_symbols, \
get_frequency, resample_history_df, has_bundle
resample_history_df, has_bundle
from logbook import Logger
log = Logger('Exchange', level=LOG_LEVEL)
@@ -433,7 +433,7 @@ class Exchange:
series = pd.Series(values, index=dates)
periods = get_periods_range(
start_dt, end_dt, data_frequency
start_dt=start_dt, end_dt=end_dt, freq=data_frequency
)
# TODO: ensure that this working as expected, if not use fillna
series = series.reindex(
@@ -929,8 +929,7 @@ class Exchange:
pass
@abstractmethod
def get_candles(self, freq, assets, bar_count=None,
start_dt=None, end_dt=None):
def get_candles(self, freq, assets, bar_count, start_dt=None, end_dt=None):
"""
Retrieve OHLCV candles for the given assets
+3 -3
View File
@@ -21,9 +21,9 @@ from catalyst.exchange.exchange_errors import EmptyValuesInBundleError, \
NoDataAvailableOnExchange, \
PricingDataNotLoadedError, DataCorruptionError, PricingDataValueError
from catalyst.exchange.utils.bundle_utils import range_in_bundle, \
get_bcolz_chunk, get_month_start_end, \
get_year_start_end, get_df_from_arrays, get_start_dt, get_period_label, \
get_delta, get_assets
get_bcolz_chunk, get_df_from_arrays, get_assets
from exchange.utils.datetime_utils import get_delta, get_start_dt, \
get_period_label, get_month_start_end, get_year_start_end
from catalyst.exchange.utils.exchange_utils import get_exchange_folder, \
save_exchange_symbols, mixin_market_params, get_catalyst_symbol
from catalyst.utils.cli import maybe_show_progress
+2 -2
View File
@@ -9,8 +9,8 @@ from catalyst.exchange.exchange_bundle import ExchangeBundle
from catalyst.exchange.exchange_errors import (
ExchangeRequestError,
PricingDataNotLoadedError)
from catalyst.exchange.utils.exchange_utils import get_frequency, \
resample_history_df, group_assets_by_exchange
from catalyst.exchange.utils.exchange_utils import resample_history_df, group_assets_by_exchange
from exchange.utils.datetime_utils import get_frequency
from logbook import Logger
from redo import retry
+13 -212
View File
@@ -1,11 +1,19 @@
import calendar
import os
import tarfile
from datetime import timedelta, datetime, date
from datetime import datetime
import numpy as np
import pandas as pd
import pytz
from catalyst.data.bundles.core import download_without_progress
from catalyst.exchange.utils.exchange_utils import get_exchange_bundles_folder
import os
import tarfile
from datetime import datetime
import numpy as np
import pandas as pd
from catalyst.data.bundles.core import download_without_progress
from catalyst.exchange.utils.exchange_utils import get_exchange_bundles_folder
@@ -13,41 +21,6 @@ EXCHANGE_NAMES = ['bitfinex', 'bittrex', 'poloniex']
API_URL = 'http://data.enigma.co/api/v1'
def get_date_from_ms(ms):
"""
The date from the number of miliseconds from the epoch.
Parameters
----------
ms: int
Returns
-------
datetime
"""
return datetime.fromtimestamp(ms / 1000.0)
def get_seconds_from_date(date):
"""
The number of seconds from the epoch.
Parameters
----------
date: datetime
Returns
-------
int
"""
epoch = datetime.utcfromtimestamp(0)
epoch = epoch.replace(tzinfo=pytz.UTC)
return int((date - epoch).total_seconds())
def get_bcolz_chunk(exchange_name, symbol, data_frequency, period):
"""
Download and extract a bcolz bundle.
@@ -77,8 +50,8 @@ def get_bcolz_chunk(exchange_name, symbol, data_frequency, period):
if not os.path.isdir(path):
url = 'https://s3.amazonaws.com/enigmaco/catalyst-bundles/' \
'exchange-{exchange}/{name}.tar.gz'.format(
exchange=exchange_name,
name=name)
exchange=exchange_name,
name=name)
bytes = download_without_progress(url)
with tarfile.open('r', fileobj=bytes) as tar:
@@ -87,178 +60,6 @@ def get_bcolz_chunk(exchange_name, symbol, data_frequency, period):
return path
def get_delta(periods, data_frequency):
"""
Get a time delta based on the specified data frequency.
Parameters
----------
periods: int
data_frequency: str
Returns
-------
timedelta
"""
return timedelta(minutes=periods) \
if data_frequency == 'minute' else timedelta(days=periods)
def get_periods_range(start_dt, end_dt, freq):
"""
Get a date range for the specified parameters.
Parameters
----------
start_dt: datetime
end_dt: datetime
freq: str
Returns
-------
DateTimeIndex
"""
if freq == 'minute':
freq = 'T'
elif freq == 'daily':
freq = 'D'
return pd.date_range(start_dt, end_dt, freq=freq)
def get_periods(start_dt, end_dt, freq):
"""
The number of periods in the specified range.
Parameters
----------
start_dt: datetime
end_dt: datetime
freq: str
Returns
-------
int
"""
return len(get_periods_range(start_dt, end_dt, freq))
def get_start_dt(end_dt, bar_count, data_frequency, include_first=True):
"""
The start date based on specified end date and data frequency.
Parameters
----------
end_dt: datetime
bar_count: int
data_frequency: str
Returns
-------
datetime
"""
periods = bar_count
if periods > 1:
delta = get_delta(periods, data_frequency)
start_dt = end_dt - delta
if not include_first:
start_dt += get_delta(1, data_frequency)
else:
start_dt = end_dt
return start_dt
def get_period_label(dt, data_frequency):
"""
The period label for the specified date and frequency.
Parameters
----------
dt: datetime
data_frequency: str
Returns
-------
str
"""
if data_frequency == 'minute':
return '{}-{:02d}'.format(dt.year, dt.month)
else:
return '{}'.format(dt.year)
def get_month_start_end(dt, first_day=None, last_day=None):
"""
The first and last day of the month for the specified date.
Parameters
----------
dt: datetime
first_day: datetime
last_day: datetime
Returns
-------
datetime, datetime
"""
month_range = calendar.monthrange(dt.year, dt.month)
if first_day:
month_start = first_day
else:
month_start = pd.to_datetime(datetime(
dt.year, dt.month, 1, 0, 0, 0, 0
), utc=True)
if last_day:
month_end = last_day
else:
month_end = pd.to_datetime(datetime(
dt.year, dt.month, month_range[1], 23, 59, 0, 0
), utc=True)
if month_end > pd.Timestamp.utcnow():
month_end = pd.Timestamp.utcnow().floor('1D')
return month_start, month_end
def get_year_start_end(dt, first_day=None, last_day=None):
"""
The first and last day of the year for the specified date.
Parameters
----------
dt: datetime
first_day: datetime
last_day: datetime
Returns
-------
datetime, datetime
"""
year_start = first_day if first_day \
else pd.to_datetime(date(dt.year, 1, 1), utc=True)
year_end = last_day if last_day \
else pd.to_datetime(date(dt.year, 12, 31), utc=True)
if year_end > pd.Timestamp.utcnow():
year_end = pd.Timestamp.utcnow().floor('1D')
return year_start, year_end
def get_df_from_arrays(arrays, periods):
"""
A DataFrame from the specified OHCLV arrays.
+327
View File
@@ -0,0 +1,327 @@
import calendar
import re
from datetime import datetime, timedelta, date
import pandas as pd
import pytz
from exchange.exchange_errors import InvalidHistoryFrequencyError, \
InvalidHistoryFrequencyAlias
def get_date_from_ms(ms):
"""
The date from the number of miliseconds from the epoch.
Parameters
----------
ms: int
Returns
-------
datetime
"""
return datetime.fromtimestamp(ms / 1000.0)
def get_seconds_from_date(date):
"""
The number of seconds from the epoch.
Parameters
----------
date: datetime
Returns
-------
int
"""
epoch = datetime.utcfromtimestamp(0)
epoch = epoch.replace(tzinfo=pytz.UTC)
return int((date - epoch).total_seconds())
def get_delta(periods, data_frequency):
"""
Get a time delta based on the specified data frequency.
Parameters
----------
periods: int
data_frequency: str
Returns
-------
timedelta
"""
return timedelta(minutes=periods) \
if data_frequency == 'minute' else timedelta(days=periods)
def get_periods_range(freq, start_dt=None, end_dt=None, periods=None):
"""
Get a date range for the specified parameters.
Parameters
----------
start_dt: datetime
end_dt: datetime
freq: str
Returns
-------
DateTimeIndex
"""
if freq == 'minute':
freq = 'T'
elif freq == 'daily':
freq = 'D'
if start_dt is not None and end_dt is not None and periods is None:
return pd.date_range(start_dt, end_dt, freq=freq)
elif periods is not None and (start_dt is not None or end_dt is not None):
_, unit_periods, unit, _ = get_frequency(freq)
adj_periods = periods * unit_periods
# TODO: standardize time aliases to avoid any mapping
unit = 'd' if unit == 'D' else 'm'
delta = pd.Timedelta(adj_periods, unit)
if start_dt is not None:
return pd.date_range(
start=start_dt,
end=start_dt + delta,
freq=freq,
closed='left',
)
else:
return pd.date_range(
start=end_dt - delta,
end=end_dt,
freq=freq,
)
else:
raise ValueError(
'Choose only two parameters between start_dt, end_dt '
'and periods.'
)
def get_periods(start_dt, end_dt, freq):
"""
The number of periods in the specified range.
Parameters
----------
start_dt: datetime
end_dt: datetime
freq: str
Returns
-------
int
"""
return len(get_periods_range(start_dt=start_dt, end_dt=end_dt, freq=freq))
def get_start_dt(end_dt, bar_count, data_frequency, include_first=True):
"""
The start date based on specified end date and data frequency.
Parameters
----------
end_dt: datetime
bar_count: int
data_frequency: str
include_first
Returns
-------
datetime
"""
periods = bar_count
if periods > 1:
delta = get_delta(periods, data_frequency)
start_dt = end_dt - delta
if not include_first:
start_dt += get_delta(1, data_frequency)
else:
start_dt = end_dt
return start_dt
def get_period_label(dt, data_frequency):
"""
The period label for the specified date and frequency.
Parameters
----------
dt: datetime
data_frequency: str
Returns
-------
str
"""
if data_frequency == 'minute':
return '{}-{:02d}'.format(dt.year, dt.month)
else:
return '{}'.format(dt.year)
def get_month_start_end(dt, first_day=None, last_day=None):
"""
The first and last day of the month for the specified date.
Parameters
----------
dt: datetime
first_day: datetime
last_day: datetime
Returns
-------
datetime, datetime
"""
month_range = calendar.monthrange(dt.year, dt.month)
if first_day:
month_start = first_day
else:
month_start = pd.to_datetime(datetime(
dt.year, dt.month, 1, 0, 0, 0, 0
), utc=True)
if last_day:
month_end = last_day
else:
month_end = pd.to_datetime(datetime(
dt.year, dt.month, month_range[1], 23, 59, 0, 0
), utc=True)
if month_end > pd.Timestamp.utcnow():
month_end = pd.Timestamp.utcnow().floor('1D')
return month_start, month_end
def get_year_start_end(dt, first_day=None, last_day=None):
"""
The first and last day of the year for the specified date.
Parameters
----------
dt: datetime
first_day: datetime
last_day: datetime
Returns
-------
datetime, datetime
"""
year_start = first_day if first_day \
else pd.to_datetime(date(dt.year, 1, 1), utc=True)
year_end = last_day if last_day \
else pd.to_datetime(date(dt.year, 12, 31), utc=True)
if year_end > pd.Timestamp.utcnow():
year_end = pd.Timestamp.utcnow().floor('1D')
return year_start, year_end
def get_frequency(freq, data_frequency=None):
"""
Get the frequency parameters.
Notes
-----
We're trying to use Pandas convention for frequency aliases.
Parameters
----------
freq: str
data_frequency: str
Returns
-------
str, int, str, str
"""
if data_frequency is None:
data_frequency = 'daily' if freq.upper().endswith('D') else 'minute'
if freq == 'minute':
unit = 'T'
candle_size = 1
elif freq == 'daily':
unit = 'D'
candle_size = 1
else:
freq_match = re.match(r'([0-9].*)?(m|M|d|D|h|H|T)', freq, re.M | re.I)
if freq_match:
candle_size = int(freq_match.group(1)) if freq_match.group(1) \
else 1
unit = freq_match.group(2)
else:
raise InvalidHistoryFrequencyError(frequency=freq)
# TODO: some exchanges support H and W frequencies but not bundles
# Find a way to pass-through these parameters to exchanges
# but resample from minute or daily in backtest mode
# see catalyst/exchange/ccxt/ccxt_exchange.py:242 for mapping between
# Pandas offet aliases (used by Catalyst) and the CCXT timeframes
if unit.lower() == 'd':
unit = 'D'
alias = '{}D'.format(candle_size)
if data_frequency == 'minute':
data_frequency = 'daily'
elif unit.lower() == 'm' or unit == 'T':
unit = 'T'
alias = '{}T'.format(candle_size)
if data_frequency == 'daily':
data_frequency = 'minute'
# elif unit.lower() == 'h':
# candle_size = candle_size * 60
#
# alias = '{}T'.format(candle_size)
# if data_frequency == 'daily':
# data_frequency = 'minute'
else:
raise InvalidHistoryFrequencyAlias(freq=freq)
return alias, candle_size, unit, data_frequency
def from_ms_timestamp(ms):
return pd.to_datetime(ms, unit='ms', utc=True)
def get_epoch():
return pd.to_datetime('1970-1-1', utc=True)
+4 -79
View File
@@ -2,21 +2,20 @@ import hashlib
import json
import os
import pickle
import re
import shutil
from datetime import date, datetime
import pandas as pd
from catalyst.assets._assets import TradingPair
from six import string_types
from six.moves.urllib import request
from catalyst.constants import DATE_FORMAT, SYMBOLS_URL
from catalyst.exchange.exchange_errors import ExchangeSymbolsNotFound, \
InvalidHistoryFrequencyError, InvalidHistoryFrequencyAlias
from catalyst.exchange.exchange_errors import ExchangeSymbolsNotFound
from catalyst.exchange.utils.serialization_utils import ExchangeJSONEncoder, \
ExchangeJSONDecoder
from catalyst.utils.paths import data_root, ensure_directory, \
last_modified_time
from six import string_types
from six.moves.urllib import request
def get_sid(symbol):
@@ -513,72 +512,6 @@ def get_common_assets(exchanges):
return assets
def get_frequency(freq, data_frequency):
"""
Get the frequency parameters.
Notes
-----
We're trying to use Pandas convention for frequency aliases.
Parameters
----------
freq: str
data_frequency: str
Returns
-------
str, int, str, str
"""
if freq == 'minute':
unit = 'T'
candle_size = 1
elif freq == 'daily':
unit = 'D'
candle_size = 1
else:
freq_match = re.match(r'([0-9].*)?(m|M|d|D|h|H|T)', freq, re.M | re.I)
if freq_match:
candle_size = int(freq_match.group(1)) if freq_match.group(1) \
else 1
unit = freq_match.group(2)
else:
raise InvalidHistoryFrequencyError(frequency=freq)
# TODO: some exchanges support H and W frequencies but not bundles
# Find a way to pass-through these parameters to exchanges
# but resample from minute or daily in backtest mode
# see catalyst/exchange/ccxt/ccxt_exchange.py:242 for mapping between
# Pandas offet aliases (used by Catalyst) and the CCXT timeframes
if unit.lower() == 'd':
alias = '{}D'.format(candle_size)
if data_frequency == 'minute':
data_frequency = 'daily'
elif unit.lower() == 'm' or unit == 'T':
alias = '{}T'.format(candle_size)
if data_frequency == 'daily':
data_frequency = 'minute'
# elif unit.lower() == 'h':
# candle_size = candle_size * 60
#
# alias = '{}T'.format(candle_size)
# if data_frequency == 'daily':
# data_frequency = 'minute'
else:
raise InvalidHistoryFrequencyAlias(freq=freq)
return alias, candle_size, unit, data_frequency
def resample_history_df(df, freq, field):
"""
Resample the OHCLV DataFrame using the specified frequency.
@@ -652,14 +585,6 @@ def mixin_market_params(exchange_name, params, market):
params['lot'] = params['min_trade_size']
def from_ms_timestamp(ms):
return pd.to_datetime(ms, unit='ms', utc=True)
def get_epoch():
return pd.to_datetime('1970-1-1', utc=True)
def group_assets_by_exchange(assets):
exchange_assets = dict()
for asset in assets:
+6 -6
View File
@@ -62,14 +62,14 @@ def output_df(df, assets, name=None):
"""
if isinstance(assets, TradingPair):
exchange_folder = assets.exchange
asset_folder = assets.symbol
asset_folder = '{}_{}'.format(assets.exchange, assets.symbol)
else:
exchange_folder = ','.join([asset.exchange for asset in assets])
asset_folder = ','.join([asset.symbol for asset in assets])
asset_folder = ','.join(
['{}_{}'.format(a.exchange, a.symbol) for a in assets]
)
folder = os.path.join(
tempfile.gettempdir(), 'catalyst', exchange_folder, asset_folder
tempfile.gettempdir(), 'catalyst', asset_folder
)
ensure_directory(folder)
@@ -79,4 +79,4 @@ def output_df(df, assets, name=None):
path = os.path.join(folder, '{}.csv'.format(name))
df.to_csv(path)
return path
return path, folder
+2 -1
View File
@@ -10,7 +10,8 @@ from catalyst.exchange.exchange_bcolz import BcolzExchangeBarReader, \
from catalyst.exchange.exchange_bundle import ExchangeBundle, \
BUNDLE_NAME_TEMPLATE
from catalyst.exchange.utils.bundle_utils import get_bcolz_chunk, \
get_start_dt, get_df_from_arrays
get_df_from_arrays
from exchange.utils.datetime_utils import get_start_dt
from catalyst.exchange.utils.exchange_utils import get_exchange_folder
from catalyst.exchange.utils.factory import get_exchange
from catalyst.exchange.utils.stats_utils import df_to_string
+3 -1
View File
@@ -1,6 +1,8 @@
import pandas as pd
from logbook import Logger
from catalyst.testing import ZiplineTestCase
from catalyst.testing.fixtures import WithLogger
from .base import BaseExchangeTestCase
from catalyst.exchange.ccxt.ccxt_exchange import CCXT
from catalyst.exchange.exchange_execution import ExchangeLimitOrder
@@ -59,7 +61,7 @@ class TestCCXT(BaseExchangeTestCase):
freq='5T',
assets=[self.exchange.get_asset('eth_btc')],
bar_count=200,
start_dt=pd.to_datetime('2017-01-01', utc=True)
start_dt=pd.to_datetime('2017-09-01', utc=True)
)
for asset in candles:
+23 -13
View File
@@ -1,5 +1,6 @@
import random
import os
import pandas as pd
from logbook import TestHandler
from pandas.util.testing import assert_frame_equal
@@ -11,7 +12,6 @@ from catalyst.exchange.utils.exchange_utils import get_candles_df
from catalyst.exchange.utils.factory import get_exchange
from catalyst.exchange.utils.test_utils import output_df, \
select_random_assets
from catalyst.testing.fixtures import WithLogger, ZiplineTestCase
pd.set_option('display.expand_frame_repr', False)
pd.set_option('precision', 8)
@@ -19,7 +19,7 @@ pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
class TestSuiteBundle(WithLogger, ZiplineTestCase):
class TestSuiteBundle:
@staticmethod
def get_data_portal(exchanges):
open_calendar = get_calendar('OPEN')
@@ -46,7 +46,9 @@ class TestSuiteBundle(WithLogger, ZiplineTestCase):
assets
end_dt
bar_count
sample_minutes
freq
data_frequency
data_portal
Returns
-------
@@ -64,10 +66,6 @@ class TestSuiteBundle(WithLogger, ZiplineTestCase):
field='close',
data_frequency=data_frequency,
)
print('bundle data:\n{}'.format(
data['bundle'].tail(10))
)
candles = exchange.get_candles(
end_dt=end_dt,
freq=freq,
@@ -81,19 +79,31 @@ class TestSuiteBundle(WithLogger, ZiplineTestCase):
bar_count=bar_count,
end_dt=end_dt,
)
print('exchange data:\n{}'.format(
data['exchange'].tail(10))
)
for source in data:
df = data[source]
path = output_df(df, assets, '{}_{}'.format(freq, source))
print('saved {}:\n{}'.format(source, path))
path, folder = output_df(
df, assets, '{}_{}'.format(freq, source)
)
assert_frame_equal(
right=data['bundle'],
left=data['exchange'],
check_less_precise=True,
check_less_precise=1,
)
try:
assert_frame_equal(
right=data['bundle'],
left=data['exchange'],
check_less_precise=min([a.decimals for a in assets]),
)
except Exception as e:
print('Some differences were found within a 1 decimal point '
'interval of confidence: {}'.format(e))
with open(os.path.join(folder, 'compare.txt'), 'w+') as handle:
handle.write(e.args[0])
print('saved test results: {}'.format(folder))
pass
def test_validate_bundles(self):
# exchange_population = 3