mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-28 11:51:47 +08:00
BUG/MAINT: Switch over to Google for benchmarking
MAINT: Remove mentions of Yahoo & ^GSPC
MAINT: Fill in missing dates
MAINT/BLD: Rebuild example data to match new benchmark
This commit is contained in:
Binary file not shown.
@@ -5,6 +5,7 @@ import shutil
|
||||
import tarfile
|
||||
|
||||
import click
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
@@ -14,6 +15,8 @@ from zipline.testing import test_resource_path, tmp_dir
|
||||
from zipline.utils.cache import dataframe_cache
|
||||
|
||||
|
||||
matplotlib.use('Agg')
|
||||
|
||||
INPUT_DATA_START_DATE = pd.Timestamp('2004-01-02')
|
||||
INPUT_DATA_END_DATE = pd.Timestamp('2014-12-31')
|
||||
INPUT_DATA_SYMBOLS = (
|
||||
|
||||
@@ -54,8 +54,6 @@ class ExamplesTests(WithTmpDir, ZiplineTestCase):
|
||||
serialization='pickle',
|
||||
)
|
||||
|
||||
copy_market_data(WithTradingEnvironment.MARKET_DATA_DIR,
|
||||
cls.tmpdir.getpath('example_data/root'))
|
||||
|
||||
@parameterized.expand(examples.EXAMPLE_MODULES)
|
||||
def test_example(self, example_name):
|
||||
|
||||
+34
-36
@@ -12,50 +12,48 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from six.moves.urllib_parse import urlencode
|
||||
import pandas_datareader.data as pd_reader
|
||||
|
||||
|
||||
def format_yahoo_index_url(symbol, start_date, end_date):
|
||||
def get_benchmark_returns(symbol, first_date, last_date):
|
||||
"""
|
||||
Format a URL for querying Yahoo Finance for Index data.
|
||||
"""
|
||||
return (
|
||||
'https://ichart.finance.yahoo.com/table.csv?' + urlencode({
|
||||
's': symbol,
|
||||
# start_date month, zero indexed
|
||||
'a': start_date.month - 1,
|
||||
# start_date day
|
||||
'b': start_date.day,
|
||||
# start_date year
|
||||
'c': start_date.year,
|
||||
# end_date month, zero indexed
|
||||
'd': end_date.month - 1,
|
||||
# end_date day
|
||||
'e': end_date.day,
|
||||
# end_date year
|
||||
'f': end_date.year,
|
||||
# daily frequency
|
||||
'g': 'd',
|
||||
})
|
||||
)
|
||||
Get a Series of benchmark returns from Google associated with `symbol`.
|
||||
`symbol` has no default in this signature; `load_market_data` defaults its `bm_symbol` to `SPY`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
symbol : str
|
||||
Benchmark symbol for which we're getting the returns.
|
||||
first_date : pd.Timestamp
|
||||
First date for which we want to get data.
|
||||
last_date : pd.Timestamp
|
||||
Last date for which we want to get data.
|
||||
|
||||
def get_benchmark_returns(symbol, start_date, end_date):
|
||||
"""
|
||||
Get a Series of benchmark returns from Yahoo.
|
||||
The furthest date that Google goes back to is 2001-06-26. It has missing
|
||||
data for 2008-12-15, 2009-08-11, and 2012-02-02, so we add data for the
|
||||
dates for which Google is missing data.
|
||||
|
||||
Returns a Series with returns from (start_date, end_date].
|
||||
We're also limited to the last 4000 trading days' worth of data.
|
||||
|
||||
start_date is **not** included because we need the close from day N - 1 to
|
||||
first_date is **not** included because we need the close from day N - 1 to
|
||||
compute the returns for day N.
|
||||
"""
|
||||
return pd.read_csv(
|
||||
format_yahoo_index_url(symbol, start_date, end_date),
|
||||
parse_dates=['Date'],
|
||||
index_col='Date',
|
||||
usecols=["Adj Close", "Date"],
|
||||
squeeze=True, # squeeze tells pandas to make this a Series
|
||||
# instead of a 1-column DataFrame
|
||||
).sort_index().tz_localize('UTC').pct_change(1).iloc[1:]
|
||||
data = pd_reader.DataReader(
|
||||
symbol,
|
||||
'google',
|
||||
first_date,
|
||||
last_date
|
||||
)
|
||||
|
||||
data = data['Close']
|
||||
|
||||
data[pd.Timestamp('2008-12-15')] = np.nan
|
||||
data[pd.Timestamp('2009-08-11')] = np.nan
|
||||
data[pd.Timestamp('2012-02-02')] = np.nan
|
||||
|
||||
data = data.fillna(method='ffill')
|
||||
|
||||
return data.sort_index().tz_localize('UTC').pct_change(1).iloc[1:]
|
||||
|
||||
+20
-10
@@ -35,7 +35,7 @@ logger = logbook.Logger('Loader')
|
||||
|
||||
# Mapping from index symbol to appropriate bond data
|
||||
INDEX_MAPPING = {
|
||||
'^GSPC':
|
||||
'SPY':
|
||||
(treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'),
|
||||
'^GSPTSE':
|
||||
(treasuries_can, 'treasury_curves_can.csv', 'bankofcanada.ca'),
|
||||
@@ -91,13 +91,13 @@ def has_data_for_dates(series_or_df, first_date, last_date):
|
||||
return (first <= first_date) and (last >= last_date)
|
||||
|
||||
|
||||
def load_market_data(trading_day=None, trading_days=None, bm_symbol='^GSPC',
|
||||
def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
|
||||
environ=None):
|
||||
"""
|
||||
Load benchmark returns and treasury yield curves for the given calendar and
|
||||
benchmark symbol.
|
||||
|
||||
Benchmarks are downloaded as a Series from Yahoo Finance. Treasury curves
|
||||
Benchmarks are downloaded as a Series from Google Finance. Treasury curves
|
||||
are US Treasury Bond rates and are downloaded from 'www.federalreserve.gov'
|
||||
by default. For Canadian exchanges, a loader for Canadian bonds from the
|
||||
Bank of Canada is also available.
|
||||
@@ -115,7 +115,7 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='^GSPC',
|
||||
A calendar of trading days. Also used for determining which
|
||||
dates we should expect to have cached. Defaults to the NYSE calendar.
|
||||
bm_symbol : str, optional
|
||||
Symbol for the benchmark index to load. Defaults to '^GSPC', the Yahoo
|
||||
Symbol for the benchmark index to load. Defaults to 'SPY', the Google
|
||||
ticker for the S&P 500.
|
||||
|
||||
Returns
|
||||
@@ -136,7 +136,10 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='^GSPC',
|
||||
if trading_days is None:
|
||||
trading_days = get_calendar('NYSE').all_sessions
|
||||
|
||||
first_date = trading_days[0]
|
||||
# We want the latest 4000 trading days
|
||||
# because Google Finance only allows downloading data
|
||||
# up to the 4000 latest trading days
|
||||
first_date = trading_days[-4000]
|
||||
now = pd.Timestamp.utcnow()
|
||||
|
||||
# We expect to have benchmark and treasury data that's current up until
|
||||
@@ -215,7 +218,13 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day,
|
||||
|
||||
# If no cached data was found or it was missing any dates then download the
|
||||
# necessary data.
|
||||
logger.info('Downloading benchmark data for {symbol!r}.', symbol=symbol)
|
||||
logger.info(
|
||||
('Downloading benchmark data for {symbol!r} '
|
||||
'from {first_date} to {last_date}'),
|
||||
symbol=symbol,
|
||||
first_date=first_date - trading_day,
|
||||
last_date=last_date
|
||||
)
|
||||
|
||||
try:
|
||||
data = get_benchmark_returns(
|
||||
@@ -225,7 +234,7 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day,
|
||||
)
|
||||
data.to_csv(get_data_filepath(filename, environ))
|
||||
except (OSError, IOError, HTTPError):
|
||||
logger.exception('failed to cache the new benchmark returns')
|
||||
logger.exception('Failed to cache the new benchmark returns')
|
||||
raise
|
||||
if not has_data_for_dates(data, first_date, last_date):
|
||||
logger.warn("Still don't have expected data after redownload!")
|
||||
@@ -260,7 +269,7 @@ def ensure_treasury_data(symbol, first_date, last_date, now, environ=None):
|
||||
path.
|
||||
"""
|
||||
loader_module, filename, source = INDEX_MAPPING.get(
|
||||
symbol, INDEX_MAPPING['^GSPC'],
|
||||
symbol, INDEX_MAPPING['SPY'],
|
||||
)
|
||||
first_date = max(first_date, loader_module.earliest_possible_date())
|
||||
|
||||
@@ -297,7 +306,8 @@ def _load_cached_data(filename, first_date, last_date, now, resource_name,
|
||||
# yet, so don't try to read from 'path'.
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
data = from_csv(path).tz_localize('UTC')
|
||||
data = from_csv(path)
|
||||
data.index = data.index.to_datetime().tz_localize('UTC')
|
||||
if has_data_for_dates(data, first_date, last_date):
|
||||
return data
|
||||
|
||||
@@ -335,7 +345,7 @@ def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
|
||||
"""Load closing prices from yahoo finance.
|
||||
|
||||
:Optional:
|
||||
indexes : dict (Default: {'SPX': '^GSPC'})
|
||||
indexes : dict (Default: {'SPX': '^SPY'})
|
||||
Financial indexes to load.
|
||||
stocks : list (Default: ['AAPL', 'GE', 'IBM', 'MSFT',
|
||||
'XOM', 'AA', 'JNJ', 'PEP', 'KO'])
|
||||
|
||||
@@ -78,7 +78,7 @@ class TradingEnvironment(object):
|
||||
def __init__(
|
||||
self,
|
||||
load=None,
|
||||
bm_symbol='^GSPC',
|
||||
bm_symbol='SPY',
|
||||
exchange_tz="US/Eastern",
|
||||
trading_calendar=None,
|
||||
asset_db_path=':memory:',
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1493,7 +1493,7 @@ def patch_read_csv(url_map, module=pd, strict=False):
|
||||
|
||||
|
||||
def copy_market_data(src_market_data_dir, dest_root_dir):
|
||||
symbol = '^GSPC'
|
||||
symbol = 'SPY'
|
||||
filenames = (get_benchmark_filename(symbol), INDEX_MAPPING[symbol][1])
|
||||
|
||||
ensure_directory(os.path.join(dest_root_dir, 'data'))
|
||||
|
||||
@@ -500,7 +500,7 @@ class WithTradingEnvironment(WithAssetFinder,
|
||||
@classmethod
|
||||
def make_load_function(cls):
|
||||
def load(*args, **kwargs):
|
||||
symbol = '^GSPC'
|
||||
symbol = 'SPY'
|
||||
|
||||
filename = get_benchmark_filename(symbol)
|
||||
source_path = os.path.join(cls.MARKET_DATA_DIR, filename)
|
||||
|
||||
Reference in New Issue
Block a user