From 1b7d0c9477d38f6e008f8a93f9b384cc1ca684ff Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Fri, 13 Nov 2015 17:08:04 -0500 Subject: [PATCH 1/3] MAINT: Add __future__ print function import. We do print(stock) in this file, which happens to work in py2, but is confusing. --- zipline/data/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zipline/data/loader.py b/zipline/data/loader.py index 2c42759a..31179094 100644 --- a/zipline/data/loader.py +++ b/zipline/data/loader.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os from collections import OrderedDict From 43ac9eab5cfa3b7b371cc0edd060933bd1d9e30c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Fri, 13 Nov 2015 17:08:35 -0500 Subject: [PATCH 2/3] ENH: Check `getmtime` on download locations. Rather than repeatedly try and fail to download data that's not yet available, only try to download again if we haven't successfully downloaded in the last hour. --- zipline/data/loader.py | 61 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/zipline/data/loader.py b/zipline/data/loader.py index 31179094..30e81dd8 100644 --- a/zipline/data/loader.py +++ b/zipline/data/loader.py @@ -48,6 +48,15 @@ INDEX_MAPPING = { (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'), } +ONE_HOUR = pd.Timedelta(hours=1) + + +def last_modified_time(path): + """ + Get the last modified time of path as a Timestamp. 
+ """ + return pd.Timestamp(os.path.getmtime(path), unit='s', tz='UTC') + def get_data_filepath(name): """ @@ -129,6 +138,7 @@ def load_market_data(trading_day=trading_day_nyse, '1year','2year','3year','5year','7year','10year','20year','30year' """ first_date = trading_days[0] + now = pd.Timestamp.utcnow() # We expect to have benchmark and treasury data that's current up until # **two** full trading days prior to the most recently completed trading @@ -144,14 +154,13 @@ def load_market_data(trading_day=trading_day_nyse, # We'll attempt to download new data if the latest entry in our cache is # before this date. - last_date = trading_days[ - trading_days.get_loc(pd.Timestamp.utcnow(), method='ffill') - 2 - ] + last_date = trading_days[trading_days.get_loc(now, method='ffill') - 2] benchmark_returns = ensure_benchmark_data( bm_symbol, first_date, last_date, + now, # We need the trading_day to figure out the close prior to the first # date so that we can compute returns for the first date. trading_day, @@ -160,11 +169,12 @@ def load_market_data(trading_day=trading_day_nyse, bm_symbol, first_date, last_date, + now, ) return benchmark_returns, treasury_curves -def ensure_benchmark_data(symbol, first_date, last_date, trading_day): +def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day): """ Ensure we have benchmark data for `symbol` from `first_date` to `last_date` @@ -176,6 +186,10 @@ def ensure_benchmark_data(symbol, first_date, last_date, trading_day): First required date for the cache. last_date : pd.Timestamp Last required date for the cache. + now : pd.Timestamp + The current time. This is used to prevent repeated attempts to + re-download data that isn't available due to scheduling quirks or other + failures. trading_day : pd.CustomBusinessDay A trading day delta. Used to find the day before first_date so we can get the close of the day prior to first_date. 
@@ -183,12 +197,28 @@ def ensure_benchmark_data(symbol, first_date, last_date, trading_day): We attempt to download data unless we already have data stored at the data cache for `symbol` whose first entry is before or on `first_date` and whose last entry is on or after `last_date`. + + If we perform a download and the cache criteria are not satisfied, we wait + at least one hour before attempting a redownload. This is determined by + comparing the current time to the result of os.path.getmtime on the cache + path. """ path = get_data_filepath(get_benchmark_filename(symbol)) try: data = pd.Series.from_csv(path).tz_localize('UTC') if has_data_for_dates(data, first_date, last_date): return data + + # Don't re-download if we've successfully downloaded and written a file + # in the last hour. + last_download_time = last_modified_time(path) + if (now - last_download_time) <= ONE_HOUR: + logger.warn( + "Refusing to download new benchmark " + "data because a download succeeded at %s." % last_download_time + ) + return data + except (OSError, IOError, ValueError) as e: # These can all be raised by various versions of pandas on various # classes of malformed input. Treat them all as cache misses. @@ -213,7 +243,7 @@ def ensure_benchmark_data(symbol, first_date, last_date, trading_day): return data -def ensure_treasury_data(bm_symbol, first_date, last_date): +def ensure_treasury_data(bm_symbol, first_date, last_date, now): """ Ensure we have treasury data from treasury module associated with `bm_symbol`. @@ -226,10 +256,19 @@ def ensure_treasury_data(bm_symbol, first_date, last_date): First date required to be in the cache. last_date : pd.Timestamp Last date required to be in the cache. + now : pd.Timestamp + The current time. This is used to prevent repeated attempts to + re-download data that isn't available due to scheduling quirks or other + failures. 
We attempt to download data unless we already have data stored in the cache for `module_name` whose first entry is before or on `first_date` and whose last entry is on or after `last_date`. + + If we perform a download and the cache criteria are not satisfied, we wait + at least one hour before attempting a redownload. This is determined by + comparing the current time to the result of os.path.getmtime on the cache + path. """ loader_module, filename, source = INDEX_MAPPING.get( bm_symbol, INDEX_MAPPING['^GSPC'] @@ -240,6 +279,17 @@ def ensure_treasury_data(bm_symbol, first_date, last_date): data = pd.DataFrame.from_csv(path).tz_localize('UTC') if has_data_for_dates(data, first_date, last_date): return data + + # Don't re-download if we've successfully downloaded and written a file + # in the last hour. + last_download_time = last_modified_time(path) + if (now - last_download_time) <= ONE_HOUR: + logger.warn( + "Refusing to download new treasury " + "data because a download succeeded at %s." % last_download_time + ) + return data + except (OSError, IOError, ValueError) as e: # These can all be raised by various versions of pandas on various # classes of malformed input. Treat them all as cache misses. @@ -274,7 +324,6 @@ def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None): This is based on code presented in a talk by Wes McKinney: http://wesmckinney.com/files/20111017/notebook_output.pdf """ - assert indexes is not None or stocks is not None, """ must specify stocks or indexes""" From 182cf9754c772ed907f7cbca64024a7bea111b2b Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Fri, 13 Nov 2015 18:12:10 -0500 Subject: [PATCH 3/3] DOC: Add a whatsnew. 
--- docs/source/whatsnew/0.8.4.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/whatsnew/0.8.4.txt b/docs/source/whatsnew/0.8.4.txt index 3e211876..6c493c52 100644 --- a/docs/source/whatsnew/0.8.4.txt +++ b/docs/source/whatsnew/0.8.4.txt @@ -69,3 +69,6 @@ Miscellaneous without ``nose_parameterized.expand`` which bloats the test output (:issue:`833`). * Limits timer report in test output to 15 longest tests (:issue:`838`). +* Treasury and benchmark downloads will now wait up to an hour to download + again if data returned from a remote source does not extend to the date + expected (:issue:`841`).