mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-03 12:00:47 +08:00
TST: Download risk answer key from S3.
So that the answer key is not onerous on the SCM repo size, add a utility to download the answer key automatically. Prevent re-download on every test suite run if the local answer key matches the latest version.
This commit is contained in:
@@ -13,10 +13,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import requests
|
||||
|
||||
|
||||
def col_letter_to_index(col_letter):
|
||||
@@ -26,8 +28,81 @@ def col_letter_to_index(col_letter):
|
||||
|
||||
# Directory containing this module; the checksum list and the answer key
# are looked up next to it.
DIR = os.path.dirname(os.path.realpath(__file__))

# Text file of known answer-key md5 hex digests, one per line.
# ensure_latest_answer_key() treats the last line as the latest version.
ANSWER_KEY_CHECKSUMS_PATH = os.path.join(DIR, 'risk-answer-key-checksums')
# Read under a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open(ANSWER_KEY_CHECKSUMS_PATH, 'r') as _checksums_file:
    ANSWER_KEY_CHECKSUMS = _checksums_file.read().splitlines()

# Local location of the (downloaded or locally edited) answer key workbook.
ANSWER_KEY_PATH = os.path.join(DIR, 'risk-answer-key.xls')

# Snapshot taken at import time, before any download is attempted.
ANSWER_KEY_EXISTS = os.path.exists(ANSWER_KEY_PATH)

# Download URL, keyed by the md5 of the answer key version to fetch.
# NOTE(review): the '+' after {md5} looks unusual for an S3 key prefix and
# may be a diff-marker artifact -- confirm against the bucket layout.
ANSWER_KEY_DL_TEMPLATE = """
https://s3.amazonaws.com/zipline-test-data/risk/{md5}+/risk-answer-key.xls
""".strip()
|
||||
|
||||
|
||||
def _md5_of_file(path, chunk_size=64 * 1024):
    """Return the hex md5 digest of the file at `path`, read in chunks."""
    digest = hashlib.md5()
    # Binary mode: the answer key is an xls workbook, so text mode would
    # either translate bytes (Windows) or hand str to md5 (Python 3).
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def ensure_latest_answer_key():
    """
    Get the latest answer key from a publicly available location.

    Logic for determining what and when to download is as such:

    - If there is no local xls file, then get the latest answer key,
      as defined by the last row in the checksum file.
    - If there is a local xls file:
    -- If the xls's checksum is in the checksum file:
    --- If the xls's checksum does not match the latest, then grab the
        latest answer key and replace the local xls file.
    --- If the xls's checksum matches the latest, then skip download, and
        use the local xls as a cached copy.
    -- If the xls's checksum is not in the checksum file, then leave the
       local file alone, assuming that the local xls's md5 is not in the
       list due to local modifications during development.

    If the checksum file is empty there is no known version to fetch, so
    nothing is downloaded.

    It is possible that md5's could collide; if that is ever the case, we
    should then find an alternative naming scheme.

    The xls answer sheet is not kept in SCM, because its size is on the order
    of 20MB, and every edit would increase the repo size.

    xlsx and ods have smaller outputs and could be more friendly to SCM, but:
    - not using xlsx, because currently the xlsx that is generated by
      LibreOffice is not readable by the xlrd module.
    - not using ods, because of the lack of a module as facile as xlrd for
      extracting the data from the ods format.

    Raises
    ------
    requests.exceptions.HTTPError
        If the download responds with an error status; the local file is
        left untouched in that case.
    """
    if not ANSWER_KEY_CHECKSUMS:
        # No known versions listed, so there is nothing to compare against
        # or to download.
        return

    latest_checksum = ANSWER_KEY_CHECKSUMS[-1]

    if os.path.exists(ANSWER_KEY_PATH):
        local_hash = _md5_of_file(ANSWER_KEY_PATH)

        if local_hash == latest_checksum:
            # Cached copy is already the latest version.
            return

        if local_hash not in ANSWER_KEY_CHECKSUMS:
            # Assume local copy that is being developed on; never clobber it.
            return

        # Otherwise: a previously downloaded, now outdated version.
    # No local file at all falls through to the download as well.

    res = requests.get(ANSWER_KEY_DL_TEMPLATE.format(md5=latest_checksum))
    # Fail loudly rather than saving an HTTP error body as the answer key.
    res.raise_for_status()

    # Binary mode: res.content is the raw bytes of the workbook.
    with open(ANSWER_KEY_PATH, 'wb') as f:
        f.write(res.content)
|
||||
|
||||
# Get latest answer key on load, i.e. as a side effect of importing this
# module (the call runs at module level, not inside any function).
|
||||
ensure_latest_answer_key()
|
||||
|
||||
|
||||
class DataIndex(object):
|
||||
"""
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
3ac0773c4be4e9e5bacd9c6fa0e03e15
|
||||
Reference in New Issue
Block a user