diff --git a/tests/risk/answer_key.py b/tests/risk/answer_key.py
index 5c300d72..7f3931ba 100644
--- a/tests/risk/answer_key.py
+++ b/tests/risk/answer_key.py
@@ -13,10 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import hashlib
 import os
 
 import numpy as np
 import xlrd
+import requests
 
 
 def col_letter_to_index(col_letter):
@@ -26,8 +28,89 @@ def col_letter_to_index(col_letter):
 
 DIR = os.path.dirname(os.path.realpath(__file__))
 
+ANSWER_KEY_CHECKSUMS_PATH = os.path.join(DIR, 'risk-answer-key-checksums')
+# Read through a context manager so the file handle is always closed.
+with open(ANSWER_KEY_CHECKSUMS_PATH, 'r') as checksums_file:
+    ANSWER_KEY_CHECKSUMS = checksums_file.read().splitlines()
+
 ANSWER_KEY_PATH = os.path.join(DIR, 'risk-answer-key.xls')
 
+ANSWER_KEY_EXISTS = os.path.exists(ANSWER_KEY_PATH)
+
+# NOTE(review): the '+' after {md5} is kept exactly as submitted; confirm
+# that it matches the S3 object layout.
+ANSWER_KEY_DL_TEMPLATE = """
+https://s3.amazonaws.com/zipline-test-data/risk/{md5}+/risk-answer-key.xls
+""".strip()
+
+
+def ensure_latest_answer_key():
+    """
+    Get the latest answer key from a publicly available location.
+
+    Logic for determining what and when to download is as such:
+
+    - If there is no local xls file, then get the latest answer key,
+    as defined by the last row in the checksum file.
+    - If there is a local xls file:
+    -- If the xls's checksum is in the checksum file:
+    --- If the xls's checksum does not match the latest, then download the
+    latest answer key and replace the local xls file.
+    --- If the xls's checksum matches the latest, then skip download, and
+    use the local xls as a cached copy.
+    -- If the xls's checksum is not in the checksum file, then leave the
+    local file alone, assuming that the local xls's md5 is not in the list due
+    to local modifications during development.
+
+    It is possible that md5's could collide; if that is ever the case, we
+    should then find an alternative naming scheme.
+
+    The xls answer sheet is not kept in SCM, because its size is on the order
+    of 20MB, and every edit would increase the repo size.
+
+    xlsx and ods have smaller outputs and could be more friendly to SCM, but:
+    - not using xlsx, because currently the xlsx that is generated by
+    LibreOffice is not readable by the xlrd module.
+    - not using ods, because of the lack of a module as facile as xlrd for
+    extracting the data from the ods format.
+
+    Raises whatever requests raises for a failed download, including
+    requests.exceptions.HTTPError on a non-2xx response.
+    """
+    answer_key_dl_checksum = None
+
+    if os.path.exists(ANSWER_KEY_PATH):
+        # The xls is binary data: hash it in 'rb' mode so no newline
+        # translation or text decoding alters the bytes fed to md5.
+        md5 = hashlib.md5()
+        with open(ANSWER_KEY_PATH, 'rb') as f:
+            for buf in iter(lambda: f.read(1024), b''):
+                md5.update(buf)
+        local_hash = md5.hexdigest()
+
+        if local_hash in ANSWER_KEY_CHECKSUMS:
+            # Assume previously downloaded version.
+            # Check for latest.
+            if local_hash != ANSWER_KEY_CHECKSUMS[-1]:
+                # More recent checksum, download
+                answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
+        # Otherwise assume a local copy that is being developed on and
+        # leave it untouched.
+    elif ANSWER_KEY_CHECKSUMS:
+        # No local answer key at all: fetch the latest one.
+        answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
+
+    if answer_key_dl_checksum:
+        res = requests.get(
+            ANSWER_KEY_DL_TEMPLATE.format(md5=answer_key_dl_checksum))
+        # Fail loudly instead of saving an HTTP error page as the xls.
+        res.raise_for_status()
+        with open(ANSWER_KEY_PATH, 'wb') as f:
+            f.write(res.content)
+
+# Get latest answer key on load.
+ensure_latest_answer_key()
+
 
 class DataIndex(object):
     """
diff --git a/tests/risk/risk-answer-key-checksums b/tests/risk/risk-answer-key-checksums
new file mode 100644
index 00000000..f1ec4cc6
--- /dev/null
+++ b/tests/risk/risk-answer-key-checksums
@@ -0,0 +1 @@
+3ac0773c4be4e9e5bacd9c6fa0e03e15