Added data_scraper

This commit is contained in:
Juan Pablo Amoroso
2019-05-13 15:27:17 -03:00
parent 6b0f7e2d32
commit 34925b8bee
26 changed files with 5573 additions and 1 deletions
+9
View File
@@ -102,3 +102,12 @@ venv.bak/
# mypy
.mypy_cache/
# exclude data from source control by default
data/
# Mac OS-specific storage files
.DS_Store
# VS Code
.vscode/
+41
View File
@@ -0,0 +1,41 @@
# Every target in this file is a command rather than a file to build, so all
# of them are declared phony. Without this, a file or directory that happens to
# share a target's name (e.g. `ops` or `stop`) would make the target a no-op.
.PHONY: image ops stop init env testdata test test_scraper scrape aggregate backup bench

# Build the data_scraper Docker image from its Dockerfile.
image:
	docker build -t data_scraper -f ./docker/data_scraper/Dockerfile .

# Start the services described in docker/docker-compose.yml in the background.
ops:
	docker-compose -f ./docker/docker-compose.yml up -d

# Stop and remove the services started by `make ops`.
stop:
	docker-compose -f ./docker/docker-compose.yml down

# Create a Python 3 pipenv environment and install the project dependencies.
init:
	pipenv --three && pipenv install

# Open a shell inside the pipenv environment.
env:
	pipenv shell

# Generate the data files used by the backtester tests.
testdata:
	pipenv run python backtester/test/create_test_data.py

# Run the backtester unit tests.
test:
	pipenv run python -m unittest discover -s backtester/test

# Run the data_scraper unit tests.
test_scraper:
	pipenv run python -m unittest discover -s data_scraper

# Run the data scraper verbosely. Select a scraper with `make scrape scraper=<name>`;
# when `scraper` is not set, the data_scraper module's own default is used.
scrape:
ifdef scraper
	pipenv run python -m data_scraper -s $(scraper) -v
else
	pipenv run python -m data_scraper -v
endif

# Run the data scraper in aggregate mode (-a).
aggregate:
	pipenv run python -m data_scraper -a

# Run the data scraper in backup mode (-b).
backup:
	pipenv run python -m data_scraper -b

# Run the backtester benchmark script.
bench:
	pipenv run python backtester/test/run_benchmark.py
+17
View File
@@ -0,0 +1,17 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
"beautifulsoup4" = "*"
requests = "*"
pandas = "*"
altair = "*"
jupyter = "*"
pandas-datareader = "*"
pandas-market-calendars = "*"
boto3 = "*"
[dev-packages]
yapf = "*"
Generated
+613
View File
@@ -0,0 +1,613 @@
{
"_meta": {
"hash": {
"sha256": "84e05b94cbc679483f07c28e69f72f29529e1dc9564069112070850467a18431"
},
"pipfile-spec": 6,
"requires": {},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"altair": {
"hashes": [
"sha256:63934563a7a7b7186335858206a0b9be6043163b8b54a26cd3b3299a9e5e391f",
"sha256:65e243afa6da5b746c411890fd7dfd0f187c0f8e581cf3c34b07339712cf6627"
],
"index": "pypi",
"version": "==3.0.1"
},
"appnope": {
"hashes": [
"sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
"sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
],
"markers": "sys_platform == 'darwin'",
"version": "==0.1.0"
},
"attrs": {
"hashes": [
"sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
"sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
],
"version": "==19.1.0"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
},
"beautifulsoup4": {
"hashes": [
"sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
],
"index": "pypi",
"version": "==4.7.1"
},
"bleach": {
"hashes": [
"sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16",
"sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa"
],
"version": "==3.1.0"
},
"boto3": {
"hashes": [
"sha256:bc6635529f85cf130813fda973a2a5240f54532c7e284b04f215dcee9721bbb8",
"sha256:d60329ff211a496016c334dbecf7ab70500b19aee013d2435b4f30d9e0a84ca1"
],
"index": "pypi",
"version": "==1.9.146"
},
"botocore": {
"hashes": [
"sha256:1517c52eaa3056d0e81f9a81b580d7f28440e7e1523d10a8acc8160c56be7113",
"sha256:19d9d56fcf4f16ffea8a929bbf3c72db3458b6c1f306c04031f3166759cd62ac"
],
"version": "==1.12.146"
},
"certifi": {
"hashes": [
"sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5",
"sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae"
],
"version": "==2019.3.9"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"decorator": {
"hashes": [
"sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
"sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
],
"version": "==4.4.0"
},
"defusedxml": {
"hashes": [
"sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93",
"sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5"
],
"version": "==0.6.0"
},
"docutils": {
"hashes": [
"sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
"sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274",
"sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"
],
"version": "==0.14"
},
"entrypoints": {
"hashes": [
"sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
"sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
],
"version": "==0.3"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"ipykernel": {
"hashes": [
"sha256:0aeb7ec277ac42cc2b59ae3d08b10909b2ec161dc6908096210527162b53675d",
"sha256:0fc0bf97920d454102168ec2008620066878848fcfca06c22b669696212e292f"
],
"version": "==5.1.0"
},
"ipython": {
"hashes": [
"sha256:54c5a8aa1eadd269ac210b96923688ccf01ebb2d0f21c18c3c717909583579a8",
"sha256:e840810029224b56cd0d9e7719dc3b39cf84d577f8ac686547c8ba7a06eeab26"
],
"markers": "python_version >= '3.3'",
"version": "==7.5.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"ipywidgets": {
"hashes": [
"sha256:0f2b5cde9f272cb49d52f3f0889fdd1a7ae1e74f37b48dac35a83152780d2b7b",
"sha256:a3e224f430163f767047ab9a042fc55adbcab0c24bbe6cf9f306c4f89fdf0ba3"
],
"version": "==7.4.2"
},
"jedi": {
"hashes": [
"sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
"sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
],
"version": "==0.13.3"
},
"jinja2": {
"hashes": [
"sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013",
"sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b"
],
"version": "==2.10.1"
},
"jmespath": {
"hashes": [
"sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6",
"sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c"
],
"version": "==0.9.4"
},
"jsonschema": {
"hashes": [
"sha256:0c0a81564f181de3212efa2d17de1910f8732fa1b71c42266d983cd74304e20d",
"sha256:a5f6559964a3851f59040d3b961de5e68e70971afb88ba519d27e6a039efff1a"
],
"version": "==3.0.1"
},
"jupyter": {
"hashes": [
"sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7",
"sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78",
"sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f"
],
"index": "pypi",
"version": "==1.0.0"
},
"jupyter-client": {
"hashes": [
"sha256:b5f9cb06105c1d2d30719db5ffb3ea67da60919fb68deaefa583deccd8813551",
"sha256:c44411eb1463ed77548bc2d5ec0d744c9b81c4a542d9637c7a52824e2121b987"
],
"version": "==5.2.4"
},
"jupyter-console": {
"hashes": [
"sha256:308ce876354924fb6c540b41d5d6d08acfc946984bf0c97777c1ddcb42e0b2f5",
"sha256:cc80a97a5c389cbd30252ffb5ce7cefd4b66bde98219edd16bf5cb6f84bb3568"
],
"version": "==6.0.0"
},
"jupyter-core": {
"hashes": [
"sha256:927d713ffa616ea11972534411544589976b2493fc7e09ad946e010aa7eb9970",
"sha256:ba70754aa680300306c699790128f6fbd8c306ee5927976cbe48adacf240c0b7"
],
"version": "==4.4.0"
},
"lxml": {
"hashes": [
"sha256:03984196d00670b2ab14ae0ea83d5cc0cfa4f5a42558afa9ab5fa745995328f5",
"sha256:0815b0c9f897468de6a386dc15917a0becf48cc92425613aa8bbfc7f0f82951f",
"sha256:175f3825f075cf02d15099eb52658457cf0ff103dcf11512b5d2583e1d40f58b",
"sha256:30e14c62d88d1e01a26936ecd1c6e784d4afc9aa002bba4321c5897937112616",
"sha256:3210da6f36cf4b835ff1be853962b22cc354d506f493b67a4303c88bbb40d57b",
"sha256:40f60819fbd5bad6e191ba1329bfafa09ab7f3f174b3d034d413ef5266963294",
"sha256:43b26a865a61549919f8a42e094dfdb62847113cf776d84bd6b60e4e3fc20ea3",
"sha256:4a03dd682f8e35a10234904e0b9508d705ff98cf962c5851ed052e9340df3d90",
"sha256:62f382cddf3d2e52cf266e161aa522d54fd624b8cc567bc18f573d9d50d40e8e",
"sha256:7b98f0325be8450da70aa4a796c4f06852949fe031878b4aa1d6c417a412f314",
"sha256:846a0739e595871041385d86d12af4b6999f921359b38affb99cdd6b54219a8f",
"sha256:a3080470559938a09a5d0ec558c005282e99ac77bf8211fb7b9a5c66390acd8d",
"sha256:ad841b78a476623955da270ab8d207c3c694aa5eba71f4792f65926dc46c6ee8",
"sha256:afdd75d9735e44c639ffd6258ce04a2de3b208f148072c02478162d0944d9da3",
"sha256:b4fbf9b552faff54742bcd0791ab1da5863363fb19047e68f6592be1ac2dab33",
"sha256:b90c4e32d6ec089d3fa3518436bdf5ce4d902a0787dbd9bb09f37afe8b994317",
"sha256:b91cfe4438c741aeff662d413fd2808ac901cc6229c838236840d11de4586d63",
"sha256:bdb0593a42070b0a5f138b79b872289ee73c8e25b3f0bea6564e795b55b6bcdd",
"sha256:c4e4bca2bb68ce22320297dfa1a7bf070a5b20bcbaec4ee023f83d2f6e76496f",
"sha256:cec4ab14af9eae8501be3266ff50c3c2aecc017ba1e86c160209bb4f0423df6a",
"sha256:e83b4b2bf029f5104bc1227dbb7bf5ace6fd8fabaebffcd4f8106fafc69fc45f",
"sha256:e995b3734a46d41ae60b6097f7c51ba9958648c6d1e0935b7e0ee446ee4abe22",
"sha256:f679d93dec7f7210575c85379a31322df4c46496f184ef650d3aba1484b38a2d",
"sha256:fd213bb5166e46974f113c8228daaef1732abc47cb561ce9c4c8eaed4bd3b09b",
"sha256:fdcb57b906dbc1f80666e6290e794ab8fb959a2e17aa5aee1758a85d1da4533f",
"sha256:ff424b01d090ffe1947ec7432b07f536912e0300458f9a7f48ea217dd8362b86"
],
"version": "==4.3.3"
},
"markupsafe": {
"hashes": [
"sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
"sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
"sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
"sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
"sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
"sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
"sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
"sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
"sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
"sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
"sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
"sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
],
"version": "==1.1.1"
},
"mistune": {
"hashes": [
"sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e",
"sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"
],
"version": "==0.8.4"
},
"nbconvert": {
"hashes": [
"sha256:138381baa41d83584459b5cfecfc38c800ccf1f37d9ddd0bd440783346a4c39c",
"sha256:4a978548d8383f6b2cfca4a3b0543afb77bc7cb5a96e8b424337ab58c12da9bc"
],
"version": "==5.5.0"
},
"nbformat": {
"hashes": [
"sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b",
"sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402"
],
"version": "==4.4.0"
},
"notebook": {
"hashes": [
"sha256:573e0ae650c5d76b18b6e564ba6d21bf321d00847de1d215b418acb64f056eb8",
"sha256:f64fa6624d2323fbef6210a621817d6505a45d0d4a9367f1843b20a38a4666ee"
],
"version": "==5.7.8"
},
"numpy": {
"hashes": [
"sha256:0e2eed77804b2a6a88741f8fcac02c5499bba3953ec9c71e8b217fad4912c56c",
"sha256:1c666f04553ef70fda54adf097dbae7080645435fc273e2397f26bbf1d127bbb",
"sha256:1f46532afa7b2903bfb1b79becca2954c0a04389d19e03dc73f06b039048ac40",
"sha256:315fa1b1dfc16ae0f03f8fd1c55f23fd15368710f641d570236f3d78af55e340",
"sha256:3d5fcea4f5ed40c3280791d54da3ad2ecf896f4c87c877b113576b8280c59441",
"sha256:48241759b99d60aba63b0e590332c600fc4b46ad597c9b0a53f350b871ef0634",
"sha256:4b4f2924b36d857cf302aec369caac61e43500c17eeef0d7baacad1084c0ee84",
"sha256:54fe3b7ed9e7eb928bbc4318f954d133851865f062fa4bbb02ef8940bc67b5d2",
"sha256:5a8f021c70e6206c317974c93eaaf9bc2b56295b6b1cacccf88846e44a1f33fc",
"sha256:754a6be26d938e6ca91942804eb209307b73f806a1721176278a6038869a1686",
"sha256:771147e654e8b95eea1293174a94f34e2e77d5729ad44aefb62fbf8a79747a15",
"sha256:78a6f89da87eeb48014ec652a65c4ffde370c036d780a995edaeb121d3625621",
"sha256:7fde5c2a3a682a9e101e61d97696687ebdba47637611378b4127fe7e47fdf2bf",
"sha256:80d99399c97f646e873dd8ce87c38cfdbb668956bbc39bc1e6cac4b515bba2a0",
"sha256:88a72c1e45a0ae24d1f249a529d9f71fe82e6fa6a3fd61414b829396ec585900",
"sha256:a4f4460877a16ac73302a9c077ca545498d9fe64e6a81398d8e1a67e4695e3df",
"sha256:a61255a765b3ac73ee4b110b28fccfbf758c985677f526c2b4b39c48cc4b509d",
"sha256:ab4896a8c910b9a04c0142871d8800c76c8a2e5ff44763513e1dd9d9631ce897",
"sha256:abbd6b1c2ef6199f4b7ca9f818eb6b31f17b73a6110aadc4e4298c3f00fab24e",
"sha256:b16d88da290334e33ea992c56492326ea3b06233a00a1855414360b77ca72f26",
"sha256:b78a1defedb0e8f6ae1eb55fa6ac74ab42acc4569c3a2eacc2a407ee5d42ebcb",
"sha256:cfef82c43b8b29ca436560d51b2251d5117818a8d1fb74a8384a83c096745dad",
"sha256:d160e57731fcdec2beda807ebcabf39823c47e9409485b5a3a1db3a8c6ce763e"
],
"version": "==1.16.3"
},
"pandas": {
"hashes": [
"sha256:071e42b89b57baa17031af8c6b6bbd2e9a5c68c595bc6bf9adabd7a9ed125d3b",
"sha256:17450e25ae69e2e6b303817bdf26b2cd57f69595d8550a77c308be0cd0fd58fa",
"sha256:17916d818592c9ec891cbef2e90f98cc85e0f1e89ed0924c9b5220dc3209c846",
"sha256:2538f099ab0e9f9c9d09bbcd94b47fd889bad06dc7ae96b1ed583f1dc1a7a822",
"sha256:366f30710172cb45a6b4f43b66c220653b1ea50303fbbd94e50571637ffb9167",
"sha256:42e5ad741a0d09232efbc7fc648226ed93306551772fc8aecc6dce9f0e676794",
"sha256:4e718e7f395ba5bfe8b6f6aaf2ff1c65a09bb77a36af6394621434e7cc813204",
"sha256:4f919f409c433577a501e023943e582c57355d50a724c589e78bc1d551a535a2",
"sha256:4fe0d7e6438212e839fc5010c78b822664f1a824c0d263fd858f44131d9166e2",
"sha256:5149a6db3e74f23dc3f5a216c2c9ae2e12920aa2d4a5b77e44e5b804a5f93248",
"sha256:627594338d6dd995cfc0bacd8e654cd9e1252d2a7c959449228df6740d737eb8",
"sha256:83c702615052f2a0a7fb1dd289726e29ec87a27272d775cb77affe749cca28f8",
"sha256:8c872f7fdf3018b7891e1e3e86c55b190e6c5cee70cab771e8f246c855001296",
"sha256:90f116086063934afd51e61a802a943826d2aac572b2f7d55caaac51c13db5b5",
"sha256:a3352bacac12e1fc646213b998bce586f965c9d431773d9e91db27c7c48a1f7d",
"sha256:bcdd06007cca02d51350f96debe51331dec429ac8f93930a43eb8fb5639e3eb5",
"sha256:c1bd07ebc15285535f61ddd8c0c75d0d6293e80e1ee6d9a8d73f3f36954342d0",
"sha256:c9a4b7c55115eb278c19aa14b34fcf5920c8fe7797a09b7b053ddd6195ea89b3",
"sha256:cc8fc0c7a8d5951dc738f1c1447f71c43734244453616f32b8aa0ef6013a5dfb",
"sha256:d7b460bc316064540ce0c41c1438c416a40746fd8a4fb2999668bf18f3c4acf1"
],
"index": "pypi",
"version": "==0.24.2"
},
"pandas-datareader": {
"hashes": [
"sha256:6a5ad8c9ca27af148d06ac8eb526914cc12d04ae1d93af423d173279e2226c46",
"sha256:7dee3fe6fa483c8c2ee4f1af91a65b542c5446d75a6fc25c832cad1ffca8ef0b"
],
"index": "pypi",
"version": "==0.7.0"
},
"pandas-market-calendars": {
"hashes": [
"sha256:828178d434173f439553fa33ed3b9d40d70202f2cbf4247bb3fd42e5fcbb5acd",
"sha256:f8fb120654314bb8f2ac447ec5a2702926409df47876cd0b8983285ee9217088"
],
"index": "pypi",
"version": "==1.1"
},
"pandocfilters": {
"hashes": [
"sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9"
],
"version": "==1.4.2"
},
"parso": {
"hashes": [
"sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
"sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
],
"version": "==0.4.0"
},
"pexpect": {
"hashes": [
"sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
"sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
],
"markers": "sys_platform != 'win32'",
"version": "==4.7.0"
},
"pickleshare": {
"hashes": [
"sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
"sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
],
"version": "==0.7.5"
},
"prometheus-client": {
"hashes": [
"sha256:1b38b958750f66f208bcd9ab92a633c0c994d8859c831f7abc1f46724fcee490"
],
"version": "==0.6.0"
},
"prompt-toolkit": {
"hashes": [
"sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
"sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
"sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
],
"version": "==2.0.9"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"markers": "os_name != 'nt'",
"version": "==0.6.0"
},
"pygments": {
"hashes": [
"sha256:31cba6ffb739f099a85e243eff8cb717089fdd3c7300767d9fc34cb8e1b065f5",
"sha256:5ad302949b3c98dd73f8d9fcdc7e9cb592f120e32a18e23efd7f3dc51194472b"
],
"version": "==2.4.0"
},
"pyrsistent": {
"hashes": [
"sha256:16692ee739d42cf5e39cef8d27649a8c1fdb7aa99887098f1460057c5eb75c3a"
],
"version": "==0.15.2"
},
"python-dateutil": {
"hashes": [
"sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
"sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
],
"markers": "python_version >= '2.7'",
"version": "==2.8.0"
},
"pytz": {
"hashes": [
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
"sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
],
"version": "==2019.1"
},
"pyzmq": {
"hashes": [
"sha256:1651e52ed91f0736afd6d94ef9f3259b5534ce8beddb054f3d5ca989c4ef7c4f",
"sha256:5ccb9b3d4cd20c000a9b75689d5add8cd3bce67fcbd0f8ae1b59345247d803af",
"sha256:5e120c4cd3872e332fb35d255ad5998ebcee32ace4387b1b337416b6b90436c7",
"sha256:5e2a3707c69a7281a9957f83718815fd74698cba31f6d69f9ed359921f662221",
"sha256:63d51add9af8d0442dc90f916baf98fdc04e3b0a32afec4bfc83f8d85e72959f",
"sha256:65c5a0bdc49e20f7d6b03a661f71e2fda7a99c51270cafe71598146d09810d0d",
"sha256:66828fabe911aa545d919028441a585edb7c9c77969a5fea6722ef6e6ece38ab",
"sha256:7d79427e82d9dad6e9b47c0b3e7ae5f9d489b1601e3a36ea629bb49501a4daf3",
"sha256:824ee5d3078c4eae737ffc500fbf32f2b14e6ec89b26b435b7834febd70120cf",
"sha256:89dc0a83cccec19ff3c62c091e43e66e0183d1e6b4658c16ee4e659518131494",
"sha256:8b319805f6f7c907b101c864c3ca6cefc9db8ce0791356f180b1b644c7347e4c",
"sha256:90facfb379ab47f94b19519c1ecc8ec8d10813b69d9c163117944948bdec5d15",
"sha256:a0a178c7420021fc0730180a914a4b4b3092ce9696ceb8e72d0f60f8ce1655dd",
"sha256:a7a89591ae315baccb8072f216614b3e59aed7385aef4393a6c741783d6ee9cf",
"sha256:ba2578f0ae582452c02ed9fac2dc477b08e80ce05d2c0885becf5fff6651ccb0",
"sha256:c69b0055c55702f5b0b6b354133e8325b9a56dbc80e1be2d240bead253fb9825",
"sha256:ca434e1858fe222380221ddeb81e86f45522773344c9da63c311d17161df5e06",
"sha256:d4b8ecfc3d92f114f04d5c40f60a65e5196198b827503341521dda12d8b14939",
"sha256:d706025c47b09a54f005953ebe206f6d07a22516776faa4f509aaff681cc5468",
"sha256:d8f27e958f8a2c0c8ffd4d8855c3ce8ac3fa1e105f0491ce31729aa2b3229740",
"sha256:dbd264298f76b9060ce537008eb989317ca787c857e23cbd1b3ddf89f190a9b1",
"sha256:e926d66f0df8fdbf03ba20583af0f215e475c667fb033d45fd031c66c63e34c9",
"sha256:efc3bd48237f973a749f7312f68062f1b4ca5c2032a0673ca3ea8e46aa77187b",
"sha256:f59bc782228777cbfe04555707a9c56d269c787ed25d6d28ed9d0fbb41cb1ad2",
"sha256:f8da5322f4ff5f667a0d5a27e871b560c6637153c81e318b35cb012b2a98835c"
],
"version": "==18.0.1"
},
"qtconsole": {
"hashes": [
"sha256:a667558c7b1e1442a2e5bcef1686c55e096efd0b58d8b2a0a8415f4579991ee3",
"sha256:fdfc6002d9d2834c88f9c92e0f6f590284ff3740fa53016f188a62d58bcca6d8"
],
"version": "==4.4.4"
},
"requests": {
"hashes": [
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
"sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
],
"index": "pypi",
"version": "==2.21.0"
},
"s3transfer": {
"hashes": [
"sha256:7b9ad3213bff7d357f888e0fab5101b56fa1a0548ee77d121c3a3dbfbef4cb2e",
"sha256:f23d5cb7d862b104401d9021fc82e5fa0e0cf57b7660a1331425aab0c691d021"
],
"version": "==0.2.0"
},
"send2trash": {
"hashes": [
"sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2",
"sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"
],
"version": "==1.5.0"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"soupsieve": {
"hashes": [
"sha256:6898e82ecb03772a0d82bd0d0a10c0d6dcc342f77e0701d0ec4a8271be465ece",
"sha256:b20eff5e564529711544066d7dc0f7661df41232ae263619dede5059799cdfca"
],
"version": "==1.9.1"
},
"terminado": {
"hashes": [
"sha256:d9d012de63acb8223ac969c17c3043337c2fcfd28f3aea1ee429b345d01ef460",
"sha256:de08e141f83c3a0798b050ecb097ab6259c3f0331b2f7b7750c9075ced2c20c2"
],
"version": "==0.8.2"
},
"testpath": {
"hashes": [
"sha256:46c89ebb683f473ffe2aab0ed9f12581d4d078308a3cb3765d79c6b2317b0109",
"sha256:b694b3d9288dbd81685c5d2e7140b81365d46c29f5db4bc659de5aa6b98780f8"
],
"version": "==0.4.2"
},
"toolz": {
"hashes": [
"sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9"
],
"version": "==0.9.0"
},
"tornado": {
"hashes": [
"sha256:1174dcb84d08887b55defb2cda1986faeeea715fff189ef3dc44cce99f5fca6b",
"sha256:2613fab506bd2aedb3722c8c64c17f8f74f4070afed6eea17f20b2115e445aec",
"sha256:44b82bc1146a24e5b9853d04c142576b4e8fa7a92f2e30bc364a85d1f75c4de2",
"sha256:457fcbee4df737d2defc181b9073758d73f54a6cfc1f280533ff48831b39f4a8",
"sha256:49603e1a6e24104961497ad0c07c799aec1caac7400a6762b687e74c8206677d",
"sha256:8c2f40b99a8153893793559919a355d7b74649a11e59f411b0b0a1793e160bc0",
"sha256:e1d897889c3b5a829426b7d52828fb37b28bc181cd598624e65c8be40ee3f7fa"
],
"version": "==6.0.2"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"urllib3": {
"hashes": [
"sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4",
"sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb"
],
"markers": "python_version >= '3.4'",
"version": "==1.24.3"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
},
"webencodings": {
"hashes": [
"sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
"sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
],
"version": "==0.5.1"
},
"widgetsnbextension": {
"hashes": [
"sha256:14b2c65f9940c9a7d3b70adbe713dbd38b5ec69724eebaba034d1036cf3d4740",
"sha256:fa618be8435447a017fd1bf2c7ae922d0428056cfc7449f7a8641edf76b48265"
],
"version": "==3.4.2"
},
"wrapt": {
"hashes": [
"sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"
],
"version": "==1.11.1"
}
},
"develop": {
"yapf": {
"hashes": [
"sha256:34f6f80c446dcb2c44bd644c4037a2024b6645e293a4c9c4521983dd0bb247a1",
"sha256:613deba14233623ff3432d9d5032631b5f600be97b39f66932cbe67648bfa8ea"
],
"index": "pypi",
"version": "==0.27.0"
}
}
}
+132 -1
View File
@@ -1 +1,132 @@
# backtester_options
Options Backtester
==============================
Simple backtester to evaluate and analyse options strategies over historical price data.
- [Requirements](#requirements)
- [Setup](#setup)
- [Usage](#usage)
- [Recommended Reading](#recommended-reading)
- [Data Sources](#data-sources)
## Requirements
- Python >= 3.5
- pipenv
## Setup
For backtesting, set `$OPTIONS_DATA_PATH` to the appropriate directory where the data is located. All file paths parsed by the backtester will be relative to this directory.
To use the data scraper the following environment variables need to be set:
- `$SAVE_DATA_PATH`: where the data will be saved to (default is `./data/scraped`)
- `$TIINGO_API_KEY`: used to fetch data from [Tiingo](https://api.tiingo.com)
- `$S3_BUCKET`: name of the S3 bucket where the data is backed up
- `$AWS_ACCESS_KEY_ID`: AWS access key ID
- `$AWS_SECRET_ACCESS_KEY`: AWS secret key
You can configure the data scraper by editing the configuration file `data_scraper.conf` (JSON-formatted).
Sample file:
```json
{
"cboe": {
"mute_notifications": ["BFB", "CBSA"]
},
"notifications": {
"slack_webhook": "https://hooks.slack.com/services/MY_WORKSPACE_WEBHOOK"
}
}
```
**HINT**: store environment variables in an `.env` file and pipenv will load them automatically when using `make env`.
## Usage
### Create environment and download dependencies
```shell
$> make init
```
### Activate environment
```shell
$> make env
```
### Run tests
```shell
$> make test
```
### Scrape data (supported scrapers: CBOE, Tiingo)
```shell
$> make scrape scraper=cboe
$> make scrape scraper=tiingo
```
### Run backtester with benchmark strategy
```shell
$> make bench
```
## Recommended reading
For complete novices in finance and economics, this [post](https://notamonadtutorial.com/how-to-earn-your-macroeconomics-and-finance-white-belt-as-a-software-developer-136e7454866f) gives a comprehensive introduction.
### Books
#### Introductory
- Option Volatility and Pricing 2nd Ed. - Natenberg, 2014
- Options, Futures, and Other Derivatives 10th Ed. - Hull 2017
- Trading Options Greeks: How Time, Volatility, and Other Pricing Factors Drive Profits 2nd Ed. - Passarelli 2012
#### Intermediate
- Trading Volatility - Bennett 2014
- Volatility Trading 2nd Ed. - Sinclair 2013
#### Advanced
- Dynamic Hedging - Taleb 1997
- The Volatility Surface: A Practitioner's Guide - Gatheral 2006
- The Volatility Smile - Derman & Miller 2016
### Papers
- [Volatility: A New Return Driver?](http://static.squarespace.com/static/53974e3ae4b0039937edb698/t/53da6400e4b0d5d5360f4918/1406821376095/Directional%20Volatility%20Research.pdf)
- [Easy Volatility Investing](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2255327)
- [Everybody's Doing It: Short Volatility Strategies and Shadow Financial Insurers](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3071457)
- [Volatility-of-Volatility Risk](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2497759)
- [The Distribution of Returns](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2828744)
- [Safe Haven Investing Part I - Not all risk mitigation is created equal](https://www.universa.net/UniversaResearch_SafeHavenPart1_RiskMitigation.pdf)
- [Safe Haven Investing Part II - Not all risk is created equal](https://www.universa.net/UniversaResearch_SafeHavenPart2_NotAllRisk.pdf)
- [Safe Haven Investing Part III - Those wonderful tenbaggers](https://www.universa.net/UniversaResearch_SafeHavenPart3_Tenbaggers.pdf)
- [Insurance makes wealth grow faster](https://arxiv.org/abs/1507.04655)
- [Ergodicity economics](https://ergodicityeconomics.files.wordpress.com/2018/06/ergodicity_economics.pdf)
- [The Rate of Return on Everything, 1870–2015](https://economics.harvard.edu/files/economics/files/ms28533.pdf)
- [Volatility and the Alchemy of Risk](https://static1.squarespace.com/static/5581f17ee4b01f59c2b1513a/t/59ea16dbbe42d6ff1cae589f/1508513505640/Artemis_Volatility+and+the+Alchemy+of+Risk_2017.pdf)
## Data sources
### Exchanges
- [IEX](https://iextrading.com/developer/)
- [Tiingo](https://api.tiingo.com/)
- [CBOE Options Data](http://www.cboe.com/delayedquote/quote-table-download)
### Historical Data
- [Shiller's US Stocks, Dividends, Earnings, Inflation (CPI), and long term interest rates](http://www.econ.yale.edu/~shiller/data.htm)
- [Fama/French US Stock Index Data](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)
- [FRED CPI, Interest Rates, Trade Data](https://fred.stlouisfed.org)
- [REIT Data](https://www.reit.com/data-research/reit-market-data/reit-industry-financial-snapshot)
+8
View File
@@ -0,0 +1,8 @@
{
"cboe": {
"mute_notifications": []
},
"notifications": {
"slack_webhook": ""
}
}
View File
+48
View File
@@ -0,0 +1,48 @@
import logging.config
import os
import argparse
from data_scraper import cboe, tiingo, backup
# Command-line interface of the data_scraper package.
parser = argparse.ArgumentParser(prog="data_scraper.py")
parser.add_argument("-t", "--symbols", nargs="+", help="Symbols to fetch")
parser.add_argument("-s",
                    "--scraper",
                    choices=["cboe", "tiingo"],
                    default="cboe",
                    help="Scraper to use")
parser.add_argument("-v",
                    "--verbose",
                    action="store_true",
                    help="Enable logging")
parser.add_argument("-a",
                    "--aggregate",
                    action="store_true",
                    help="Aggregate daily data files")
parser.add_argument("-b",
                    "--backup",
                    action="store_true",
                    help="Backup files in S3 bucket")
args = parser.parse_args()

# Directory that contains this module; logconfig.ini is looked up next to it.
module_dir = os.path.join(os.getcwd(), os.path.dirname(__file__))

if args.verbose:
    config_file = os.path.realpath(os.path.join(module_dir, "logconfig.ini"))
    logging.config.fileConfig(fname=config_file)

# The symbol list is forwarded only when the user supplied one; otherwise the
# callee is invoked without arguments so that it applies its own default.
symbol_args = (args.symbols, ) if args.symbols else ()

# The modes are mutually exclusive and checked in this order:
# aggregate, then backup, then (default) scrape.
if args.aggregate:
    cboe.aggregate_monthly_data(*symbol_args)
elif args.backup:
    backup.backup_data()
else:
    scraper = tiingo if args.scraper == "tiingo" else cboe
    scraper.fetch_data(*symbol_args)
+114
View File
@@ -0,0 +1,114 @@
import logging
import os
import boto3
from botocore.exceptions import ClientError
from data_scraper import utils
from data_scraper.notifications import slack_notification, Status
logger = logging.getLogger(__name__)
def backup_data():
    """Uploads scraped files to an S3 bucket.

    The bucket name is read from the environment variable $S3_BUCKET and the
    local data directory from `utils.get_save_data_path()`. Every sub-folder
    of `<data path>/cboe` whose name does not end in "daily" and every
    sub-folder of `<data path>/tiingo` is uploaded. For tiingo folders the
    objects already stored under the same prefix are removed first; cboe
    folders are uploaded without removing anything.

    After the upload, one notification is sent listing the folders that were
    backed up and another one listing the folders that failed (each only if
    the corresponding list is non-empty).

    Raises:
        EnvironmentError: re-raised, after logging and notifying, when
            `utils.get_environment_var("S3_BUCKET")` raises it.
    """
    try:
        bucket_name = utils.get_environment_var("S3_BUCKET")
    except EnvironmentError as e:
        logger.error(str(e))
        slack_notification("Backup failed. Set $S3_BUCKET env variable",
                           __name__)
        raise

    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)
    data_path = utils.get_save_data_path()

    cboe_data = os.path.join(data_path, "cboe")
    cboe_folders = []
    if os.path.exists(cboe_data):
        # Folders ending in "daily" are deliberately left out of the backup.
        cboe_folders = [
            os.path.join(cboe_data, folder) for folder in os.listdir(cboe_data)
            if not folder.endswith("daily")
        ]

    tiingo_data = os.path.join(data_path, "tiingo")
    tiingo_folders = []
    if os.path.exists(tiingo_data):
        tiingo_folders = [
            os.path.join(tiingo_data, folder)
            for folder in os.listdir(tiingo_data)
        ]

    done_cboe, fail_cboe = _upload_folders(
        bucket, "cboe", cboe_folders, remove_files=False)
    done_tiingo, fail_tiingo = _upload_folders(
        bucket, "tiingo", tiingo_folders, remove_files=True)

    done = done_cboe + done_tiingo
    failed = fail_cboe + fail_tiingo

    if done:
        msg = "Successful backup of symbols: " + ", ".join(done)
        slack_notification(msg, __name__, status=Status.Success)

    if failed:
        # Report the folders that failed (previously this listed `done`).
        msg = "Unable to backup symbols: " + ", ".join(failed)
        slack_notification(msg, __name__, status=Status.Warning)
def _upload_folders(bucket, scraper, folders, remove_files=False):
    """Uploads each folder to the S3 bucket, optionally clearing old objects.

    When `remove_files` is true, the objects stored under the prefix
    "<scraper>/<folder name>" are deleted before the folder is uploaded.
    A folder counts as failed if any exception is raised while processing it.

    Returns:
        A tuple `(done, failed)` with the base names of the folders that were
        uploaded and of those that raised, respectively.
    """
    base_path = utils.get_save_data_path()
    succeeded = []
    errored = []

    for folder_path in folders:
        name = os.path.basename(folder_path)
        try:
            if remove_files:
                _remove_old_files(bucket, prefix="/".join((scraper, name)))
            _upload_folder(bucket, folder_path, base_path)
        except Exception:
            errored.append(name)
            continue
        succeeded.append(name)

    return (succeeded, errored)
def _upload_folder(bucket, folder, data_path):
"""Uploads folder contents to S3 bucket"""
if not os.path.isdir(folder):
return
for root, dirs, files in os.walk(folder):
for file in files:
file_path = os.path.join(root, file)
key = os.path.relpath(file_path, data_path)
if _key_exists(bucket, key):
logger.debug("File already exists in S3")
continue
try:
bucket.upload_file(file_path, key)
logger.debug("Uploaded file %s to S3", file)
except Exception as e:
msg = "Error uploading data file {} to S3.\nReceived exception message {}".format(
file_path, str(e))
logger.error(msg, exc_info=True)
slack_notification(msg, __name__)
raise e
def _key_exists(bucket, key):
try:
bucket.Object(key).load()
except ClientError as e:
return int(e.response["Error"]["Code"]) != 404
return False
def _remove_old_files(bucket, prefix):
old_files = bucket.objects.filter(Prefix=prefix)
for file in old_files:
file.delete()
+249
View File
@@ -0,0 +1,249 @@
import logging
import os
from datetime import date
from io import StringIO
from itertools import groupby
from bs4 import BeautifulSoup
import requests
import pandas as pd
from . import utils, validation
from .notifications import slack_notification, Status
logger = logging.getLogger(__name__)
url = "http://www.cboe.com/delayedquote/quote-table-download"
def fetch_data(symbols=None):
    """Downloads today's options chain for each symbol and saves it.

    When `symbols` is empty or None, every symbol in the bundled CBOE
    symbol directory is used. Per-symbol failures are logged and reported
    to Slack (unless the symbol is listed under `mute_notifications` in the
    "cboe" config section); a final Slack summary lists successes and
    failures. Failing to obtain the form data aborts the run by re-raising.
    """
    symbols = symbols or _get_all_listed_symbols()
    mute_notifications = utils.get_module_config("cboe").get(
        "mute_notifications", [])

    try:
        form_data = _form_data()
    except requests.ConnectionError as ce:
        msg = "Connection error trying to reach {}".format(url)
        logger.error(msg)
        slack_notification(msg, __name__)
        raise ce
    except Exception as e:
        msg = "Error parsing response"
        logger.error(msg, exc_info=True)
        slack_notification(msg, __name__)
        raise e

    headers = {"Referer": url}
    file_url = "http://www.cboe.com/delayedquote/quotedata.dat"
    tickers = [name.upper() for name in symbols]
    done = []
    failed = []

    for ticker in tickers:
        form_data["ctl00$ContentTop$C005$txtTicker"] = ticker
        try:
            # The POST selects the ticker; its cookies are then replayed on
            # the GET that returns the data file.
            post_response = requests.post(url,
                                          data=form_data,
                                          headers=headers,
                                          allow_redirects=False)
            data_response = requests.get(file_url,
                                         cookies=post_response.cookies,
                                         headers=headers)
            symbol_data = data_response.text
            # An empty body or an HTML page means no data file was served.
            if not symbol_data or symbol_data.startswith(" <!DOCTYPE"):
                raise Exception
        except Exception:
            failed.append(ticker)
            msg = "Error fetching symbol {} data".format(ticker)
            logger.error(msg, exc_info=True)
            if ticker not in mute_notifications:
                slack_notification(msg, __name__)
        else:
            _save_data(ticker, symbol_data)
            done.append(ticker)

    if done:
        msg = "Successfully scraped symbols: " + ", ".join(done)
        slack_notification(msg, __name__, status=Status.Success)

    if failed:
        msg = "Failed to scrape symbols: " + ", ".join(failed)
        slack_notification(msg, __name__, status=Status.Warning)
def aggregate_monthly_data(symbols=None):
    """Aggregate daily snapshots into monthly files and validate data.

    For every symbol, the CSV files in `<save path>/cboe/<SYMBOL>_daily`
    are grouped by month, concatenated and written to
    `<save path>/cboe/<SYMBOL>/`. A month is skipped when its files cannot
    be concatenated or when trading days are missing; Slack is notified
    about missing days only for months other than the current one. After a
    monthly file is written it is compared with the daily files; on
    mismatch it is removed, otherwise the daily files are removed.

    Args:
        symbols: iterable of ticker symbols; defaults to every symbol in
            the bundled CBOE symbol directory.
    """
    symbols = symbols or _get_all_listed_symbols()
    save_data_path = utils.get_save_data_path()
    scraper_dir = os.path.join(save_data_path, "cboe")
    symbols = [symbol.upper() for symbol in symbols]

    for symbol in symbols:
        daily_dir = os.path.join(scraper_dir, symbol + "_daily")
        if not os.path.exists(daily_dir):
            msg = "Error aggregating data. Dir {} not found.".format(daily_dir)
            logger.error(msg)
            slack_notification(msg, __name__)
            continue

        monthly_dir = os.path.join(scraper_dir, symbol)
        # itertools.groupby only merges *adjacent* items and os.listdir
        # returns names in arbitrary order, so sort first; otherwise one
        # month can be split into several partial groups.
        symbol_files = sorted(
            file for file in os.listdir(daily_dir) if file.endswith(".csv"))

        for month, files in groupby(symbol_files, _monthly_grouper):
            file_names = list(files)
            daily_files = [
                os.path.join(daily_dir, name) for name in file_names
            ]

            try:
                symbol_df = concatenate_files(daily_files)
            except Exception:
                msg = "Error concatenating daily files for period " + month
                logger.error(msg, exc_info=True)
                slack_notification(msg, __name__)
                continue

            date_range = pd.to_datetime(symbol_df["quotedate"].unique())
            if not validation.validate_dates_in_month(symbol, date_range):
                today = pd.Timestamp.today()
                first_date = date_range[0]
                # The current month is expected to be incomplete: stay quiet.
                if first_date.year != today.year or first_date.month != today.month:
                    msg = "Some trading dates where missing for symbol {}".format(
                        symbol)
                    slack_notification(msg, __name__)
                # NOTE(review): incomplete months are never aggregated, so
                # their daily files are kept - confirm this nesting matches
                # the original indentation.
                continue

            if not os.path.exists(monthly_dir):
                os.makedirs(monthly_dir)
                logger.debug("Symbol dir %s created", monthly_dir)

            file_name = _monthly_filename(file_names)
            monthly_file = os.path.join(monthly_dir, file_name)
            symbol_df.to_csv(monthly_file, index=False)

            # Re-read what was written before deleting the source files.
            if not validation.validate_aggregate_file(monthly_file,
                                                      daily_files):
                utils.remove_file(monthly_file)
                msg = "Data in {} differs from the daily files".format(
                    monthly_file)
                logger.error(msg)
                slack_notification(msg, __name__)
                continue

            logger.debug("Saved monthly data %s", monthly_file)
            for file in daily_files:
                utils.remove_file(file, logger)
def _get_all_listed_symbols():
    """Reads the "Stock Symbol" column of the bundled symbol directory.

    The file `cboesymboldir2.csv` is looked up next to this module; its
    first row is skipped. Source of the file:
    http://www.cboe.com/publish/scheduledtask/mktdata/cboesymboldir2.csv
    """
    directory_csv = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__),
                     "cboesymboldir2.csv"))
    listing = pd.read_csv(directory_csv, skiprows=1)
    return listing["Stock Symbol"].array
def concatenate_files(files):
    """Reads the CSV `files` in sorted path order and stacks them into one
    dataframe with a fresh 0..n-1 index."""
    frames = [pd.read_csv(path) for path in sorted(files)]
    return pd.concat(frames, ignore_index=True)
def _form_data():
    """Fetches the download page and returns the values of its
    `__VIEWSTATE` and `__EVENTVALIDATION` elements as a dict."""
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, "lxml")
    hidden_fields = ("__VIEWSTATE", "__EVENTVALIDATION")
    return {
        field: parsed.select_one("#" + field)["value"]
        for field in hidden_fields
    }
def _save_data(symbol, symbol_data):
    """Writes today's snapshot for `symbol` as a tidy CSV at
    `$SAVE_DATA_PATH/cboe/{symbol}_daily/{symbol}_{%date}.csv`.

    The write is skipped when the file already exists and its hash matches
    the raw `symbol_data`.
    """
    filename = date.today().strftime(symbol + "_%Y%m%d.csv")
    symbol_dir = os.path.join(utils.get_save_data_path(), "cboe",
                              symbol + "_daily")

    if not os.path.exists(symbol_dir):
        os.makedirs(symbol_dir)
        logger.debug("Symbol dir %s created", symbol_dir)

    file_path = os.path.join(symbol_dir, filename)
    already_saved = (os.path.exists(file_path)
                     and validation.file_hash_matches_data(
                         file_path, symbol_data))
    if already_saved:
        logger.debug("File %s already downloaded", file_path)
        return

    _wrangle_data(symbol, symbol_data).to_csv(file_path, index=False)
    logger.debug("Saved daily symbol data as %s", file_path)
def _wrangle_data(symbol, symbol_data):
"""Returns a properly formated (_tidy_) dataframe"""
string_data = StringIO(symbol_data)
first_line = string_data.readline()
spot_price = float(first_line.split(",")[-2])
quote_date = date.today().strftime("%m/%d/%Y")
data = pd.read_csv(string_data, skiprows=1)
call_columns = [
"Calls", "Expiration Date", "Strike", "Last Sale", "Net", "Bid", "Ask",
"Vol", "Open Int", "IV", "Delta", "Gamma"
]
calls = data[call_columns]
put_columns = [
"Puts", "Expiration Date", "Strike", "Last Sale.1", "Net.1", "Bid.1",
"Ask.1", "Vol.1", "Open Int.1", "IV.1", "Delta.1", "Gamma.1"
]
puts = data[put_columns]
renamed_columns = [
"optionroot", "expiration", "strike", "last", "net", "bid", "ask",
"volume", "openinterest", "impliedvol", "delta", "gamma"
]
calls.columns = renamed_columns
calls.insert(loc=1, column="type", value="call")
puts.columns = renamed_columns
puts.insert(loc=1, column="type", value="put")
merged = pd.concat([calls, puts])
merged.insert(loc=0, column="underlying", value=symbol)
merged.insert(loc=1, column="underlying_last", value=spot_price)
merged.insert(loc=2, column="exchange", value="CBOE")
merged.insert(loc=6, column="quotedate", value=quote_date)
return merged
def _monthly_grouper(filename):
"""Returns `{year}{month}` string. Used to group files by month."""
basename = filename.split(".")[0]
file_date = basename.split("_")[1]
return file_date[:-2]
def _monthly_filename(filenames):
"""Returns filename of monthly aggregate file in the form
`{symbol}_{start_date}_to_{end_date}.csv`
"""
sorted_files = list(sorted(filenames))
first_file = sorted_files[0]
last_file = sorted_files[-1]
last_day = last_file.split(".")[0][-8:] # Get only the date
file_name = first_file.split(".")[0] + "_to_" + last_day + ".csv"
return file_name
+66
View File
@@ -0,0 +1,66 @@
# CBOE data scraper
# Requires Selenium and a headless Chrome driver
import tempfile
import time
import os
import shutil
from datetime import date
from selenium import webdriver
class CBOE():
    """CBOE data downloader.

    Drives a headless Chrome session that downloads `quotedata.dat` into a
    temporary directory and moves it to `$OPTIONS_DATA_PATH`.
    """

    url = "http://www.cboe.com/delayedquote/quote-table-download"

    def __init__(self):
        self.data_path = self._get_data_path()
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.driver = self._initilize_driver(self.tmp_dir.name)

    @staticmethod
    def _get_data_path():
        """Returns the user-expanded value of `$OPTIONS_DATA_PATH`.

        Raises:
            EnvironmentError: if the variable is unset or empty.
        """
        # Declared static: it takes no `self` yet is called on the instance.
        path = os.getenv("OPTIONS_DATA_PATH")
        if not path:
            raise EnvironmentError("Environment variable $OPTIONS_DATA_PATH not set")
        return os.path.expanduser(path)

    @staticmethod
    def _initilize_driver(download_dir):
        """Initilizes the Chrome driver to silently download files
        to a temporary directory.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("headless")
        options.add_argument("disable-gpu")

        driver = webdriver.Chrome(options=options)
        # Register the extra command used below to allow downloads into
        # `download_dir`.
        driver.command_executor._commands["send_command"] = (
            "POST",
            "/session/$sessionId/chromium/send_command"
        )
        params = {
            "cmd": "Page.setDownloadBehavior",
            "params": {
                "behavior": "allow",
                "downloadPath": download_dir
            }
        }
        driver.execute("send_command", params)
        driver.implicitly_wait(10)

        return driver

    def fetch_data(self, symbols):
        """Fetches options data for a given list of symbols"""
        self.driver.get(CBOE.url)

        for symbol in symbols:
            ticker = self.driver.find_element_by_css_selector("input#txtTicker")
            # send_keys appends; clear whatever is left from the previous symbol.
            ticker.clear()
            ticker.send_keys(symbol)
            submit = self.driver.find_element_by_css_selector("input#cmdSubmit")
            submit.click()

            time.sleep(15)  # Horrible hack

            download_path = os.path.join(self.tmp_dir.name, "quotedata.dat")
            renamed_file = date.today().strftime(symbol + "_%Y%m%d.csv")
            full_path = os.path.join(self.data_path, renamed_file)
            shutil.move(download_path, full_path)

    def __del__(self):
        # __init__ may have failed part-way, so either attribute can be missing.
        driver = getattr(self, "driver", None)
        tmp_dir = getattr(self, "tmp_dir", None)
        try:
            if driver is not None:
                driver.quit()
        finally:
            if tmp_dir is not None:
                tmp_dir.cleanup()
File diff suppressed because it is too large Load Diff
+39
View File
@@ -0,0 +1,39 @@
# Logging configuration for the data_scraper package.
# NOTE(review): the section layout matches `logging.config.fileConfig` -
# confirm that is how this file is loaded.
[loggers]
keys=root,tiingo,cboe
[handlers]
keys=consoleHandler,fileHandler
[formatters]
keys=simpleFormatter
# Everything without a dedicated logger below: errors only, console only.
[logger_root]
level=ERROR
handlers=consoleHandler
# The scraper modules log at DEBUG to both handlers; propagate=0 keeps
# their records from also reaching the root logger's handlers.
[logger_tiingo]
level=DEBUG
handlers=consoleHandler,fileHandler
qualname=data_scraper.tiingo
propagate=0
[logger_cboe]
level=DEBUG
handlers=consoleHandler,fileHandler
qualname=data_scraper.cboe
propagate=0
# Console output is limited to ERROR and above, written to stdout.
[handler_consoleHandler]
class=StreamHandler
level=ERROR
formatter=simpleFormatter
args=(sys.stdout,)
# args: file name, mode (append), max bytes per file, number of backups.
[handler_fileHandler]
class=handlers.RotatingFileHandler
level=DEBUG
formatter=simpleFormatter
args=("data_scraper.log", "a", 3000000, 10)
[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+59
View File
@@ -0,0 +1,59 @@
import logging
from datetime import datetime
from enum import Enum
import requests
from .utils import get_module_config
logger = logging.getLogger(__name__)
# Severity of a notification; selects emoji, title and color in
# `slack_notification`.
Status = Enum("Status", "Success Warning Error")

# Runs at import time: importing this module fails with KeyError when the
# "notifications" config section has no "slack_webhook" entry.
options = get_module_config("notifications")
try:
    webhook = options["slack_webhook"]
except KeyError as e:
    logger.error("Missing slack webhook from configuration file")
    raise e

# Message template; `slack_notification` fills in the single attachment.
payload = {
    "channel": "#algotrading",
    "username": "Talebot",
    "icon_emoji": ":taleb:",
    "attachments": [{
        "footer": "Talebot"
    }]
}
def slack_notification(text, scraper, status=Status.Error):
    """Post Slack notification.

    Args:
        text: message body; an emoji matching `status` is prepended.
        scraper: name shown as the attachment field title (callers pass
            their module name).
        status: `Status.Error` (default), `Status.Success` or
            `Status.Warning`.

    A non-200 response is logged as an error; nothing is raised for it.
    """
    if status == Status.Error:
        emoji = ":thumbsdown: "
        title = "data_scraper error"
        color = "#B22222"
    else:
        title = "data_scraper status report"
        if status == Status.Success:
            emoji = ":thumbsup: "
            color = "#49C39E"
        else:
            emoji = ":warning: "
            color = "#EDB625"

    msg = emoji + text

    # Build a per-call message instead of writing into the module-level
    # `payload`, so the shared template is never mutated between calls.
    attachment = dict(payload["attachments"][0])
    attachment.update({
        "fallback": msg,
        "text": msg,
        "color": color,
        "title": title,
        "fields": [{"title": scraper}],
        "ts": datetime.today().timestamp(),
    })
    message = dict(payload, attachments=[attachment])

    response = requests.post(webhook, json=message)

    if response.status_code != 200:
        msg = "Error connecting to Slack {}. Response is:\n{}".format(
            response.status_code, response.text)
        logger.error(msg)
+41
View File
@@ -0,0 +1,41 @@
#!/bin/bash

## This script downloads stock options data,
## verifies zipfile integrity, saves md5 signature
## and uploads them to S3.

## To run, pass a list of files to download:
## $> ./backup.sh files.txt

# NOTE(review): FTP credentials are embedded in this script and therefore
# in version control. Rotate them and supply the URL through FTP_BASE_URL.
FTP_BASE_URL="${FTP_BASE_URL:-ftp://l3_hdall:JKNRH7LYXV@ftp.deltaneutral.com}"

if [ "$#" -ne 1 ] || [ ! -r "$1" ]; then
    echo "Usage: $0 files.txt" >&2
    exit 1
fi

# Not named TMPDIR: that variable is read by other tools for temp files.
DOWNLOAD_DIR=tmp
NOW=$(date +"%m-%d-%Y-%H%M%S")
# Files that failed any step are appended here for a later re-run.
RETRY="${NOW}.txt"
MD5SUMS=md5sums.txt

mkdir -p "$DOWNLOAD_DIR"

while IFS= read -r filename
do
    [ -z "$filename" ] && continue

    echo "Downloading file $filename to $DOWNLOAD_DIR"
    wget --quiet -P "$DOWNLOAD_DIR" "${FTP_BASE_URL}/${filename}"

    newpath="$DOWNLOAD_DIR/$filename"
    echo "Verifying zipfile $newpath"

    # A failed download also ends up here: unzip fails on a missing file.
    if unzip -t -q "$newpath"
    then
        echo "File check OK"
    else
        echo "ERROR: File check failed for $newpath"
        echo "$filename" >> "$RETRY"
        rm -f "$newpath"
        continue
    fi

    echo "Appending md5 sum for $newpath"
    md5sum "$newpath" >> "$MD5SUMS"

    echo "Copying $newpath to S3 bucket"
    if ! rclone copy -v "$newpath" longueduree:longueduree
    then
        # Queue the file for retry instead of silently losing it.
        echo "ERROR: Upload failed for $newpath"
        echo "$filename" >> "$RETRY"
    fi

    echo "Deleting $newpath"
    rm -f "$newpath"
done < "$1"
View File
+104
View File
@@ -0,0 +1,104 @@
import logging
import unittest
from unittest.mock import patch
import os
import shutil
from requests import ConnectionError
import pandas as pd
from data_scraper import cboe
logging.disable(level=logging.CRITICAL)
class TestCBOE(unittest.TestCase):
    """Tests CBOE data scraper.

    `$SAVE_DATA_PATH` is pointed at the `data` folder next to this file for
    the duration of the class. The fetch tests call the real scraper with
    only Slack notifications mocked.
    """

    test_dir = os.path.join(os.getcwd(), os.path.dirname(__file__))
    test_data_path = os.path.realpath(os.path.join(test_dir, "data"))
    cboe_data_path = os.path.join(test_data_path, "cboe")
    spx_data_path = os.path.join(cboe_data_path, "SPX_March_2019.csv")

    @classmethod
    def setUpClass(cls):
        cls.save_data_path = os.environ.get("SAVE_DATA_PATH", None)
        os.environ["SAVE_DATA_PATH"] = cls.test_data_path

    @classmethod
    def tearDownClass(cls):
        # Restore the previous value, or remove ours if there was none.
        if cls.save_data_path:
            os.environ["SAVE_DATA_PATH"] = cls.save_data_path
        else:
            os.environ.pop("SAVE_DATA_PATH", None)

    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_fetch_spy(self, mocked_notification):
        """Fetch todays SPY quote"""
        spy_dir = os.path.join(TestCBOE.cboe_data_path, "SPY_daily")
        # Registered before fetching so files are removed even on failure.
        self.addCleanup(TestCBOE.remove_files, spy_dir)
        cboe.fetch_data(["SPY"])

        # assertTrue returns None, so it cannot guard an `if`; assert and
        # then run the remaining checks unconditionally.
        self.assertTrue(os.path.exists(spy_dir))
        self.assertTrue(mocked_notification.called)
        file_name = "SPY_" + pd.Timestamp.today().strftime(
            "%Y%m%d") + ".csv"
        file_path = os.path.join(spy_dir, file_name)
        spy_df = pd.read_csv(file_path, parse_dates=["quotedate"])

        self.assertTrue(all(spy_df["underlying"] == "SPY"))
        self.assertEqual(spy_df["quotedate"].nunique(), 1)

        counts = spy_df["type"].value_counts()
        self.assertEqual(counts["put"] + counts["call"], len(spy_df))

    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_fetch_invalid_symbol(self, mocked_notification):
        """Fetching invalid symbol should send notification"""
        cboe.fetch_data(["FOOBAR"])
        self.assertTrue(mocked_notification.called)

    @patch("data_scraper.cboe.url", new="http://www.aldkfjaskldfjsa.com")
    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_no_connection(self, mocked_notification):
        """Raise ConnectionError and send notification when host is unreachable"""
        with self.assertRaises(ConnectionError):
            cboe.fetch_data(["SPX"])
        # Outside the `with`: nothing after the raising call would run.
        self.assertTrue(mocked_notification.called)

    @patch("data_scraper.cboe.utils.remove_file", return_value=None)
    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_data_aggregation(self, mocked_notification, mocked_remove):
        """Test data aggregation happy path"""
        aggregate_file = os.path.join(TestCBOE.cboe_data_path, "SPX",
                                      "SPX_20190301_to_20190329.csv")
        self.addCleanup(TestCBOE.remove_files, os.path.dirname(aggregate_file))
        cboe.aggregate_monthly_data(["SPX"])

        self.assertTrue(mocked_remove.called)
        self.assertFalse(mocked_notification.called)

        self.assertTrue(os.path.exists(aggregate_file))
        spx_df = pd.read_csv(TestCBOE.spx_data_path)
        aggregate_df = pd.read_csv(aggregate_file)
        self.assertTrue(spx_df.equals(aggregate_df))

    @patch("data_scraper.cboe.utils.remove_file", return_value=None)
    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_aggregate_missing_days(self, mocked_notification, mocked_remove):
        """Data aggregation should send notification when there are missing days"""
        cboe.aggregate_monthly_data(["GOOG"])
        self.assertTrue(mocked_notification.called)
        self.assertFalse(mocked_remove.called)

    @patch("data_scraper.cboe.utils.remove_file", return_value=None)
    @patch("data_scraper.cboe.slack_notification", return_value=None)
    def test_aggregate_invalid_symbol(self, mocked_notification,
                                      mocked_remove):
        """Data aggregation should fail and send notification on invalid symbol"""
        cboe.aggregate_monthly_data(["FOOBAR"])
        self.assertTrue(mocked_notification.called)
        self.assertFalse(mocked_remove.called)

    @staticmethod
    def remove_files(file_path):
        """Removes the directory tree at `file_path` if it exists."""
        if os.path.exists(file_path):
            shutil.rmtree(file_path)
if __name__ == "__main__":
unittest.main()
+75
View File
@@ -0,0 +1,75 @@
import logging
import unittest
from unittest.mock import patch
import os
import shutil
import pandas as pd
from data_scraper import tiingo
logging.disable(level=logging.CRITICAL)
class TestTiingo(unittest.TestCase):
    """Tests Tiingo data scraper.

    Requires `$TIINGO_API_KEY`. `$SAVE_DATA_PATH` is pointed at the `data`
    folder next to this file for the duration of the class.
    """

    test_dir = os.path.join(os.getcwd(), os.path.dirname(__file__))
    test_data_path = os.path.realpath(os.path.join(test_dir, "data"))
    tiingo_data_path = os.path.join(test_data_path, "tiingo")

    @classmethod
    def setUpClass(cls):
        assert "TIINGO_API_KEY" in os.environ, "$TIINGO_API_KEY env variable must be set"
        cls.save_data_path = os.environ.get("SAVE_DATA_PATH", None)
        os.environ["SAVE_DATA_PATH"] = cls.test_data_path

    @classmethod
    def tearDownClass(cls):
        # Restore the previous value, or remove ours if there was none.
        if cls.save_data_path:
            os.environ["SAVE_DATA_PATH"] = cls.save_data_path
        else:
            os.environ.pop("SAVE_DATA_PATH", None)

    @patch("data_scraper.tiingo.slack_notification", return_value=None)
    def test_fetch_gld(self, mocked_notification):
        """Fetch GLD data"""
        gld_dir = os.path.join(TestTiingo.tiingo_data_path, "GLD")
        # Registered before fetching so files are removed even on failure.
        self.addCleanup(TestTiingo.remove_files, gld_dir)
        tiingo.fetch_data(["GLD"])

        # assertTrue returns None, so it cannot guard an `if`; assert and
        # then run the remaining checks unconditionally.
        self.assertTrue(os.path.exists(gld_dir))
        self.assertTrue(mocked_notification.called)
        file_name = "GLD_" + pd.Timestamp.today().strftime(
            "%Y%m%d") + ".csv"
        file_path = os.path.join(gld_dir, file_name)
        gld_df = pd.read_csv(file_path)

        self.assertTrue(all(gld_df["symbol"] == "GLD"))
        expected_columns = [
            "symbol", "date", "adjClose", "adjHigh", "adjLow", "adjOpen",
            "adjVolume", "close", "divCash", "high", "low", "open",
            "splitFactor", "volume"
        ]
        # Compare plain lists: `Index == list` is element-wise and has no
        # single truth value for assertEqual to test.
        self.assertEqual(list(gld_df.columns), expected_columns)

    @patch("data_scraper.tiingo.slack_notification", return_value=None)
    def test_fetch_invalid_symbol(self, mocked_notification):
        """Fetching invalid symbol data should send notification"""
        tiingo.fetch_data(["FOOBAR"])
        self.assertTrue(mocked_notification.called)

    @patch("data_scraper.tiingo.pdr.get_data_tiingo")  # mock pandas_datareader
    @patch("data_scraper.tiingo.slack_notification", return_value=None)
    def test_no_connection(self, mocked_notification, mocked_pdr):
        """Raise ConnectionError and send notification when host is unreachable"""
        mocked_pdr.side_effect = ConnectionError("This is a test")
        with self.assertRaises(ConnectionError):
            tiingo.fetch_data(["IBM"])
        # Outside the `with`: nothing after the raising call would run.
        self.assertTrue(mocked_notification.called)

    @staticmethod
    def remove_files(file_path):
        """Removes the directory tree at `file_path` if it exists."""
        if os.path.exists(file_path):
            shutil.rmtree(file_path)
if __name__ == "__main__":
unittest.main()
+122
View File
@@ -0,0 +1,122 @@
import logging
import os
from datetime import date
import pandas as pd
import pandas_datareader as pdr
from . import utils, validation
from .notifications import slack_notification, Status
logger = logging.getLogger(__name__)
# Default symbols to fetch: used by `fetch_data` when the caller does not
# pass an explicit list.
assets = [
    "VTSMX", "VFINX", "VIVAX", "VIGRX", "VIMSX", "VMVIX", "VMGIX", "NAESX",
    "VISVX", "VISGX", "BRSIX", "VGTSX", "VTMGX", "VFSVX", "EFV", "VEURX",
    "VPACX", "VEIEX", "VFISX", "VFITX", "IEF", "VUSTX", "VBMFX", "VIPSX",
    "PIGLX", "PGBIX", "VFSTX", "LQD", "VWESX", "VWEHX", "VWSTX", "VWITX",
    "VWLTX", "VGSIX", "GLD", "PSAU", "GSG"
]
def fetch_data(symbols=assets):
    """Fetches historical data for given symbols from Tiingo.

    Requires `$TIINGO_API_KEY`. Each symbol that downloads is saved via
    `_save_data`. Failures are logged and reported to Slack, and a final
    Slack summary lists the symbols that succeeded and those that failed.

    Raises:
        EnvironmentError: if `$TIINGO_API_KEY` is not set.
        ConnectionError: re-raised after logging; aborts the whole run.
    """
    api_key = utils.get_environment_var("TIINGO_API_KEY")
    symbols = [symbol.upper() for symbol in symbols]
    done, failed = [], []

    for symbol in symbols:
        try:
            symbol_data = pdr.get_data_tiingo(symbol, api_key=api_key)
        except ConnectionError as ce:
            # NOTE(review): this is the builtin ConnectionError; confirm it
            # is the type the HTTP layer raises when the host is unreachable.
            msg = "Unable to connect to api.tiingo.com when fetching symbol {}".format(
                symbol)
            logger.error(msg, exc_info=True)
            slack_notification(msg, __name__)
            raise ce
        except TypeError:
            # pandas_datareader raises TypeError when fetching invalid symbol
            failed.append(symbol)
            msg = "Attempted to fetch invalid symbol {}".format(symbol)
            logger.error(msg, exc_info=True)
            slack_notification(msg, __name__)
        except Exception:
            # Record the symbol so it shows up in the final summary too.
            failed.append(symbol)
            msg = "Error fetching symbol {}".format(symbol)
            logger.error(msg, exc_info=True)
            slack_notification(msg, __name__)
        else:
            _save_data(symbol, symbol_data.reset_index())
            done.append(symbol)

    if len(done) > 0:
        msg = "Successfully scraped symbols: " + ", ".join(done)
        slack_notification(msg, __name__, status=Status.Success)

    if len(failed) > 0:
        msg = "Failed to scrape symbols: " + ", ".join(failed)
        slack_notification(msg, __name__, status=Status.Warning)
def _save_data(symbol, symbol_df):
    """Saves the contents of `symbol_df` to
    `$SAVE_DATA_PATH/tiingo/{symbol}/{symbol}_{%date}.csv`

    Nothing is written when today's file already holds the same data, or
    when the dates or columns fail validation. Otherwise the new data is
    merged with the previous file, older `{symbol}_*` files are removed and
    the merged data is written.
    """
    filename = date.today().strftime(symbol + "_%Y%m%d.csv")
    save_data_path = utils.get_save_data_path()
    symbol_dir = os.path.join(save_data_path, "tiingo", symbol)

    if not os.path.exists(symbol_dir):
        os.makedirs(symbol_dir)
        logger.debug("Symbol dir %s created", symbol_dir)

    file_path = os.path.join(symbol_dir, filename)

    # The file is written with index=False, so hash the same serialisation;
    # with the index column included the hashes could never match.
    if os.path.exists(file_path) and validation.file_hash_matches_data(
            file_path, symbol_df.to_csv(index=False)):
        logger.debug("File %s already downloaded", file_path)
    else:
        expected_columns = [
            "symbol", "date", "adjClose", "adjHigh", "adjLow", "adjOpen",
            "adjVolume", "close", "divCash", "high", "low", "open",
            "splitFactor", "volume"
        ]
        if validation.validate_historical_dates(
                symbol, symbol_df["date"]) and validation.validate_columns(
                    expected_columns, symbol_df.columns):
            # Merge first: it reads the previous file removed just below.
            merged_df = _merge(symbol, symbol_df)
            pattern = symbol + "_*"
            utils.remove_files(symbol_dir, pattern, logger)
            merged_df.to_csv(file_path, index=False)
            logger.debug("Saved symbol data as %s", file_path)
def _merge(symbol, symbol_df):
    """Merge `symbol_df` with previous data file.

    The alphabetically last file in the symbol's folder is taken as the
    previous data. Rows whose dates exist only in that file are added to
    the new data, and the situation is logged and reported to Slack.

    Returns:
        `symbol_df` unchanged when there is no previous file or it holds no
        extra dates; otherwise a date-sorted frame with the columns of
        `symbol_df`.
    """
    save_data_path = utils.get_save_data_path()
    symbol_dir = os.path.join(save_data_path, "tiingo", symbol)
    files = os.listdir(symbol_dir)
    if len(files) == 0:
        return symbol_df

    last_file = sorted(files)[-1]
    old_df = pd.read_csv(os.path.join(symbol_dir, last_file),
                         parse_dates=["date"],
                         index_col="date")

    # Index a copy by date (moving the column into the index). Keeping
    # "date" as both column and index makes reset_index() below fail, and
    # assigning the index in place would mutate the caller's frame.
    new_df = symbol_df.set_index("date")
    diffs = old_df.index.difference(new_df.index)
    if diffs.empty:
        return symbol_df

    msg = ("Old data included dates not present in scraped file for symbol {}\n"
           "Merged new data with previous file.").format(symbol)
    logger.error(msg)
    slack_notification(msg, __name__)

    merged_df = pd.concat([new_df, old_df.loc[diffs]]).sort_index()
    # Restore "date" as a column and the original column order.
    return merged_df.reset_index()[list(symbol_df.columns)]
+52
View File
@@ -0,0 +1,52 @@
import glob
import json
import os
def get_environment_var(variable):
    """Looks up `variable` in the process environment.

    Returns the value with a leading `~` expanded to the user's home.
    Raises `EnvironmentError` when the variable is not set.
    """
    value = os.environ.get(variable)
    if value is None:
        message = "Environment variable {} not set".format(variable)
        raise EnvironmentError(message)
    return os.path.expanduser(value)
def get_save_data_path():
    """Reads data path from environment variable `$SAVE_DATA_PATH`.
    If it is not set, defaults to `./data/scraped`.

    The default directory is created when missing. Returns the path as a
    string.
    """
    try:
        data_dir = get_environment_var("SAVE_DATA_PATH")
    except EnvironmentError:
        data_dir = "data/scraped"
        # exist_ok: this function is called many times per run; without it
        # every call after the first raises FileExistsError.
        os.makedirs(data_dir, exist_ok=True)
    return data_dir
def get_module_config(module, config_file="data_scraper.conf"):
    """Loads the JSON `config_file` and returns the section named `module`.

    Returns an empty dict when the file does not exist or has no such
    section.
    """
    if not os.path.exists(config_file):
        return {}
    with open(config_file) as handle:
        parsed = json.load(handle)
    return parsed.get(module, {})
def remove_files(data_dir, pattern, logger=None):
    """Deletes every path under `data_dir` matching the glob `pattern`,
    passing `logger` on to `remove_file` for each one."""
    search = os.path.join(data_dir, pattern)
    matches = glob.glob(search)
    for match in matches:
        remove_file(match, logger)
def remove_file(file, logger=None):
    """Deletes `file`; when a logger is given, records the removal at
    debug level."""
    os.remove(file)
    if not logger:
        return
    logger.debug("Removed file %s", file)
+93
View File
@@ -0,0 +1,93 @@
import logging
import hashlib
import pandas as pd
import pandas_market_calendars as mcal
from . import cboe
from .notifications import slack_notification
logger = logging.getLogger(__name__)
def file_hash_matches_data(file_path, data):
    """Tells whether the MD5 of the file at `file_path` equals the MD5 of
    the string `data` (encoded with the default `str.encode` codec)."""
    on_disk_digest = file_md5(file_path)
    in_memory_digest = hashlib.md5(data.encode()).hexdigest()
    return on_disk_digest == in_memory_digest
def file_md5(file, chunk_size=4096):
    """Returns the hex MD5 digest of `file`, read in binary mode
    `chunk_size` bytes at a time."""
    digest = hashlib.md5()
    with open(file, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def validate_dates_in_month(symbol, date_range):
    """Checks `date_range` against the NYSE trading days of its month.

    The month is taken from the first element of `date_range`. Missing
    days are logged as an error. Returns `True` when none are missing.
    """
    # NYSE and CBOE have the same trading calendar
    # https://www.nyse.com/markets/hours-calendars
    # http://cfe.cboe.com/about-cfe/holiday-calendar
    calendar = mcal.get_calendar("NYSE")
    anchor = date_range[0]
    month = pd.Period(year=anchor.year, month=anchor.month, freq="M")
    expected_days = calendar.valid_days(start_date=month.start_time,
                                        end_date=month.end_time)
    # Strip the timezone before comparing.
    expected_days = expected_days.tz_convert(tz=None)

    missing = expected_days.difference(date_range)
    if not missing.empty:
        logger.error("Error validating monthly dates. Missing: %s",
                     missing)
    return missing.empty
def validate_historical_dates(symbol, date_range):
    """Checks `date_range` against the NYSE trading days between its
    earliest and latest date.

    Missing days are logged as an error. Returns `True` when none are
    missing.
    """
    calendar = mcal.get_calendar("NYSE")
    first_day, last_day = date_range.min(), date_range.max()
    expected_days = calendar.valid_days(start_date=first_day,
                                        end_date=last_day)

    # Strip the timezone from both sides before comparing.
    expected_days = expected_days.tz_convert(tz=None)
    observed_days = date_range.dt.tz_convert(tz=None)

    missing = expected_days.difference(observed_days)
    if not missing.empty:
        logger.error("Error validating historical dates. Missing: %s",
                     missing)
    return missing.empty
def validate_columns(expected, received):
    """Verify that the `received` columns scraped are equal to `expected`.

    Both arguments may be any iterable of column names (list, tuple,
    pandas Index); names and order must match exactly. On mismatch the
    difference is logged and sent to Slack.

    Returns:
        `True` when the columns match, `False` otherwise.
    """
    # Compare as plain lists: an element-wise `list == Index` comparison
    # raises when the lengths differ, and `all(...)` fails on two lists.
    expected_cols = list(expected)
    received_cols = list(received)
    valid = expected_cols == received_cols
    if not valid:
        msg = ("Columns expected differ from those received.\n"
               "Expected: {}\n"
               "Received: {}").format(", ".join(map(str, expected_cols)),
                                      ", ".join(map(str, received_cols)))
        logger.error(msg)
        slack_notification(msg, __name__)
    return valid
def validate_aggregate_file(aggregate_file, daily_files):
    """Re-reads `aggregate_file` and tells whether it equals the
    concatenation of `daily_files`."""
    from_disk = pd.read_csv(aggregate_file)
    rebuilt = cboe.concatenate_files(daily_files)
    return from_disk.equals(rebuilt)
+19
View File
@@ -0,0 +1,19 @@
FROM debian:latest
# MAINTAINER is deprecated; the equivalent metadata is a label.
LABEL maintainer="Juan Pablo Amoroso <jamoroso@lambdaclass.com>"

# Update and install in one layer so a cached `apt-get update` layer can
# never be paired with a newer package list.
RUN apt-get update \
    && apt-get install -y python3 python3-pip make cron build-essential pkg-config openssl libssl-dev \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install pipenv

ENV LC_ALL=C.UTF-8 LANG=C.UTF-8

COPY . /finance
WORKDIR /finance
RUN make init

COPY ./docker/data_scraper/crontab /etc/cron.d/scraper-cron
COPY ./docker/data_scraper/entrypoint.sh /usr/bin/entrypoint.sh
COPY ./docker/data_scraper/run-task.sh /usr/bin/run-task

# The crontab uses the system format (it has a user field), which cron reads
# from /etc/cron.d. It is not also installed with `crontab`: as a user
# crontab the "root" field would be run as the command.
# The scripts are made executable in case the mode was lost in the checkout.
RUN chmod 0644 /etc/cron.d/scraper-cron \
    && chmod 0755 /usr/bin/entrypoint.sh /usr/bin/run-task

# entrypoint.sh saves the container environment, then starts cron in the
# foreground.
ENTRYPOINT ["entrypoint.sh"]
CMD ["cron", "-f"]
+4
View File
@@ -0,0 +1,4 @@
# System-format crontab: minute hour day-of-month month day-of-week user command
# NOTE(review): times are in the container's timezone - confirm 19:00 is the
# intended hour relative to the market close.
# Weekdays at 19:00: scrape CBOE and Tiingo.
0 19 * * 1-5 root cd /finance && run-task make scrape scraper=cboe
0 19 * * 1-5 root cd /finance && run-task make scrape scraper=tiingo
# First day of each month at 00:00: aggregate, then back up if that succeeded.
0 0 1 * * root cd /finance && run-task make aggregate && run-task make backup
+5
View File
@@ -0,0 +1,5 @@
#!/bin/bash
# cron does not read env, save it here
# (one NAME=value per line, as printed by `env`; read back by run-task)
env > /root/env
# Replace this shell with the container command passed as arguments.
exec "$@"
+4
View File
@@ -0,0 +1,4 @@
#!/bin/bash
# import env vars that were written in entrypoint
# Runs the given command with exactly the environment saved in /root/env.
# Each NAME=value line is read as one argument, so values containing spaces
# stay intact, and "$@" keeps the command's own arguments unsplit.
saved_env=()
while IFS= read -r line
do
    # Only NAME=value lines are assignments; anything else is skipped so
    # `env` does not mistake it for the command to run.
    if [[ "$line" == *=* ]]; then
        saved_env+=("$line")
    fi
done < /root/env
env - "${saved_env[@]}" "$@"
+8
View File
@@ -0,0 +1,8 @@
version: "3"
services:
  # Runs the locally built `data_scraper` image.
  scraper:
    image: data_scraper:latest
    container_name: data_scraper
    volumes:
      # Host directory mounted over /finance/data in the container.
      # NOTE(review): presumably this is where scraped data is written - confirm.
      - ~/finance/data:/finance/data