From 460df8d648953e3cdbc73b8d61c44d68944dea11 Mon Sep 17 00:00:00 2001 From: wassname Date: Thu, 23 Jun 2022 20:40:11 +0800 Subject: [PATCH] working --- .gitignore | 2 + Pipfile | 14 ----- Pipfile.lock | 141 ----------------------------------------------- requirements.txt | 5 +- scraper.py | 86 ++++++++++++++++++++++++----- 5 files changed, 77 insertions(+), 171 deletions(-) create mode 100644 .gitignore delete mode 100644 Pipfile delete mode 100644 Pipfile.lock diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4509a30 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/output +.demo_cache.sqlite diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 4e475b1..0000000 --- a/Pipfile +++ /dev/null @@ -1,14 +0,0 @@ -[[source]] -name = "pypi" -url = "https://pypi.org/simple" -verify_ssl = true - -[dev-packages] - -[packages] -requests = "*" -beautifulsoup4 = "*" -lxml = "*" - -[requires] -python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index ee97a7d..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,141 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "b4132b4e5879e89d02d9b476d89df9ef85ea6659012e068b23e04b55d0bc31da" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.7" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "beautifulsoup4": { - "hashes": [ - "sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", - "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", - "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718" - ], - "index": "pypi", - "version": "==4.7.1" - }, - "certifi": { - "hashes": [ - "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", - "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" - ], - "version": "==2021.10.8" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "idna": { - "hashes": [ - "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", - "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" - ], - "version": "==2.8" - }, - "lxml": { - "hashes": [ - "sha256:11ae552a78612620afd15625be9f1b82e3cc2e634f90d6b11709b10a100cba59", - "sha256:121fc6f71c692b49af6c963b84ab7084402624ffbe605287da362f8af0668ea3", - "sha256:124f09614f999551ac65e5b9875981ce4b66ac4b8e2ba9284572f741935df3d9", - "sha256:12ae2339d32a2b15010972e1e2467345b7bf962e155671239fba74c229564b7f", - "sha256:12d8d6fe3ddef629ac1349fa89a638b296a34b6529573f5055d1cb4e5245f73b", - "sha256:1a2a7659b8eb93c6daee350a0d844994d49245a0f6c05c747f619386fb90ba04", - "sha256:1ccbfe5d17835db906f2bab6f15b34194db1a5b07929cba3cf45a96dbfbfefc0", - "sha256:2f77556266a8fe5428b8759fbfc4bd70be1d1d9c9b25d2a414f6a0c0b0f09120", - "sha256:3534d7c468c044f6aef3c0aff541db2826986a29ea73f2ca831f5d5284d9b570", - "sha256:3884476a90d415be79adfa4e0e393048630d0d5bcd5757c4c07d8b4b00a1096b", - "sha256:3b95fb7e6f9c2f53db88f4642231fc2b8907d854e614710996a96f1f32018d5c", - "sha256:46515773570a33eae13e451c8fcf440222ef24bd3b26f40774dd0bd8b6db15b2", - "sha256:46f21f2600d001af10e847df9eb3b832e8a439f696c04891bcb8a8cedd859af9", - "sha256:473701599665d874919d05bb33b56180447b3a9da8d52d6d9799f381ce23f95c", - "sha256:4b9390bf973e3907d967b75be199cf1978ca8443183cf1e78ad80ad8be9cf242", - "sha256:4f415624cf8b065796649a5e4621773dc5c9ea574a944c76a7f8a6d3d2906b41", - "sha256:534032a5ceb34bba1da193b7d386ac575127cc39338379f39a164b10d97ade89", - "sha256:558485218ee06458643b929765ac1eb04519ca3d1e2dcc288517de864c747c33", - "sha256:57cf05466917e08f90e323f025b96f493f92c0344694f5702579ab4b7e2eb10d", - "sha256:59d77bfa3bea13caee95bc0d3f1c518b15049b97dd61ea8b3d71ce677a67f808", - "sha256:5d5254c815c186744c8f922e2ce861a2bdeabc06520b4b30b2f7d9767791ce6e", - "sha256:5ea121cb66d7e5cb396b4c3ca90471252b94e01809805cfe3e4e44be2db3a99c", - "sha256:60aeb14ff9022d2687ef98ce55f6342944c40d00916452bb90899a191802137a", - "sha256:642eb4cabd997c9b949a994f9643cd8ae00cf4ca8c5cd9c273962296fadf1c44", - "sha256:6548fc551de15f310dd0564751d9dc3d405278d45ea9b2b369ed1eccf142e1f5", - "sha256:68a851176c931e2b3de6214347b767451243eeed3bea34c172127bbb5bf6c210", - "sha256:6e84edecc3a82f90d44ddee2ee2a2630d4994b8471816e226d2b771cda7ac4ca", - "sha256:73e8614258404b2689a26cb5d002512b8bc4dfa18aca86382f68f959aee9b0c8", - "sha256:7679bb6e4d9a3978a46ab19a3560e8d2b7265ef3c88152e7fdc130d649789887", - "sha256:76b6c296e4f7a1a8a128aec42d128646897f9ae9a700ef6839cdc9b3900db9b5", - "sha256:7f00cc64b49d2ef19ddae898a3def9dd8fda9c3d27c8a174c2889ee757918e71", - "sha256:8021eeff7fabde21b9858ed058a8250ad230cede91764d598c2466b0ba70db8b", - "sha256:87f8f7df70b90fbe7b49969f07b347e3f978f8bd1046bb8ecae659921869202b", - "sha256:916d457ad84e05b7db52700bad0a15c56e0c3000dcaf1263b2fb7a56fe148996", - "sha256:925174cafb0f1179a7fd38da90302555d7445e34c9ece68019e53c946be7f542", - "sha256:9801bcd52ac9c795a7d81ea67471a42cffe532e46cfb750cd5713befc5c019c0", - "sha256:99cf827f5a783038eb313beee6533dddb8bdb086d7269c5c144c1c952d142ace", - "sha256:a21b78af7e2e13bec6bea12fc33bc05730197674f3e5402ce214d07026ccfebd", - "sha256:a52e8f317336a44836475e9c802f51c2dc38d612eaa76532cb1d17690338b63b", - "sha256:a702005e447d712375433ed0499cb6e1503fadd6c96a47f51d707b4d37b76d3c", - "sha256:a708c291900c40a7ecf23f1d2384ed0bc0604e24094dd13417c7e7f8f7a50d93", - "sha256:a7790a273225b0c46e5f859c1327f0f659896cc72eaa537d23aa3ad9ff2a1cc1", - "sha256:abcf7daa5ebcc89328326254f6dd6d566adb483d4d00178892afd386ab389de2", - "sha256:add017c5bd6b9ec3a5f09248396b6ee2ce61c5621f087eb2269c813cd8813808", - "sha256:af4139172ff0263d269abdcc641e944c9de4b5d660894a3ec7e9f9db63b56ac9", - "sha256:b4015baed99d046c760f09a4c59d234d8f398a454380c3cf0b859aba97136090", - "sha256:ba0006799f21d83c3717fe20e2707a10bbc296475155aadf4f5850f6659b96b9", - "sha256:bdb98f4c9e8a1735efddfaa995b0c96559792da15d56b76428bdfc29f77c4cdb", - "sha256:c34234a1bc9e466c104372af74d11a9f98338a3f72fae22b80485171a64e0144", - "sha256:c580c2a61d8297a6e47f4d01f066517dbb019be98032880d19ece7f337a9401d", - "sha256:ca9a40497f7e97a2a961c04fa8a6f23d790b0521350a8b455759d786b0bcb203", - "sha256:cab343b265e38d4e00649cbbad9278b734c5715f9bcbb72c85a1f99b1a58e19a", - "sha256:ce52aad32ec6e46d1a91ff8b8014a91538800dd533914bfc4a82f5018d971408", - "sha256:da07c7e7fc9a3f40446b78c54dbba8bfd5c9100dfecb21b65bfe3f57844f5e71", - "sha256:dc8a0dbb2a10ae8bb609584f5c504789f0f3d0d81840da4849102ec84289f952", - "sha256:e5b4b0d9440046ead3bd425eb2b852499241ee0cef1ae151038e4f87ede888c4", - "sha256:f33d8efb42e4fc2b31b3b4527940b25cdebb3026fb56a80c1c1c11a4271d2352", - "sha256:f6befb83bca720b71d6bd6326a3b26e9496ae6649e26585de024890fe50f49b8", - "sha256:fcc849b28f584ed1dbf277291ded5c32bb3476a37032df4a1d523b55faa5f944", - "sha256:ff44de36772b05c2eb74f2b4b6d1ae29b8f41ed5506310ce1258d44826ee38c1" - ], - "index": "pypi", - "version": "==4.6.5" - }, - "requests": { - "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" - ], - "index": "pypi", - "version": "==2.22.0" - }, - "soupsieve": { - "hashes": [ - "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", - "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9" - ], - "markers": "python_version >= '3.6'", - "version": "==2.3.1" - }, - "urllib3": { - "hashes": [ - "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2", - "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.25.11" - } - }, - "develop": {} -} diff --git a/requirements.txt b/requirements.txt index 118c8d7..b168a7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ requests==2.22.0 lxml>=4.6.2 -beautifulsoup4==4.7.1 \ No newline at end of file +beautifulsoup4==4.7.1 +pandas +requests-cache +tqdm diff --git a/scraper.py b/scraper.py index ab3446b..90b4868 100644 --- a/scraper.py +++ b/scraper.py @@ -1,10 +1,24 @@ from unicodedata import name import pandas as pd -import requests +from requests.adapters import HTTPAdapter, Retry +import requests_cache +from pathlib import Path import re import csv import lxml from bs4 import BeautifulSoup +from tqdm.auto import tqdm +import logging, sys + +logger = logging.getLogger(__file__) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) + +# cache and retry +session = requests_cache.CachedSession('.demo_cache') +retries = Retry(total=5, + backoff_factor=0.1, + status_forcelist=[ 500, 502, 503, 504 ]) +session.mount('http://', HTTPAdapter(max_retries=retries)) sec_url = 'https://www.sec.gov' @@ -14,26 +28,30 @@ def get_request(url): 'Accept-Encoding': 'gzip, deflate, br', 'HOST': 'www.sec.gov', } - return requests.get(url, headers=headers) + return session.get(url, headers=headers) def create_url(cik): return 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany&type=13F-HR'.format(cik) def get_user_input(): - cik = input("Enter 10-digit CIK number: ") + cik = eval(input("Enter 10-digit CIK number: ")) return cik -def scrap_company_report(requested_cik): +def scrap_company_report(requested_cik, name): # Find mutual fund by CIK number on EDGAR - response = get_request(create_url(requested_cik)) + url = create_url(requested_cik) + logger.debug(f"index '{url}'") + response = get_request(url) soup = BeautifulSoup(response.text, "html.parser") - tags = soup.findAll('a', id="documentsbutton") - - last_report = (sec_url + tags[0]['href']) - previous_report = (sec_url + tags[1]['href']) - scrap_report_by_url(last_report, "last_report") - scrap_report_by_url(previous_report, "previous_report") + main = soup.find(id="seriesDiv") + rows = main.findAll('tr')[1:] # skip header + for row in rows[:4]: + date = row.findAll('td')[3].text + tag = row.find('a', id="documentsbutton") + last_report = (sec_url + tag['href']) + logger.debug(f"scrap_report_by_url '{last_report}' '{name}/{date}.csv'") + scrap_report_by_url(last_report, f"{name}/{date}") def scrap_report_by_url(url, filename): @@ -81,8 +99,46 @@ def xml_to_csv(soup_xml, name): df = df.append(row, ignore_index=True) - df.to_csv(f"{name}.csv") + fo = Path(f"output/{name}.csv") + fo.parent.mkdir(exist_ok=True) + df.to_csv(fo) - -requested_cik = get_user_input() -scrap_company_report(requested_cik) +# List of Investments +CIK_LIST = [{ + 'name': 'Buffett', + 'cik': '0001067983' +}, { + 'name': 'JPMorgan', + 'cik': '0000019617' +}, { + 'name': 'Bridgewater', + 'cik': '0001350694' +}, { + 'name': 'Renaissance', + 'cik': '0001037389' +}, { + 'name': 'TwoSigma', + 'cik': '0001179392' +}, { + 'name': 'DEShaw', + 'cik': '0001009207' +}, { + 'name': 'Millenium', + 'cik': '0001273087' +}, { + 'name': 'Bluecrest', + 'cik': '0001610880' +}, { + 'name': 'AQR', + 'cik': '0001167557' +},{ + 'name': 'Scion Asset Management', + 'cik': '0001649339' +}, +{ + 'name': 'Burry Michael J', + 'cik': '0001342573' +}, +] +for row in tqdm(CIK_LIST): + scrap_company_report(row['cik'], row['name'])