Files
2022-06-23 22:18:33 +08:00

118 lines
3.3 KiB
Python

from unicodedata import name
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import requests_cache
from pathlib import Path
import re
import csv
import lxml
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import logging, sys
logger = logging.getLogger(__file__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# cache and retry
session = requests_cache.CachedSession('.demo_cache')
retries = Retry(total=5,
backoff_factor=0.1,
status_forcelist=[ 500, 502, 503, 504 ])
session.mount('http://', HTTPAdapter(max_retries=retries))
sec_url = 'https://www.sec.gov'
def get_request(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'Accept-Encoding': 'gzip, deflate, br',
'HOST': 'www.sec.gov',
}
return session.get(url, headers=headers)
def create_url(cik):
return 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany&type=13F-HR'.format(cik)
def get_user_input():
cik = eval(input("Enter 10-digit CIK number: "))
return cik
def scrap_company_report(requested_cik, name):
# Find mutual fund by CIK number on EDGAR
url = create_url(requested_cik)
logger.debug(f"index '{url}'")
response = get_request(url)
soup = BeautifulSoup(response.text, "html.parser")
main = soup.find(id="seriesDiv")
rows = main.findAll('tr')[1:] # skip header
if len(rows)==0:
logger.warn(f"no reports for {name} {url}")
for row in rows[:2]:
date = row.findAll('td')[3].text
tag = row.find('a', id="documentsbutton")
last_report = (sec_url + tag['href'])
logger.debug(f"scrap_report_by_url '{last_report}' '{name}/{date}.csv'")
scrap_report_by_url(last_report, f"{name}/{date}")
def scrap_report_by_url(url, name):
response_two = get_request(url)
soup_two = BeautifulSoup(response_two.text, "html.parser")
tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')})
xml_url = tags_two[2].get('href')
response_xml = get_request(sec_url + xml_url)
soup_xml = BeautifulSoup(response_xml.content, "html.parser")
table = soup_xml.find(summary="Form 13F-NT Header Information")
df = pd.read_html(str(table), header=[1, 2])[0]
df.columns = [(b if a.startswith('Unnamed') else f"{a} {b}")for a,b in df.columns ]
# df.columns = [" ",] + list(df.columns[1:])
fo = Path(f"output/{name}.csv")
fo.parent.mkdir(exist_ok=True)
df.to_csv(fo, index=False)
logger.info(fo)
# List of Investments
CIK_LIST = [
{
'name': 'Buffett',
'cik': '0001067983'
}, {
'name': 'JPMorgan',
'cik': '0000019617'
}, {
'name': 'Bridgewater',
'cik': '0001350694'
}, {
'name': 'Renaissance',
'cik': '0001037389'
}, {
'name': 'TwoSigma',
'cik': '0001179392'
}, {
'name': 'DEShaw',
'cik': '0001009207'
}, {
'name': 'Millenium',
'cik': '0001273087'
}, {
'name': 'Bluecrest',
'cik': '0001610880'
}, {
'name': 'AQR',
'cik': '0001167557'
},
{
'name': 'Scion Asset Management',
'cik': '0001649339'
},
{
'name': 'KYNIKOS ASSOCIATES LP',
'cik': '0001446440'
},
]
for row in tqdm(CIK_LIST):
scrap_company_report(row['cik'], row['name'])