mirror of
https://github.com/wassname/sec-web-scraper-13f.git
synced 2026-06-27 18:06:45 +08:00
145 lines
4.3 KiB
Python
145 lines
4.3 KiB
Python
from unicodedata import name
|
|
import pandas as pd
|
|
from requests.adapters import HTTPAdapter, Retry
|
|
import requests_cache
|
|
from pathlib import Path
|
|
import re
|
|
import csv
|
|
import lxml
|
|
from bs4 import BeautifulSoup
|
|
from tqdm.auto import tqdm
|
|
import logging, sys
|
|
|
|
logger = logging.getLogger(__file__)
|
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
|
|
# cache and retry
|
|
session = requests_cache.CachedSession('.demo_cache')
|
|
retries = Retry(total=5,
|
|
backoff_factor=0.1,
|
|
status_forcelist=[ 500, 502, 503, 504 ])
|
|
session.mount('http://', HTTPAdapter(max_retries=retries))
|
|
|
|
sec_url = 'https://www.sec.gov'
|
|
|
|
def get_request(url):
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
|
|
'Accept-Encoding': 'gzip, deflate, br',
|
|
'HOST': 'www.sec.gov',
|
|
}
|
|
return session.get(url, headers=headers)
|
|
|
|
def create_url(cik):
|
|
return 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany&type=13F-HR'.format(cik)
|
|
|
|
def get_user_input():
|
|
cik = eval(input("Enter 10-digit CIK number: "))
|
|
return cik
|
|
|
|
|
|
def scrap_company_report(requested_cik, name):
|
|
# Find mutual fund by CIK number on EDGAR
|
|
url = create_url(requested_cik)
|
|
logger.debug(f"index '{url}'")
|
|
response = get_request(url)
|
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
main = soup.find(id="seriesDiv")
|
|
rows = main.findAll('tr')[1:] # skip header
|
|
for row in rows[:4]:
|
|
date = row.findAll('td')[3].text
|
|
tag = row.find('a', id="documentsbutton")
|
|
last_report = (sec_url + tag['href'])
|
|
logger.debug(f"scrap_report_by_url '{last_report}' '{name}/{date}.csv'")
|
|
scrap_report_by_url(last_report, f"{name}/{date}")
|
|
|
|
|
|
def scrap_report_by_url(url, filename):
|
|
response_two = get_request(url)
|
|
soup_two = BeautifulSoup(response_two.text, "html.parser")
|
|
tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')})
|
|
xml_url = tags_two[3].get('href')
|
|
|
|
response_xml = get_request(sec_url + xml_url)
|
|
soup_xml = BeautifulSoup(response_xml.content, "lxml")
|
|
xml_to_csv(soup_xml, filename)
|
|
|
|
|
|
def xml_to_csv(soup_xml, name):
|
|
|
|
columns = [
|
|
"Name of Issuer",
|
|
"CUSIP",
|
|
"Value (x$1000)",
|
|
"Shares",
|
|
"Investment Discretion",
|
|
"Voting Sole / Shared / None"
|
|
]
|
|
issuers = soup_xml.body.findAll(re.compile('nameofissuer'))
|
|
cusips = soup_xml.body.findAll(re.compile('cusip'))
|
|
values = soup_xml.body.findAll(re.compile('value'))
|
|
sshprnamts = soup_xml.body.findAll('sshprnamt')
|
|
sshprnamttypes = soup_xml.body.findAll(re.compile('sshprnamttype'))
|
|
investmentdiscretions = soup_xml.body.findAll(re.compile('investmentdiscretion'))
|
|
soles = soup_xml.body.findAll(re.compile('sole'))
|
|
shareds = soup_xml.body.findAll(re.compile('shared'))
|
|
nones = soup_xml.body.findAll(re.compile('none'))
|
|
|
|
df = pd.DataFrame(columns= columns)
|
|
|
|
for issuer, cusip, value, sshprnamt, sshprnamttype, investmentdiscretion, sole, shared, none in zip(issuers, cusips, values, sshprnamts, sshprnamttypes, investmentdiscretions, soles, shareds, nones):
|
|
row = {
|
|
"Name of Issuer": issuer.text,
|
|
"CUSIP": cusip.text,
|
|
"Value (x$1000)": value.text,
|
|
"Shares": f"{sshprnamt.text} {sshprnamttype.text}",
|
|
"Investment Discretion": investmentdiscretion.text,
|
|
"Voting Sole / Shared / None": f"{sole.text} / {shared.text} / {none.text}"
|
|
}
|
|
df = df.append(row, ignore_index=True)
|
|
|
|
|
|
fo = Path(f"output/{name}.csv")
|
|
fo.parent.mkdir(exist_ok=True)
|
|
df.to_csv(fo)
|
|
|
|
# List of Investments
|
|
CIK_LIST = [{
|
|
'name': 'Buffett',
|
|
'cik': '0001067983'
|
|
}, {
|
|
'name': 'JPMorgan',
|
|
'cik': '0000019617'
|
|
}, {
|
|
'name': 'Bridgewater',
|
|
'cik': '0001350694'
|
|
}, {
|
|
'name': 'Renaissance',
|
|
'cik': '0001037389'
|
|
}, {
|
|
'name': 'TwoSigma',
|
|
'cik': '0001179392'
|
|
}, {
|
|
'name': 'DEShaw',
|
|
'cik': '0001009207'
|
|
}, {
|
|
'name': 'Millenium',
|
|
'cik': '0001273087'
|
|
}, {
|
|
'name': 'Bluecrest',
|
|
'cik': '0001610880'
|
|
}, {
|
|
'name': 'AQR',
|
|
'cik': '0001167557'
|
|
},{
|
|
'name': 'Scion Asset Management',
|
|
'cik': '0001649339'
|
|
},
|
|
{
|
|
'name': 'Burry Michael J',
|
|
'cik': '0001342573'
|
|
},
|
|
]
|
|
for row in tqdm(CIK_LIST):
|
|
scrap_company_report(row['cik'], row['name'])
|