use html parse

This commit is contained in:
wassname
2022-06-23 22:18:33 +08:00
parent 460df8d648
commit 241200c541
4 changed files with 11894 additions and 46 deletions
File diff suppressed because one or more lines are too long
+5
View File
@@ -22,3 +22,8 @@ This repository contains a Python Web scraper for parsing 13F filings (mutual fu
## References ## References
- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm) - [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm)
# TODO
- form 4 Form 4 - Insider Buying/Selling
+19 -46
View File
@@ -46,7 +46,10 @@ def scrap_company_report(requested_cik, name):
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
main = soup.find(id="seriesDiv") main = soup.find(id="seriesDiv")
rows = main.findAll('tr')[1:] # skip header rows = main.findAll('tr')[1:] # skip header
for row in rows[:4]: if len(rows)==0:
logger.warn(f"no reports for {name} {url}")
for row in rows[:2]:
date = row.findAll('td')[3].text date = row.findAll('td')[3].text
tag = row.find('a', id="documentsbutton") tag = row.find('a', id="documentsbutton")
last_report = (sec_url + tag['href']) last_report = (sec_url + tag['href'])
@@ -54,57 +57,26 @@ def scrap_company_report(requested_cik, name):
scrap_report_by_url(last_report, f"{name}/{date}") scrap_report_by_url(last_report, f"{name}/{date}")
def scrap_report_by_url(url, filename): def scrap_report_by_url(url, name):
response_two = get_request(url) response_two = get_request(url)
soup_two = BeautifulSoup(response_two.text, "html.parser") soup_two = BeautifulSoup(response_two.text, "html.parser")
tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')}) tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')})
xml_url = tags_two[3].get('href') xml_url = tags_two[2].get('href')
response_xml = get_request(sec_url + xml_url) response_xml = get_request(sec_url + xml_url)
soup_xml = BeautifulSoup(response_xml.content, "lxml") soup_xml = BeautifulSoup(response_xml.content, "html.parser")
xml_to_csv(soup_xml, filename) table = soup_xml.find(summary="Form 13F-NT Header Information")
df = pd.read_html(str(table), header=[1, 2])[0]
df.columns = [(b if a.startswith('Unnamed') else f"{a} {b}")for a,b in df.columns ]
def xml_to_csv(soup_xml, name): # df.columns = [" ",] + list(df.columns[1:])
columns = [
"Name of Issuer",
"CUSIP",
"Value (x$1000)",
"Shares",
"Investment Discretion",
"Voting Sole / Shared / None"
]
issuers = soup_xml.body.findAll(re.compile('nameofissuer'))
cusips = soup_xml.body.findAll(re.compile('cusip'))
values = soup_xml.body.findAll(re.compile('value'))
sshprnamts = soup_xml.body.findAll('sshprnamt')
sshprnamttypes = soup_xml.body.findAll(re.compile('sshprnamttype'))
investmentdiscretions = soup_xml.body.findAll(re.compile('investmentdiscretion'))
soles = soup_xml.body.findAll(re.compile('sole'))
shareds = soup_xml.body.findAll(re.compile('shared'))
nones = soup_xml.body.findAll(re.compile('none'))
df = pd.DataFrame(columns= columns)
for issuer, cusip, value, sshprnamt, sshprnamttype, investmentdiscretion, sole, shared, none in zip(issuers, cusips, values, sshprnamts, sshprnamttypes, investmentdiscretions, soles, shareds, nones):
row = {
"Name of Issuer": issuer.text,
"CUSIP": cusip.text,
"Value (x$1000)": value.text,
"Shares": f"{sshprnamt.text} {sshprnamttype.text}",
"Investment Discretion": investmentdiscretion.text,
"Voting Sole / Shared / None": f"{sole.text} / {shared.text} / {none.text}"
}
df = df.append(row, ignore_index=True)
fo = Path(f"output/{name}.csv") fo = Path(f"output/{name}.csv")
fo.parent.mkdir(exist_ok=True) fo.parent.mkdir(exist_ok=True)
df.to_csv(fo) df.to_csv(fo, index=False)
logger.info(fo)
# List of Investments # List of Investments
CIK_LIST = [{ CIK_LIST = [
{
'name': 'Buffett', 'name': 'Buffett',
'cik': '0001067983' 'cik': '0001067983'
}, { }, {
@@ -131,13 +103,14 @@ CIK_LIST = [{
}, { }, {
'name': 'AQR', 'name': 'AQR',
'cik': '0001167557' 'cik': '0001167557'
},{ },
{
'name': 'Scion Asset Management', 'name': 'Scion Asset Management',
'cik': '0001649339' 'cik': '0001649339'
}, },
{ {
'name': 'Burry Michael J', 'name': 'KYNIKOS ASSOCIATES LP',
'cik': '0001342573' 'cik': '0001446440'
}, },
] ]
for row in tqdm(CIK_LIST): for row in tqdm(CIK_LIST):
+5939
View File
File diff suppressed because one or more lines are too long