mirror of
https://github.com/wassname/sec-web-scraper-13f.git
synced 2026-06-27 16:46:51 +08:00
use html parse
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -22,3 +22,8 @@ This repository contains a Python Web scraper for parsing 13F filings (mutual fu
|
|||||||
|
|
||||||
## References
|
## References
|
||||||
- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm)
|
- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
|
||||||
|
- form 4 Form 4 - Insider Buying/Selling
|
||||||
|
|||||||
+19
-46
@@ -46,7 +46,10 @@ def scrap_company_report(requested_cik, name):
|
|||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
main = soup.find(id="seriesDiv")
|
main = soup.find(id="seriesDiv")
|
||||||
rows = main.findAll('tr')[1:] # skip header
|
rows = main.findAll('tr')[1:] # skip header
|
||||||
for row in rows[:4]:
|
if len(rows)==0:
|
||||||
|
logger.warn(f"no reports for {name} {url}")
|
||||||
|
|
||||||
|
for row in rows[:2]:
|
||||||
date = row.findAll('td')[3].text
|
date = row.findAll('td')[3].text
|
||||||
tag = row.find('a', id="documentsbutton")
|
tag = row.find('a', id="documentsbutton")
|
||||||
last_report = (sec_url + tag['href'])
|
last_report = (sec_url + tag['href'])
|
||||||
@@ -54,57 +57,26 @@ def scrap_company_report(requested_cik, name):
|
|||||||
scrap_report_by_url(last_report, f"{name}/{date}")
|
scrap_report_by_url(last_report, f"{name}/{date}")
|
||||||
|
|
||||||
|
|
||||||
def scrap_report_by_url(url, filename):
|
def scrap_report_by_url(url, name):
|
||||||
response_two = get_request(url)
|
response_two = get_request(url)
|
||||||
soup_two = BeautifulSoup(response_two.text, "html.parser")
|
soup_two = BeautifulSoup(response_two.text, "html.parser")
|
||||||
tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')})
|
tags_two = soup_two.findAll('a', attrs={'href': re.compile('xml')})
|
||||||
xml_url = tags_two[3].get('href')
|
xml_url = tags_two[2].get('href')
|
||||||
|
|
||||||
response_xml = get_request(sec_url + xml_url)
|
response_xml = get_request(sec_url + xml_url)
|
||||||
soup_xml = BeautifulSoup(response_xml.content, "lxml")
|
soup_xml = BeautifulSoup(response_xml.content, "html.parser")
|
||||||
xml_to_csv(soup_xml, filename)
|
table = soup_xml.find(summary="Form 13F-NT Header Information")
|
||||||
|
df = pd.read_html(str(table), header=[1, 2])[0]
|
||||||
|
df.columns = [(b if a.startswith('Unnamed') else f"{a} {b}")for a,b in df.columns ]
|
||||||
def xml_to_csv(soup_xml, name):
|
# df.columns = [" ",] + list(df.columns[1:])
|
||||||
|
|
||||||
columns = [
|
|
||||||
"Name of Issuer",
|
|
||||||
"CUSIP",
|
|
||||||
"Value (x$1000)",
|
|
||||||
"Shares",
|
|
||||||
"Investment Discretion",
|
|
||||||
"Voting Sole / Shared / None"
|
|
||||||
]
|
|
||||||
issuers = soup_xml.body.findAll(re.compile('nameofissuer'))
|
|
||||||
cusips = soup_xml.body.findAll(re.compile('cusip'))
|
|
||||||
values = soup_xml.body.findAll(re.compile('value'))
|
|
||||||
sshprnamts = soup_xml.body.findAll('sshprnamt')
|
|
||||||
sshprnamttypes = soup_xml.body.findAll(re.compile('sshprnamttype'))
|
|
||||||
investmentdiscretions = soup_xml.body.findAll(re.compile('investmentdiscretion'))
|
|
||||||
soles = soup_xml.body.findAll(re.compile('sole'))
|
|
||||||
shareds = soup_xml.body.findAll(re.compile('shared'))
|
|
||||||
nones = soup_xml.body.findAll(re.compile('none'))
|
|
||||||
|
|
||||||
df = pd.DataFrame(columns= columns)
|
|
||||||
|
|
||||||
for issuer, cusip, value, sshprnamt, sshprnamttype, investmentdiscretion, sole, shared, none in zip(issuers, cusips, values, sshprnamts, sshprnamttypes, investmentdiscretions, soles, shareds, nones):
|
|
||||||
row = {
|
|
||||||
"Name of Issuer": issuer.text,
|
|
||||||
"CUSIP": cusip.text,
|
|
||||||
"Value (x$1000)": value.text,
|
|
||||||
"Shares": f"{sshprnamt.text} {sshprnamttype.text}",
|
|
||||||
"Investment Discretion": investmentdiscretion.text,
|
|
||||||
"Voting Sole / Shared / None": f"{sole.text} / {shared.text} / {none.text}"
|
|
||||||
}
|
|
||||||
df = df.append(row, ignore_index=True)
|
|
||||||
|
|
||||||
|
|
||||||
fo = Path(f"output/{name}.csv")
|
fo = Path(f"output/{name}.csv")
|
||||||
fo.parent.mkdir(exist_ok=True)
|
fo.parent.mkdir(exist_ok=True)
|
||||||
df.to_csv(fo)
|
df.to_csv(fo, index=False)
|
||||||
|
logger.info(fo)
|
||||||
|
|
||||||
# List of Investments
|
# List of Investments
|
||||||
CIK_LIST = [{
|
CIK_LIST = [
|
||||||
|
{
|
||||||
'name': 'Buffett',
|
'name': 'Buffett',
|
||||||
'cik': '0001067983'
|
'cik': '0001067983'
|
||||||
}, {
|
}, {
|
||||||
@@ -131,13 +103,14 @@ CIK_LIST = [{
|
|||||||
}, {
|
}, {
|
||||||
'name': 'AQR',
|
'name': 'AQR',
|
||||||
'cik': '0001167557'
|
'cik': '0001167557'
|
||||||
},{
|
},
|
||||||
|
{
|
||||||
'name': 'Scion Asset Management',
|
'name': 'Scion Asset Management',
|
||||||
'cik': '0001649339'
|
'cik': '0001649339'
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
'name': 'Burry Michael J',
|
'name': 'KYNIKOS ASSOCIATES LP',
|
||||||
'cik': '0001342573'
|
'cik': '0001446440'
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
for row in tqdm(CIK_LIST):
|
for row in tqdm(CIK_LIST):
|
||||||
|
|||||||
+5939
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user