mirror of
https://github.com/wassname/sec-web-scraper-13f.git
synced 2026-06-27 16:31:41 +08:00
Init commit
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
[[source]]
|
||||
name = "pypi"
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[packages]
|
||||
requests = "*"
|
||||
beautifulsoup4 = "*"
|
||||
lxml = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.7"
|
||||
Generated
+103
@@ -0,0 +1,103 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b4132b4e5879e89d02d9b476d89df9ef85ea6659012e068b23e04b55d0bc31da"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.7"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"beautifulsoup4": {
|
||||
"hashes": [
|
||||
"sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
|
||||
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
|
||||
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.7.1"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
|
||||
"sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695"
|
||||
],
|
||||
"version": "==2019.6.16"
|
||||
},
|
||||
"chardet": {
|
||||
"hashes": [
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
|
||||
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
|
||||
],
|
||||
"version": "==2.8"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
"sha256:06c7616601430aa140a69f97e3116308fffe0848f543b639a5ec2e8920ae72fd",
|
||||
"sha256:177202792f9842374a8077735c69c41a4282183f7851443d2beb8ee310720819",
|
||||
"sha256:19317ad721ceb9e39847d11131903931e2794e447d4751ebb0d9236f1b349ff2",
|
||||
"sha256:36d206e62f3e5dbaafd4ec692b67157e271f5da7fd925fda8515da675eace50d",
|
||||
"sha256:387115b066c797c85f9861a9613abf50046a15aac16759bc92d04f94acfad082",
|
||||
"sha256:3ce1c49d4b4a7bc75fb12acb3a6247bb7a91fe420542e6d671ba9187d12a12c2",
|
||||
"sha256:4d2a5a7d6b0dbb8c37dab66a8ce09a8761409c044017721c21718659fa3365a1",
|
||||
"sha256:58d0a1b33364d1253a88d18df6c0b2676a1746d27c969dc9e32d143a3701dda5",
|
||||
"sha256:62a651c618b846b88fdcae0533ec23f185bb322d6c1845733f3123e8980c1d1b",
|
||||
"sha256:69ff21064e7debc9b1b1e2eee8c2d686d042d4257186d70b338206a80c5bc5ea",
|
||||
"sha256:7060453eba9ba59d821625c6af6a266bd68277dce6577f754d1eb9116c094266",
|
||||
"sha256:7d26b36a9c4bce53b9cfe42e67849ae3c5c23558bc08363e53ffd6d94f4ff4d2",
|
||||
"sha256:83b427ad2bfa0b9705e02a83d8d607d2c2f01889eb138168e462a3a052c42368",
|
||||
"sha256:923d03c84534078386cf50193057aae98fa94cace8ea7580b74754493fda73ad",
|
||||
"sha256:b773715609649a1a180025213f67ffdeb5a4878c784293ada300ee95a1f3257b",
|
||||
"sha256:baff149c174e9108d4a2fee192c496711be85534eab63adb122f93e70aa35431",
|
||||
"sha256:bca9d118b1014b4c2d19319b10a3ebed508ff649396ce1855e1c96528d9b2fa9",
|
||||
"sha256:ce580c28845581535dc6000fc7c35fdadf8bea7ccb57d6321b044508e9ba0685",
|
||||
"sha256:d34923a569e70224d88e6682490e24c842907ba2c948c5fd26185413cbe0cd96",
|
||||
"sha256:dd9f0e531a049d8b35ec5e6c68a37f1ba6ec3a591415e6804cbdf652793d15d7",
|
||||
"sha256:ecb805cbfe9102f3fd3d2ef16dfe5ae9e2d7a7dfbba92f4ff1e16ac9784dbfb0",
|
||||
"sha256:ede9aad2197a0202caff35d417b671f5f91a3631477441076082a17c94edd846",
|
||||
"sha256:ef2d1fc370400e0aa755aab0b20cf4f1d0e934e7fd5244f3dd4869078e4942b9",
|
||||
"sha256:f2fec194a49bfaef42a548ee657362af5c7a640da757f6f452a35da7dd9f923c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.3.4"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
|
||||
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.22.0"
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
"sha256:72b5f1aea9101cf720a36bb2327ede866fd6f1a07b1e87c92a1cc18113cbc946",
|
||||
"sha256:e4e9c053d59795e440163733a7fec6c5972210e1790c507e4c7b051d6c5259de"
|
||||
],
|
||||
"version": "==1.9.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
|
||||
"sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
|
||||
],
|
||||
"version": "==1.25.3"
|
||||
}
|
||||
},
|
||||
"develop": {}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
# EDGAR Python Web Scraper
|
||||
This repository contains Gary Pang's Python web scraper, which parses fund holdings pulled from the SEC's [EDGAR](https://www.sec.gov/edgar/searchedgar/companysearch.html) website and writes the data to a .tsv file.
|
||||
|
||||
|
||||
## Requirements
|
||||
|
||||
#### Getting Started
|
||||
- `pip install -r requirements.txt` (or `pipenv install` if you are using pipenv)
|
||||
- `python scraper.py` (or `pipenv run python scraper.py`)
|
||||
- When prompted, enter the 10-digit CIK number of a mutual fund
|
||||
|
||||
#### Key Dependencies
|
||||
|
||||
- [Requests](https://requests.readthedocs.io/), Python library for making HTTP requests
|
||||
- [lxml](https://lxml.de/), Python library for processing XML and HTML
|
||||
- [Beautiful Soup](https://pypi.org/project/beautifulsoup4/), Python library for scraping information from Web pages
|
||||
- [re](https://docs.python.org/3/library/re.html), Python module for using regular expressions
|
||||
- [csv](https://docs.python.org/3/library/csv.html), Python module for parsing and writing CSV and TSV files
|
||||
|
||||
## Contributor
|
||||
- [Gary Pang](https://github.com/CodeWritingCow)
|
||||
|
||||
## References
|
||||
- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm)
|
||||
@@ -0,0 +1,3 @@
|
||||
requests==2.22.0
|
||||
lxml==4.3.4
|
||||
beautifulsoup4==4.7.1
|
||||
+44
@@ -0,0 +1,44 @@
|
||||
import requests
|
||||
import re
|
||||
import csv
|
||||
import lxml
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Scheme and host of the SEC website; prepended to the hrefs scraped from
# EDGAR pages below to build the URLs that are fetched next.
sec_url = 'https://www.sec.gov'
|
||||
|
||||
def get_request(url, timeout=30, headers=None):
    """Issue an HTTP GET for *url* and return the ``requests`` response.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole script.
        headers: Optional mapping of extra HTTP headers to send.
            NOTE(review): SEC's fair-access guidance reportedly expects a
            descriptive ``User-Agent`` -- confirm and pass one here if
            requests start being rejected.

    Returns:
        The ``requests.Response`` object; the status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    return requests.get(url, headers=headers, timeout=timeout)
|
||||
|
||||
def create_url(cik):
    """Build the EDGAR company-browse URL that lists 13F-HR filings for *cik*.

    Args:
        cik: The filer's CIK, as a string or integer. Surrounding whitespace
            is ignored.

    Returns:
        The full ``browse-edgar`` URL as a string.
    """
    from urllib.parse import quote

    # The CIK comes straight from user input: strip stray whitespace and
    # percent-encode it so characters such as '&' or '=' cannot corrupt the
    # query string or inject extra parameters.
    safe_cik = quote(str(cik).strip(), safe='')
    return (
        'https://www.sec.gov/cgi-bin/browse-edgar'
        '?CIK={}&owner=exclude&action=getcompany&type=13F-HR'
    ).format(safe_cik)
|
||||
|
||||
def get_user_input():
    """Prompt the user for a CIK number and return the line they entered."""
    return input("Enter 10-digit CIK number: ")
|
||||
|
||||
requested_cik = get_user_input()

# Find mutual fund by CIK number on EDGAR: fetch the filing list and collect
# the links whose id is "documentsbutton".
response = get_request(create_url(requested_cik))
soup = BeautifulSoup(response.text, "html.parser")
tags = soup.find_all('a', id="documentsbutton")
if not tags:
    # An unknown CIK (or an unexpected page) yields no links; fail with a
    # readable message instead of an IndexError on tags[0].
    raise SystemExit(
        'No filing links found on EDGAR for CIK {!r}'.format(requested_cik)
    )

# Find latest 13F report for mutual fund: follow the first link in the list.
response_two = get_request(sec_url + tags[0]['href'])
soup_two = BeautifulSoup(response_two.text, "html.parser")
tags_two = soup_two.find_all('a', attrs={'href': re.compile('xml')})
if len(tags_two) < 4:
    raise SystemExit(
        'Expected at least 4 XML links on the filing page, found {}'.format(
            len(tags_two)
        )
    )
# The fourth link whose href contains "xml" is used as the holdings document.
# NOTE(review): presumably the raw information-table XML -- this relies on
# the link order of the filing index page; confirm.
xml_url = tags_two[3].get('href')

response_xml = get_request(sec_url + xml_url)
soup_xml = BeautifulSoup(response_xml.content, "lxml")

# Find all issuers: every tag under <body> whose name matches "nameofissuer".
issuers = soup_xml.body.find_all(re.compile('nameofissuer'))
for issuer in issuers:
    print(issuer.text)

# Write issuer names to TSV file, one name per row, in "<cik>.tsv".
# newline='' is what the csv module requires so that no blank rows appear on
# Windows; an explicit encoding keeps non-ASCII names from failing under a
# narrow platform default.
with open('{}.tsv'.format(requested_cik), 'wt',
          newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerows([issuer.text] for issuer in issuers)
|
||||
Reference in New Issue
Block a user