Init commit

This commit is contained in:
Gary Pang
2019-07-18 17:37:40 -04:00
commit 6d711ba63b
5 changed files with 188 additions and 0 deletions
+14
View File
@@ -0,0 +1,14 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
requests = "*"
beautifulsoup4 = "*"
lxml = "*"
[requires]
python_version = "3.7"
Generated
+103
View File
@@ -0,0 +1,103 @@
{
"_meta": {
"hash": {
"sha256": "b4132b4e5879e89d02d9b476d89df9ef85ea6659012e068b23e04b55d0bc31da"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
],
"index": "pypi",
"version": "==4.7.1"
},
"certifi": {
"hashes": [
"sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
"sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695"
],
"version": "==2019.6.16"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"lxml": {
"hashes": [
"sha256:06c7616601430aa140a69f97e3116308fffe0848f543b639a5ec2e8920ae72fd",
"sha256:177202792f9842374a8077735c69c41a4282183f7851443d2beb8ee310720819",
"sha256:19317ad721ceb9e39847d11131903931e2794e447d4751ebb0d9236f1b349ff2",
"sha256:36d206e62f3e5dbaafd4ec692b67157e271f5da7fd925fda8515da675eace50d",
"sha256:387115b066c797c85f9861a9613abf50046a15aac16759bc92d04f94acfad082",
"sha256:3ce1c49d4b4a7bc75fb12acb3a6247bb7a91fe420542e6d671ba9187d12a12c2",
"sha256:4d2a5a7d6b0dbb8c37dab66a8ce09a8761409c044017721c21718659fa3365a1",
"sha256:58d0a1b33364d1253a88d18df6c0b2676a1746d27c969dc9e32d143a3701dda5",
"sha256:62a651c618b846b88fdcae0533ec23f185bb322d6c1845733f3123e8980c1d1b",
"sha256:69ff21064e7debc9b1b1e2eee8c2d686d042d4257186d70b338206a80c5bc5ea",
"sha256:7060453eba9ba59d821625c6af6a266bd68277dce6577f754d1eb9116c094266",
"sha256:7d26b36a9c4bce53b9cfe42e67849ae3c5c23558bc08363e53ffd6d94f4ff4d2",
"sha256:83b427ad2bfa0b9705e02a83d8d607d2c2f01889eb138168e462a3a052c42368",
"sha256:923d03c84534078386cf50193057aae98fa94cace8ea7580b74754493fda73ad",
"sha256:b773715609649a1a180025213f67ffdeb5a4878c784293ada300ee95a1f3257b",
"sha256:baff149c174e9108d4a2fee192c496711be85534eab63adb122f93e70aa35431",
"sha256:bca9d118b1014b4c2d19319b10a3ebed508ff649396ce1855e1c96528d9b2fa9",
"sha256:ce580c28845581535dc6000fc7c35fdadf8bea7ccb57d6321b044508e9ba0685",
"sha256:d34923a569e70224d88e6682490e24c842907ba2c948c5fd26185413cbe0cd96",
"sha256:dd9f0e531a049d8b35ec5e6c68a37f1ba6ec3a591415e6804cbdf652793d15d7",
"sha256:ecb805cbfe9102f3fd3d2ef16dfe5ae9e2d7a7dfbba92f4ff1e16ac9784dbfb0",
"sha256:ede9aad2197a0202caff35d417b671f5f91a3631477441076082a17c94edd846",
"sha256:ef2d1fc370400e0aa755aab0b20cf4f1d0e934e7fd5244f3dd4869078e4942b9",
"sha256:f2fec194a49bfaef42a548ee657362af5c7a640da757f6f452a35da7dd9f923c"
],
"index": "pypi",
"version": "==4.3.4"
},
"requests": {
"hashes": [
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
],
"index": "pypi",
"version": "==2.22.0"
},
"soupsieve": {
"hashes": [
"sha256:72b5f1aea9101cf720a36bb2327ede866fd6f1a07b1e87c92a1cc18113cbc946",
"sha256:e4e9c053d59795e440163733a7fec6c5972210e1790c507e4c7b051d6c5259de"
],
"version": "==1.9.2"
},
"urllib3": {
"hashes": [
"sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
"sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
],
"version": "==1.25.3"
}
},
"develop": {}
}
+24
View File
@@ -0,0 +1,24 @@
# EDGAR Python Web Scraper
This repository contains Gary Pang's Python web scraper, which parses fund holdings pulled from the SEC's [EDGAR](https://www.sec.gov/edgar/searchedgar/companysearch.html) website and writes the data to a .tsv file.
## Requirements
#### Getting Started
- `pip install -r requirements.txt` (or `pipenv install` if you are using pipenv)
- `python scraper.py` (or `pipenv run python scraper.py`)
- When prompted, enter the 10-digit CIK number of a mutual fund
#### Key Dependencies
- [Requests](https://2.python-requests.org/en/master/), Python library for making HTTP requests
- [lxml](https://lxml.de/), Python library for processing XML and HTML
- [Beautiful Soup](https://pypi.org/project/beautifulsoup4/), Python library for scraping information from Web pages
- [re](https://docs.python.org/3/library/re.html), Python module for using regular expressions
- [csv](https://docs.python.org/3/library/csv.html), Python module for parsing and writing CSV and TSV files
## Contributor
- [Gary Pang](https://github.com/CodeWritingCow)
## References
- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm)
+3
View File
@@ -0,0 +1,3 @@
requests==2.22.0
lxml==4.3.4
beautifulsoup4==4.7.1
+44
View File
@@ -0,0 +1,44 @@
import requests
import re
import csv
import lxml
from bs4 import BeautifulSoup
# Scheme and host of the SEC website. The relative hrefs scraped from EDGAR
# pages below are appended to this to form absolute URLs.
sec_url = 'https://www.sec.gov'
def get_request(url, timeout=30, headers=None):
    """Perform an HTTP GET and return the ``requests.Response``.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole script.
        headers: Optional mapping of extra HTTP headers to send.
            NOTE(review): SEC's fair-access guidance reportedly asks automated
            clients to send a descriptive User-Agent -- confirm and pass it
            here if requests start being rejected.

    Returns:
        The response object; the HTTP status is NOT checked here, so callers
        decide how to treat error responses.
    """
    return requests.get(url, headers=headers, timeout=timeout)
def create_url(cik):
    """Build the EDGAR company-browse URL that lists a filer's 13F-HR filings.

    Args:
        cik: CIK identifier of the filer (string or int). Surrounding
            whitespace is stripped and the value is percent-encoded, so stray
            characters typed at the prompt cannot corrupt or extend the
            query string.

    Returns:
        The full ``browse-edgar`` URL as a string. For a plain numeric CIK the
        result is identical to simple string interpolation.
    """
    # Function-scope import so the block does not depend on a file-level
    # import being added alongside it.
    from urllib.parse import urlencode

    # A list of pairs (not a dict) pins the parameter order in the output.
    query = urlencode([
        ('CIK', str(cik).strip()),
        ('owner', 'exclude'),
        ('action', 'getcompany'),
        ('type', '13F-HR'),
    ])
    return 'https://www.sec.gov/cgi-bin/browse-edgar?' + query
def get_user_input():
    """Prompt on the console for a CIK number and return the text entered.

    The value is returned exactly as typed; no validation is performed.
    """
    return input("Enter 10-digit CIK number: ")
# Script body: look up a filer on EDGAR by CIK, open the first 13F-HR filing
# listed for it, and write the issuer names found in the filing's XML to
# "<CIK>.tsv". Each step exits with a readable message instead of a bare
# IndexError/AttributeError when the expected page structure is missing.
requested_cik = get_user_input()

# Find mutual fund by CIK number on EDGAR
response = get_request(create_url(requested_cik))
response.raise_for_status()  # fail early on HTTP errors rather than parsing an error page
soup = BeautifulSoup(response.text, "html.parser")
tags = soup.find_all('a', id="documentsbutton")
if not tags:
    raise SystemExit(
        'No 13F-HR filings found for CIK {}'.format(requested_cik))

# Find latest 13F report for mutual fund
# (presumably EDGAR lists filings newest-first, so the first button is the
# latest -- confirm against the live page)
response_two = get_request(sec_url + tags[0]['href'])
response_two.raise_for_status()
soup_two = BeautifulSoup(response_two.text, "html.parser")
tags_two = soup_two.find_all('a', attrs={'href': re.compile('xml')})
# NOTE(review): the fourth XML link is used, as in the original code;
# presumably that is the holdings information table -- confirm.
if len(tags_two) < 4:
    raise SystemExit(
        'Could not locate the holdings XML document for CIK {}'.format(
            requested_cik))
xml_url = tags_two[3].get('href')
response_xml = get_request(sec_url + xml_url)
response_xml.raise_for_status()
soup_xml = BeautifulSoup(response_xml.content, "lxml")

# Find all issuers
# The lxml HTML parser normally wraps content in <body>; guard against an
# empty document where no body element is produced.
xml_body = soup_xml.body
issuers = (xml_body.find_all(re.compile('nameofissuer'))
           if xml_body is not None else [])
for issuer in issuers:
    print(issuer.text)

# Write issuer names to TSV file
# newline='' lets the csv module control line endings (avoids blank rows on
# Windows); utf-8 keeps non-ASCII issuer names from failing on platforms
# whose default encoding cannot represent them.
with open('{}.tsv'.format(requested_cik), 'w',
          newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerows([issuer.text] for issuer in issuers)