From 6d711ba63b708c856fc22d36d9085941da7e4b27 Mon Sep 17 00:00:00 2001 From: Gary Pang Date: Thu, 18 Jul 2019 17:37:40 -0400 Subject: [PATCH] Init commit --- Pipfile | 14 +++++++ Pipfile.lock | 103 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 24 +++++++++++ requirements.txt | 3 ++ scraper.py | 44 ++++++++++++++++++++ 5 files changed, 188 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 scraper.py diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..4e475b1 --- /dev/null +++ b/Pipfile @@ -0,0 +1,14 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +requests = "*" +beautifulsoup4 = "*" +lxml = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..39a4ea7 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,103 @@ +{ + "_meta": { + "hash": { + "sha256": "b4132b4e5879e89d02d9b476d89df9ef85ea6659012e068b23e04b55d0bc31da" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "beautifulsoup4": { + "hashes": [ + "sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", + "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", + "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718" + ], + "index": "pypi", + "version": "==4.7.1" + }, + "certifi": { + "hashes": [ + "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", + "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" + ], + "version": "==2019.6.16" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 
+ ], + "version": "==3.0.4" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "version": "==2.8" + }, + "lxml": { + "hashes": [ + "sha256:06c7616601430aa140a69f97e3116308fffe0848f543b639a5ec2e8920ae72fd", + "sha256:177202792f9842374a8077735c69c41a4282183f7851443d2beb8ee310720819", + "sha256:19317ad721ceb9e39847d11131903931e2794e447d4751ebb0d9236f1b349ff2", + "sha256:36d206e62f3e5dbaafd4ec692b67157e271f5da7fd925fda8515da675eace50d", + "sha256:387115b066c797c85f9861a9613abf50046a15aac16759bc92d04f94acfad082", + "sha256:3ce1c49d4b4a7bc75fb12acb3a6247bb7a91fe420542e6d671ba9187d12a12c2", + "sha256:4d2a5a7d6b0dbb8c37dab66a8ce09a8761409c044017721c21718659fa3365a1", + "sha256:58d0a1b33364d1253a88d18df6c0b2676a1746d27c969dc9e32d143a3701dda5", + "sha256:62a651c618b846b88fdcae0533ec23f185bb322d6c1845733f3123e8980c1d1b", + "sha256:69ff21064e7debc9b1b1e2eee8c2d686d042d4257186d70b338206a80c5bc5ea", + "sha256:7060453eba9ba59d821625c6af6a266bd68277dce6577f754d1eb9116c094266", + "sha256:7d26b36a9c4bce53b9cfe42e67849ae3c5c23558bc08363e53ffd6d94f4ff4d2", + "sha256:83b427ad2bfa0b9705e02a83d8d607d2c2f01889eb138168e462a3a052c42368", + "sha256:923d03c84534078386cf50193057aae98fa94cace8ea7580b74754493fda73ad", + "sha256:b773715609649a1a180025213f67ffdeb5a4878c784293ada300ee95a1f3257b", + "sha256:baff149c174e9108d4a2fee192c496711be85534eab63adb122f93e70aa35431", + "sha256:bca9d118b1014b4c2d19319b10a3ebed508ff649396ce1855e1c96528d9b2fa9", + "sha256:ce580c28845581535dc6000fc7c35fdadf8bea7ccb57d6321b044508e9ba0685", + "sha256:d34923a569e70224d88e6682490e24c842907ba2c948c5fd26185413cbe0cd96", + "sha256:dd9f0e531a049d8b35ec5e6c68a37f1ba6ec3a591415e6804cbdf652793d15d7", + "sha256:ecb805cbfe9102f3fd3d2ef16dfe5ae9e2d7a7dfbba92f4ff1e16ac9784dbfb0", + "sha256:ede9aad2197a0202caff35d417b671f5f91a3631477441076082a17c94edd846", + 
"sha256:ef2d1fc370400e0aa755aab0b20cf4f1d0e934e7fd5244f3dd4869078e4942b9", + "sha256:f2fec194a49bfaef42a548ee657362af5c7a640da757f6f452a35da7dd9f923c" + ], + "index": "pypi", + "version": "==4.3.4" + }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, + "soupsieve": { + "hashes": [ + "sha256:72b5f1aea9101cf720a36bb2327ede866fd6f1a07b1e87c92a1cc18113cbc946", + "sha256:e4e9c053d59795e440163733a7fec6c5972210e1790c507e4c7b051d6c5259de" + ], + "version": "==1.9.2" + }, + "urllib3": { + "hashes": [ + "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", + "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + ], + "version": "==1.25.3" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..a12c729 --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# EDGAR Python Web Scraper +This repository contains Gary Pang's Python Web scraper for parsing fund holdings pulled from SEC website, [EDGAR](https://www.sec.gov/edgar/searchedgar/companysearch.html), and writing a .tsv file from the data. 
+ + +## Requirements + +#### Getting Started +- `pip install -r requirements.txt` (or `pipenv install` if you are using pipenv) +- `python scraper.py` (or `pipenv run python scraper.py`) +- When prompted, enter the 10-digit CIK number of a mutual fund + +#### Key Dependencies + +- [Requests](https://2.python-requests.org/en/master/), Python library for making HTTP requests +- [lxml](https://lxml.de/), Python library for processing XML and HTML +- [Beautiful Soup](https://pypi.org/project/beautifulsoup4/), Python library for scraping information from Web pages +- [re](https://docs.python.org/3/library/re.html), Python module for using regular expressions +- [csv](https://docs.python.org/3/library/csv.html), Python module for parsing and writing CSV and TSV files + +## Contributor +- [Gary Pang](https://github.com/CodeWritingCow) + +## References +- [SEC: Frequently Asked Questions About Form 13F](https://www.sec.gov/divisions/investment/13ffaq.htm) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..62e180c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.22.0 +lxml==4.3.4 +beautifulsoup4==4.7.1 \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..96b415e --- /dev/null +++ b/scraper.py @@ -0,0 +1,44 @@ +import requests +import re +import csv +import lxml +from bs4 import BeautifulSoup + +sec_url = 'https://www.sec.gov' + +def get_request(url): + return requests.get(url) + +def create_url(cik): + return 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany&type=13F-HR'.format(cik) + +def get_user_input(): + cik = input("Enter 10-digit CIK number: ") + return cik + +requested_cik = get_user_input() + +# Find mutual fund by CIK number on EDGAR +response = get_request(create_url(requested_cik)) +soup = BeautifulSoup(response.text, "html.parser") +tags = soup.findAll('a', id="documentsbutton") + +# Find latest 13F report 
# Continuation of the "Find latest 13F report for mutual fund" step: follow the
# first link with id="documentsbutton" found on the EDGAR search results page.
if not tags:
    # No such link was found (presumably an unknown CIK or a filer without
    # 13F-HR filings -- confirm against EDGAR). Exit with a readable message
    # instead of an IndexError on tags[0].
    raise SystemExit(
        'No filing documents link found for CIK {}'.format(requested_cik))

response_two = get_request(sec_url + tags[0]['href'])
# Stop on HTTP errors rather than parsing an error page as a filing index.
response_two.raise_for_status()
soup_two = BeautifulSoup(response_two.text, "html.parser")
tags_two = soup_two.find_all('a', attrs={'href': re.compile(r'xml')})

# The holdings are read from the fourth link whose href contains "xml".
# NOTE(review): this position is assumed to be the raw information table on
# the filing index page -- confirm against the current EDGAR layout.
if len(tags_two) < 4:
    raise SystemExit(
        'Expected at least 4 XML links on the filing index page, found {}'
        .format(len(tags_two)))
xml_url = tags_two[3].get('href')

response_xml = get_request(sec_url + xml_url)
response_xml.raise_for_status()
soup_xml = BeautifulSoup(response_xml.content, "lxml")

# Find all issuers
# Fall back to the whole document if the parser produced no <body> element,
# so an unexpected document shape yields an empty result, not AttributeError.
xml_root = soup_xml.body if soup_xml.body is not None else soup_xml
issuers = xml_root.find_all(re.compile(r'nameofissuer'))
for issuer in issuers:
    print(issuer.text)

# Write issuer names to TSV file
# newline='' lets the csv module control line endings (avoids blank rows on
# Windows); an explicit encoding keeps the output independent of the locale.
with open('{}.tsv'.format(requested_cik), 'w', newline='',
          encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerows([issuer.text] for issuer in issuers)