diff --git a/AUTHORS.rst b/AUTHORS.rst index 089df49..dd06919 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -5,7 +5,7 @@ Credits Development Lead ---------------- -* Ryan S. McCoy <18177650+ryansmccoy@users.noreply.github.com> +* Ryan S. McCoy Contributors ------------ diff --git a/README.rst b/README.rst index 4fb5008..62be293 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,9 @@ Python SEC Edgar ================ -A Python application used to download and parse complete submission filings from the sec.gov/edgar website. The goal for this project is to make it easy to get filings from the SEC website onto your computer for the companies and forms you desire. A few hurdles that I've tried to ease with this project: +A Python application used to download and parse complete submission filings from the sec.gov/edgar website. The goal for this project is to make it easy to get filings from the SEC website onto your computer for the companies and forms you desire. + +A few hurdles that I've tried to ease with this project: * CIK to Ticker Equivalent - probably the biggest hurdle is just figuring out the CIK for the company you want. I've tried to bypass this via a reference file mapping CIK to tickers. I'm sure there is a better way, but for now it seems to work. 
* Organizing the Data - I decided to keep it simple and organize the data similar to the SEC Edgar website (which is explained below) @@ -13,7 +15,7 @@ Features * Extract contents of Complete Submission Filing Quick Start Guide ------------------ +-------------------- Setup Environment (Windows) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -22,7 +24,7 @@ Setup Environment (Windows) git clone https://github.com/ryansmccoy/py-sec-edgar.git cd py-sec-edgar - conda create -n py-sec-edgar python=3.6 pandas numpy lxml -y + conda create -n py-sec-edgar python=3.8 pandas numpy lxml -y activate py-sec-edgar pip install -r requirements.txt @@ -33,7 +35,7 @@ Setup Environment (Linux): git clone https://github.com/ryansmccoy/py-sec-edgar.git cd py-sec-edgar - conda create -n py-sec-edgar python=3.6 pandas numpy lxml -y + conda create -n py-sec-edgar python=3.8 pandas numpy lxml -y source activate py-sec-edgar sudo mkdir /sec_gov sudo chown -R $USER:$USER /sec_gov @@ -42,12 +44,24 @@ Setup Environment (Linux): Configure Settings (Optional) ------------------------------- + # py-sec-edgar/py_sec_edgar/settings.py + +Extracting Contents from Complete Submission Filing: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + # extract all contents from txt file + # Set this to True and everything will be extracted from Complete Submission Filing + # Note: There is a lot of content in these filings, so be prepared + + extract_filing_contents = False + Specify Form Types, Start, and End Dates: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :: - # py-sec-edgar/py_sec_edgar/settings.py # complete list @ py-sec-edgar/refdata/filing_types.xlsx @@ -58,7 +72,7 @@ Specify Form Types, Start, and End Dates: # below just says download all of them start_date = "1/1/2018" - end_date = "1/1/2020" + end_date = "1/1/2025" Specify Tickers: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,13 +83,12 @@ Specify Tickers: AAPL MSFT - BRK.B XOM GOOGL WFC -Run Example +Run 
Application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: console @@ -83,7 +96,7 @@ Run Example $ cd py-sec-edgar $ python py_sec_edgar -OR +The above is the same as running (see the notes at the top of the __main__.py file for an explanation): .. code-block:: console @@ -228,6 +241,13 @@ local folder equivalent: c:\sec_gov\Archives\edgar\data\\\.txt <- follow this format +Why download the Complete Submission Filing? +---------------------------------------------- + +* Most Efficient and Courteous way of getting data from SEC website + * Contains everything the company submitted for a filing in one file + * Not making multiple download requests per filing + Central Index Key (CIK) ----------------------- @@ -274,7 +294,7 @@ Todo - Make Full-Index more efficient - Incorporate RSS Feed -- Add Celery +- Add Multi-Threading - need to figure out way to quickly access downloaded content - extract earnings data from 8-K - setup proper logging instead of print diff --git a/py_sec_edgar/__init__.py b/py_sec_edgar/__init__.py index 7850d1e..6e3afc3 100644 --- a/py_sec_edgar/__init__.py +++ b/py_sec_edgar/__init__.py @@ -1,6 +1,14 @@ # -*- coding: utf-8 -*- """Top-level package for Python SEC Edgar Data.""" + +__author__ = """Ryan S. McCoy""" +__email__ = 'github@ryansmccoy.com' +__version__ = '0.1.0' +import logging + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + import os import time from datetime import datetime, timedelta @@ -13,12 +21,6 @@ pd.set_option('display.max_rows', 100) pd.set_option('display.width', 600) -__author__ = """Ryan S. 
McCoy""" -__email__ = '18177650+ryansmccoy@users.noreply.github.com' -__version__ = '0.1.0' - -import logging -logging.getLogger(__name__).addHandler(logging.NullHandler()) header_list = ["ACCESSION NUMBER", "CONFORMED SUBMISSION TYPE", "PUBLIC DOCUMENT COUNT", "CONFORMED PERIOD OF REPORT", "FILED AS OF DATE", "DATE AS OF CHANGE", "FILER", "COMPANY data", @@ -109,9 +111,9 @@ class Folders: DAILY_INDEX_DIR = os.path.join(EDGAR_DIR, "daily-index") FILING_DIR = os.path.join(EDGAR_DIR, "filings") - MERGED_IDX_FILE = os.path.join(REF_DIR, 'merged_idx_files.pq') - TICKER_LIST = os.path.join(REF_DIR, "tickers.csv") - TICKER_CIK = os.path.join(REF_DIR, "cik_tickers.csv") + MERGED_IDX_FILEPATH = os.path.join(REF_DIR, 'merged_idx_files.pq') + TICKER_LIST_FILEPATH = os.path.join(REF_DIR, "tickers.csv") + TICKER_CIK_FILEPATH = os.path.join(REF_DIR, "cik_tickers.csv") # used as template TXT_FILING_DIR = os.path.join(EDGAR_DIR, "data", "CIK", "FOLDER") @@ -119,10 +121,12 @@ class Folders: dirs_all = [SEC_DIR, DATA_DIR, EDGAR_DIR, MONTHLY_DIR, FULL_INDEX_DIR, DAILY_INDEX_DIR] + print("Checking for Output Folders") + for _ in dirs_all: if not os.path.exists(_): - print("{} Doesn't Exists".format(_)) - print("Creating Directory {}".format(SEC_DIR)) + print(f"{_} Doesn't Exists") + print(f"Creating Directory {SEC_DIR}") try: os.makedirs(_) except: diff --git a/py_sec_edgar/__main__.py b/py_sec_edgar/__main__.py index 0431141..2c83490 100644 --- a/py_sec_edgar/__main__.py +++ b/py_sec_edgar/__main__.py @@ -9,39 +9,72 @@ - https://docs.python.org/2/using/cmdline.html#cmdoption-m - https://docs.python.org/3/using/cmdline.html#cmdoption-m """ +import logging + +logger = logging.getLogger(__name__) import os import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from pprint import pprint +from urllib.parse import urljoin import click - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import pandas as pd +import 
def main(CONFIG=None):
    """Download the filing index and process every filing selected by the settings.

    The function is intentionally a plain callable rather than a
    ``@click.command()``: it declares no Click options, and a Click command
    cannot receive ``CONFIG`` positionally -- ``main(CONFIG)`` would hand the
    config object to Click as its ``args`` list, while the console-script
    entry point (which calls ``main()`` with no arguments) would invoke the
    callback without its required parameter.

    :param CONFIG: settings object exposing the file paths
        (``TICKER_CIK_FILEPATH``, ``MERGED_IDX_FILEPATH``,
        ``TICKER_LIST_FILEPATH``), the filter switches
        (``ticker_list_filter``, ``form_list_filter``), ``forms_list`` and
        ``edgar_Archives_url``.  When omitted, the package-level
        ``py_sec_edgar.settings.CONFIG`` is used.
    :return: ``0`` on completion, suitable as a process exit status.
    """
    if CONFIG is None:
        # Imported lazily so that the console-script entry point, which passes
        # no arguments, still gets the default settings.
        from py_sec_edgar.settings import CONFIG

    # Downloads the list of filings on the SEC Edgar website
    py_sec_edgar_feeds.update_full_index_feed(skip_if_exists=True)

    # IDX files contain the URLs of the filings, newest first
    df_merged_idx = (
        pq.read_table(CONFIG.MERGED_IDX_FILEPATH)
        .to_pandas()
        .sort_values("Date Filed", ascending=False)
    )

    # If forms were specified in py-sec-edgar/py_sec_edgar/settings.py,
    # keep only the index rows for those form types
    if CONFIG.form_list_filter:
        logger.info('\n\n\n\tLoading Forms Filter\n\n\n')
        df_merged_idx = df_merged_idx[df_merged_idx['Form Type'].isin(CONFIG.forms_list)]

    # If tickers were specified, map them to CIKs and keep only those
    # companies.  The switch is tested once, by truthiness, so the ticker
    # narrowing and the CIK filtering can never disagree with each other.
    if CONFIG.ticker_list_filter:
        # Reference table used to convert tickers to CIKs
        df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK_FILEPATH)
        ticker_list = pd.read_csv(CONFIG.TICKER_LIST_FILEPATH, header=None).iloc[:, 0].tolist()
        df_cik_tickers = df_cik_tickers[df_cik_tickers['SYMBOL'].isin(ticker_list)]

        cik_list = cik_column_to_list(df_cik_tickers)
        df_merged_idx = df_merged_idx[df_merged_idx['CIK'].isin(cik_list)]

    # Add a column holding the full download URL of every filing
    archive_root = CONFIG.edgar_Archives_url
    df_filings = df_merged_idx.assign(
        url=df_merged_idx['Filename'].apply(lambda filename: urljoin(archive_root, filename))
    )

    # Initialize the Broker which will oversee the Extraction process
    filing_broker = FilingBroker(CONFIG)

    for _, sec_filing in df_filings.iterrows():
        pprint(str(sec_filing))
        filing_broker.process(sec_filing)

    return 0
+ Sets up the filepaths for the filing :param feed_item: :return: feed_item: @@ -42,13 +41,25 @@ def prepare_message(self, sec_filing): return feed_item - def process_filing(self, sec_filing): + def process(self, sec_filing): """ Manages the individual filing extraction process """ - broker_message = self.prepare_message(sec_filing) + filing_json = self.pre_process(sec_filing) + + filing_json = self.download_filing(filing_json) + + filing_content = self.extract_filing(filing_json) - broker_message = self.download_filing(broker_message) + self.post_process(filing_content) + + def post_process(self, filing_contents): + """ + Insert Custom Processing here + + :param filing_contents: + :return: + """ + pass - filing_content = self.extract_filing(broker_message) diff --git a/py_sec_edgar/feeds.py b/py_sec_edgar/feeds.py index cb2ac7e..8aa7162 100644 --- a/py_sec_edgar/feeds.py +++ b/py_sec_edgar/feeds.py @@ -21,15 +21,15 @@ def load_filings_feed(ticker_list_filter=True, form_list_filter=True): - df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK) + df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK_FILEPATH) logging.info('\n\n\n\tLoaded IDX files\n\n\n') - df_merged_idx_filings = pq.read_table(CONFIG.MERGED_IDX_FILE).to_pandas().sort_values("Date Filed", ascending=False) - # df_merged_idx_filings = pd.read_csv(CONFIG.MERGED_IDX_FILE, index_col=0, dtype={"CIK": int}, encoding='latin-1') + df_merged_idx_filings = pq.read_table(CONFIG.MERGED_IDX_FILEPATH).to_pandas().sort_values("Date Filed", ascending=False) + # df_merged_idx_filings = pd.read_csv(CONFIG.MERGED_IDX_FILEPATH, index_col=0, dtype={"CIK": int}, encoding='latin-1') if ticker_list_filter: - ticker_list = pd.read_csv(CONFIG.TICKER_LIST, header=None).iloc[:, 0].tolist() + ticker_list = pd.read_csv(CONFIG.TICKER_LIST_FILEPATH, header=None).iloc[:, 0].tolist() df_cik_tickers = df_cik_tickers[df_cik_tickers['SYMBOL'].isin(ticker_list)] if form_list_filter: @@ -393,3 +393,11 @@ def parse_monthly(): # 
def cik_column_to_list(df):
    """Return the non-null values of the ``CIK`` column as a list of ints.

    The conversion is done on the column itself instead of assigning back
    into the frame returned by ``dropna``; that assignment is what triggers
    pandas' ``SettingWithCopyWarning``.  The input frame is never modified.

    :param df: DataFrame containing a ``CIK`` column (missing values allowed).
    :return: list of integer CIKs, in the original row order.
    :raises KeyError: if ``df`` has no ``CIK`` column.
    """
    return df['CIK'].dropna().astype(int).tolist()