More refactoring, added comments to main function, and updated for Python 3.8
ryansmccoy · Sep 9, 2020 · bc2f5a0 · bc2f5a0
1 parent 307b009
commit bc2f5a0
Show file tree

Hide file tree

Showing 9 changed files with 136 additions and 58 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -5,7 +5,7 @@ Credits
 Development Lead
 ----------------
 
-* Ryan S. McCoy <[email protected].github.com>
+* Ryan S. McCoy <github@ryansmccoy.com>
 
 Contributors
 ------------

diff --git a/README.rst b/README.rst
@@ -1,7 +1,9 @@
 Python SEC Edgar
 ================
 
-A Python application used to download and parse complete submission filings from the sec.gov/edgar website.  The goal for this project is to make it easy to get filings from the SEC website onto your computer for the companies and forms you desire.  A few hurdles that I've tried to ease with this project:
+A Python application used to download and parse complete submission filings from the sec.gov/edgar website.  The goal for this project is to make it easy to get filings from the SEC website onto your computer for the companies and forms you desire.
+
+A few hurdles that I've tried to ease with this project:
 
 * CIK to Ticker Equivalent - probably the biggest hurdle is just figuring out the CIK for the company you want.  I've tried to bypass this via a reference file mapping CIK to tickers.  I'm sure there is a better way, but for now it seems to work.
 * Organizing the Data - I decided to keep it simple and organize the data similar to the SEC Edgar website (which is explained below)
@@ -13,7 +15,7 @@ Features
 * Extract contents of Complete Submission Filing
 
 Quick Start Guide
------------------
+--------------------
 
 Setup Environment (Windows)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -22,7 +24,7 @@ Setup Environment (Windows)
 
    git clone https://github.com/ryansmccoy/py-sec-edgar.git
    cd py-sec-edgar
-   conda create -n py-sec-edgar python=3.6 pandas numpy lxml -y
+   conda create -n py-sec-edgar python=3.8 pandas numpy lxml -y
    activate py-sec-edgar
    pip install -r requirements.txt
 
@@ -33,7 +35,7 @@ Setup Environment (Linux):
 
    git clone https://github.com/ryansmccoy/py-sec-edgar.git
    cd py-sec-edgar
-   conda create -n py-sec-edgar python=3.6 pandas numpy lxml -y
+   conda create -n py-sec-edgar python=3.8 pandas numpy lxml -y
    source activate py-sec-edgar
    sudo mkdir /sec_gov
    sudo chown -R $USER:$USER /sec_gov
@@ -42,12 +44,24 @@ Setup Environment (Linux):
 Configure Settings (Optional)
 -------------------------------
 
+    # py-sec-edgar/py_sec_edgar/settings.py
+
+Extracting Contents from Complete Submission Filing:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+    # extract all contents from txt file
+    # Set this to True and everything will be extracted from Complete Submission Filing
+    # Note:  There is a lot of content in these filings, so be prepared
+
+    extract_filing_contents = False
+
 Specify Form Types, Start, and End Dates:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 ::
 
-   # py-sec-edgar/py_sec_edgar/settings.py
 
    # complete list @ py-sec-edgar/refdata/filing_types.xlsx
 
@@ -58,7 +72,7 @@ Specify Form Types, Start, and End Dates:
    # below just says download all of them
 
    start_date = "1/1/2018"
-   end_date = "1/1/2020"
+   end_date = "1/1/2025"
 
 Specify Tickers:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,21 +83,20 @@ Specify Tickers:
 
    AAPL
    MSFT
-   BRK.B
    XOM
    GOOGL
    WFC
 
 
-Run Example
+Run Application
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: console
 
     $ cd py-sec-edgar
     $ python py_sec_edgar
 
-OR
+The above is the same as running (see the notes at the top of the __main__.py file for an explanation):
 
 .. code-block:: console
 
@@ -228,6 +241,13 @@ local folder equivalent:
 
     c:\sec_gov\Archives\edgar\data\<CIK>\<ACCESSION_NUMBER_WITHOUT_DASHES>\<ACCESSION_NUMBER>.txt <-  follow this format
 
+Why download the Complete Submission Filing?
+----------------------------------------------
+
+* Most efficient and courteous way of getting data from the SEC website
+    * Contains everything the company submitted for the filing in one file
+    * Avoids making multiple download requests per filing
+
 Central Index Key (CIK)
 -----------------------
 
@@ -274,7 +294,7 @@ Todo
    -  Make Full-Index more efficient
    -  Incorporate RSS Feed
 
--  Add Celery
+-  Add Multi-Threading
 -  need to figure out way to quickly access downloaded content
 -  extract earnings data from 8-K
 -  setup proper logging instead of print

diff --git a/py_sec_edgar/__init__.py b/py_sec_edgar/__init__.py
@@ -1,6 +1,14 @@
 # -*- coding: utf-8 -*-
 
 """Top-level package for Python SEC Edgar Data."""
+
+__author__ = """Ryan S. McCoy"""
+__email__ = '[email protected]'
+__version__ = '0.1.0'
+import logging
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
 import os
 import time
 from datetime import datetime, timedelta
@@ -13,12 +21,6 @@
 pd.set_option('display.max_rows', 100)
 pd.set_option('display.width', 600)
 
-__author__ = """Ryan S. McCoy"""
-__email__ = '[email protected]'
-__version__ = '0.1.0'
-
-import logging
-logging.getLogger(__name__).addHandler(logging.NullHandler())
 
 header_list = ["ACCESSION NUMBER", "CONFORMED SUBMISSION TYPE", "PUBLIC DOCUMENT COUNT",
                "CONFORMED PERIOD OF REPORT", "FILED AS OF DATE", "DATE AS OF CHANGE", "FILER", "COMPANY data",
@@ -109,20 +111,22 @@ class Folders:
     DAILY_INDEX_DIR = os.path.join(EDGAR_DIR, "daily-index")
     FILING_DIR = os.path.join(EDGAR_DIR, "filings")
 
-    MERGED_IDX_FILE = os.path.join(REF_DIR, 'merged_idx_files.pq')
-    TICKER_LIST = os.path.join(REF_DIR, "tickers.csv")
-    TICKER_CIK = os.path.join(REF_DIR, "cik_tickers.csv")
+    MERGED_IDX_FILEPATH = os.path.join(REF_DIR, 'merged_idx_files.pq')
+    TICKER_LIST_FILEPATH = os.path.join(REF_DIR, "tickers.csv")
+    TICKER_CIK_FILEPATH = os.path.join(REF_DIR, "cik_tickers.csv")
 
     # used as template
     TXT_FILING_DIR = os.path.join(EDGAR_DIR, "data", "CIK", "FOLDER")
 
     dirs_all = [SEC_DIR, DATA_DIR, EDGAR_DIR,
                 MONTHLY_DIR, FULL_INDEX_DIR, DAILY_INDEX_DIR]
 
+    print("Checking for Output Folders")
+
     for _ in dirs_all:
         if not os.path.exists(_):
-            print("{} Doesn't Exists".format(_))
-            print("Creating Directory {}".format(SEC_DIR))
+            print(f"{_} Doesn't Exists")
+            print(f"Creating Directory {SEC_DIR}")
             try:
                 os.makedirs(_)
             except:

diff --git a/py_sec_edgar/__main__.py b/py_sec_edgar/__main__.py
@@ -9,39 +9,72 @@
 - https://docs.python.org/2/using/cmdline.html#cmdoption-m
 - https://docs.python.org/3/using/cmdline.html#cmdoption-m
 """
+import logging
+
+logger = logging.getLogger(__name__)
 
 import os
 import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from pprint import pprint
+from urllib.parse import urljoin
 
 import click
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import pandas as pd
+import pyarrow.parquet as pq
 
 import py_sec_edgar.feeds as py_sec_edgar_feeds
-from py_sec_edgar.broker import BrokerManager
-from py_sec_edgar.settings import CONFIG
+from py_sec_edgar.broker import FilingBroker
+from py_sec_edgar.feeds import cik_column_to_list
 
 @click.command()
-@click.option('--ticker_list_filter', default=True)
-@click.option('--form_list_filter', default=True)
-@click.option('--save_output', default=False)
-def main(ticker_list_filter, form_list_filter, save_output):
+def main(CONFIG):
+    # Entry point: refreshes the full-index feed, filters the merged index by
+    # ticker and form type according to CONFIG, then hands every remaining
+    # filing to a FilingBroker. Returns 0 after the loop finishes.
+    #
+    # NOTE(review): this function is wrapped by @click.command() but takes a
+    # CONFIG parameter that has no matching click option/argument, and it is
+    # invoked below as main(CONFIG) -- confirm click passes CONFIG through as
+    # intended.
 
+    # Downloads the list of filings on the SEC Edgar website
     py_sec_edgar_feeds.update_full_index_feed(skip_if_exists=True)
 
-    # ticker_list_filer is set in refdata/tickers.csv
-    # form_list_filter is specified in the settings file
-    df_filings_idx = py_sec_edgar_feeds.load_filings_feed(ticker_list_filter=ticker_list_filter, form_list_filter=form_list_filter)
+    # Reference table used to map tickers (SYMBOL column) to CIK numbers
+    df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK_FILEPATH)
+
+    # IDX Files contain URLs to the Filings, so we need them
+    # (sorted by "Date Filed", newest first)
+    df_merged_idx = pq.read_table(CONFIG.MERGED_IDX_FILEPATH).to_pandas().sort_values("Date Filed", ascending=False)
 
-    filing_broker = BrokerManager(CONFIG)
+    # If ticker filtering is enabled (CONFIG.ticker_list_filter), load the
+    # ticker list file and keep only the companies whose SYMBOL is listed
+    if CONFIG.ticker_list_filter is True:
+        ticker_list = pd.read_csv(CONFIG.TICKER_LIST_FILEPATH, header=None).iloc[:, 0].tolist()
+        df_cik_tickers = df_cik_tickers[df_cik_tickers['SYMBOL'].isin(ticker_list)]
 
-    for i, sec_filing in df_filings_idx.iterrows():
+    # If form filtering is enabled (CONFIG.form_list_filter), keep only the
+    # index rows whose "Form Type" is in CONFIG.forms_list
+    if CONFIG.form_list_filter is True:
+        logging.info('\n\n\n\tLoading Forms Filter\n\n\n')
+        df_merged_idx = df_merged_idx[df_merged_idx['Form Type'].isin(CONFIG.forms_list)]
+
+    # Collect the CIK numbers of the remaining companies as a plain list
+    cik_list = cik_column_to_list(df_cik_tickers)
+
+    # Restrict the filings index to those CIKs.
+    # NOTE(review): this check is truthiness-based while the one above uses
+    # "is True" -- the two differ for truthy non-bool values; confirm intent.
+    if CONFIG.ticker_list_filter:
+        df_merged_idx = df_merged_idx[df_merged_idx['CIK'].isin(cik_list)]
+
+    # Add a "url" column by joining each row's Filename onto
+    # CONFIG.edgar_Archives_url
+    df_filings = df_merged_idx.assign(url=df_merged_idx['Filename'].apply(lambda x: urljoin(CONFIG.edgar_Archives_url, x)))
+
+    # Initialize the Broker which will oversee the Extraction process
+    filing_broker = FilingBroker(CONFIG)
+
+    # Process the filings one at a time, printing each row first
+    for i, sec_filing in df_filings.iterrows():
 
         pprint(str(sec_filing))
 
-        filing_broker.process_filing(sec_filing)
+        filing_broker.process(sec_filing)
+
+    return 0
 
 if __name__ == "__main__":
 
-    main()
+    from py_sec_edgar.settings import CONFIG
+
+    main(CONFIG)
diff --git a/py_sec_edgar/broker.py b/py_sec_edgar/broker.py
@@ -6,24 +6,23 @@
 
 from py_sec_edgar.filing import download_filing, extract_filing
 
-class BrokerManager:
+class FilingBroker:
 
-    def __init__(self, CONFIG=None):
+    def __init__(self, CONFIG):
+        # Stores the given configuration object and binds the module-level
+        # download_filing / extract_filing functions as instance attributes.
 
-        if CONFIG == None:
-            from py_sec_edgar.settings import CONFIG
+        logger.info("Initalizing Broker...")
 
         self.CONFIG = CONFIG
-        self.extract_filing_contents = CONFIG.extract_filing_contents
+        # Sets the flag on the passed-in CONFIG object itself (no copy is made),
+        # so the caller's object sees this attribute too
+        self.CONFIG.BEGIN_PROCESS_FILINGS = True
 
         self.download_filing = download_filing
         self.extract_filing = extract_filing
 
-        logger.info("Initalizing Broker...")
+        # Counter starts at zero; nothing in the code shown here increments it
+        self.filings_processed = 0
 
-    def prepare_message(self, sec_filing):
+    def pre_process(self, sec_filing):
         """
-        Sets parameters needed for various aspects.
+        Sets up the filepaths for the filing
 
         :param feed_item:
         :return: feed_item:
@@ -42,13 +41,25 @@ def prepare_message(self, sec_filing):
 
         return feed_item
 
-    def process_filing(self, sec_filing):
+    def process(self, sec_filing):
         """
         Manages the individual filing extraction process
         """
 
-        broker_message = self.prepare_message(sec_filing)
+        # Step 1: build the per-filing record from the index row
+        filing_json = self.pre_process(sec_filing)
+
+        # Step 2: download step; its return value replaces the record
+        filing_json = self.download_filing(filing_json)
+
+        # Step 3: extraction step, run on the record returned by the download
+        filing_content = self.extract_filing(filing_json)
 
-        broker_message = self.download_filing(broker_message)
+        # Step 4: hand the extraction result to the customisable hook
+        self.post_process(filing_content)
+
+    def post_process(self, filing_contents):
+        """
+        Hook for custom processing of an extracted filing.
+
+        The default implementation does nothing; insert custom processing here.
+
+        :param filing_contents: the value returned by ``self.extract_filing``,
+            as passed in by ``process``
+        :return: None
+        """
+        pass
 
-        filing_content = self.extract_filing(broker_message)
diff --git a/py_sec_edgar/feeds.py b/py_sec_edgar/feeds.py
@@ -21,15 +21,15 @@
 
 def load_filings_feed(ticker_list_filter=True, form_list_filter=True):
 
-    df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK)
+    df_cik_tickers = pd.read_csv(CONFIG.TICKER_CIK_FILEPATH)
 
     logging.info('\n\n\n\tLoaded IDX files\n\n\n')
 
-    df_merged_idx_filings = pq.read_table(CONFIG.MERGED_IDX_FILE).to_pandas().sort_values("Date Filed", ascending=False)
-    # df_merged_idx_filings = pd.read_csv(CONFIG.MERGED_IDX_FILE, index_col=0,  dtype={"CIK": int}, encoding='latin-1')
+    df_merged_idx_filings = pq.read_table(CONFIG.MERGED_IDX_FILEPATH).to_pandas().sort_values("Date Filed", ascending=False)
+    # df_merged_idx_filings = pd.read_csv(CONFIG.MERGED_IDX_FILEPATH, index_col=0,  dtype={"CIK": int}, encoding='latin-1')
 
     if ticker_list_filter:
-        ticker_list = pd.read_csv(CONFIG.TICKER_LIST, header=None).iloc[:, 0].tolist()
+        ticker_list = pd.read_csv(CONFIG.TICKER_LIST_FILEPATH, header=None).iloc[:, 0].tolist()
         df_cik_tickers = df_cik_tickers[df_cik_tickers['SYMBOL'].isin(ticker_list)]
 
     if form_list_filter:
@@ -393,3 +393,11 @@ def parse_monthly():
                         # consume_complete_submission_filing.delay(basename, item, ticker)
                         logging.info('yes')
 
+
+def cik_column_to_list(df):
+    """
+    Return the values of the ``CIK`` column as a list of ints.
+
+    Rows whose ``CIK`` is missing (NaN/None) are skipped. The conversion works
+    on the extracted column rather than assigning back into the result of
+    ``df.dropna(...)``, which avoids pandas' ``SettingWithCopyWarning`` and
+    never touches the caller's dataframe.
+
+    :param df: dataframe containing a ``CIK`` column
+    :return: list of the non-missing CIK values converted to ``int``
+    :raises KeyError: if ``df`` has no ``CIK`` column
+    """
+    return df['CIK'].dropna().astype(int).tolist()
diff --git a/py_sec_edgar/settings.py b/py_sec_edgar/settings.py
@@ -26,6 +26,7 @@ class Config(Folders):
     # if going to use proxy, please only download on the weekends
     VPN_PROVIDER = "PP"
 
+    TEST_MODE = False
 
 
 CONFIG = Config()
diff --git a/readthedocs.yml b/readthedocs.yml
@@ -18,7 +18,7 @@ formats: all
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
+  version: 3.8
   install:
       - requirements: requirements.txt
       - method: pip