Skip to content

Commit

Permalink
add in final match between ex 21 subs and eia utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
katie-lamb committed Dec 18, 2024
1 parent f4cceb7 commit fa9e52e
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 11 deletions.
17 changes: 6 additions & 11 deletions notebooks/18-kl-splink-sec-eia.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,18 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "1107fe42-197c-4fea-9c48-06d08699af0b",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n",
"from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n",
"from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n",
"import splink.comparison_library as cl\n",
"import splink.comparison_level_library as cll\n",
"from splink.exploratory import completeness_chart, profile_columns\n",
"from upath import UPath\n",
"\n",
"from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n",
" BLOCKING_RULES,\n",
Expand Down Expand Up @@ -61,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb",
"metadata": {},
"outputs": [],
Expand All @@ -71,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -247,7 +242,7 @@
"2 2 1001 ebenezer church solar limited liability c... 176 ebenezer church rd 63186 8567.0 1001 ebenezer church solar, llc 2020-01-01 state road nc 28676 True None None None Q None None None None None None None None None None None None None final 2020 1001 ebenezer church solar EBNSR XRX SLR"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -258,7 +253,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3",
"metadata": {},
"outputs": [
Expand All @@ -268,7 +263,7 @@
"20821"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
114 changes: 114 additions & 0 deletions notebooks/20-kl-validate-sec-output-table.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,120 @@
"from upath import UPath"
]
},
{
"cell_type": "markdown",
"id": "511b2c77-ebd2-43b0-8e45-1d1c76fb321d",
"metadata": {},
"source": [
"### EIA"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4907820f-2552-4a3b-866a-30c3181af91b",
"metadata": {},
"outputs": [],
"source": [
"eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
]
},
{
"cell_type": "markdown",
"id": "5f488f86-4b34-4a94-985f-588f991ba86b",
"metadata": {},
"source": [
"### Ex. 21"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c1795acc-8005-4b6d-be4d-27c722b634f1",
"metadata": {},
"outputs": [],
"source": [
"ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "304d929b-ce6c-4508-b511-475f287a6b37",
"metadata": {},
"outputs": [],
"source": [
"merged_df = ex21_df.merge(\n",
" eia_df.drop_duplicates(subset=\"company_name\")[[\"company_name\", \"utility_id_eia\"]], how=\"left\", on=\"company_name\", suffixes=(\"_ex21\", \"_eia\")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d315f8d5-7166-4161-bc4e-79c45ed3ad59",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1055987, 20821)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(ex21_df), len(eia_df)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "3aae6d2c-a941-478e-8178-84cf1321e0b3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"utility_id_eia\n",
"True 1050887\n",
"False 5100\n",
"Name: count, dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df.utility_id_eia.isnull().value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "6aba0ae8-a8ee-47ef-8eb9-a0ef9f283b51",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1675"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(merged_df.utility_id_eia.unique())"
]
},
{
"cell_type": "markdown",
"id": "8d178634-b494-4769-93e3-c0213e4a0326",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,11 +339,13 @@ def core_sec_10k__filers(
ins={
"sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
"clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
"clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"),
},
)
def out_sec_10k__parents_and_subsidiaries(
sec_10k_filers_matched_df: pd.DataFrame,
clean_ex21_df: pd.DataFrame,
clean_eia_df: pd.DataFrame,
) -> pd.DataFrame:
"""Asset for creating an SEC 10K output table.
Expand All @@ -364,6 +366,21 @@ def out_sec_10k__parents_and_subsidiaries(
ex21_df_with_cik["central_index_key"].isnull()
]
ex21_non_filing_subs_df.loc[:, "files_10k"] = False
# The last step is to take the EIA utilities that haven't been matched
# to a filer company and merge them by company name onto the Ex. 21 subs.
matched_utility_ids = sec_10k_filers_matched_df["utility_id_eia"].unique()
# Keep a single row per company name so the left merge below cannot fan out
# (duplicate) Ex. 21 subsidiary rows that share a name with several EIA rows.
unmatched_eia_df = clean_eia_df[
    ~clean_eia_df["utility_id_eia"].isin(matched_utility_ids)
].drop_duplicates(subset="company_name")
ex21_non_filing_subs_df = ex21_non_filing_subs_df.merge(
    unmatched_eia_df[["utility_id_eia", "company_name"]],
    how="left",
    on="company_name",
)
# nunique() ignores NaN, so subsidiaries left unmatched by the merge are not
# counted as an extra "utility". Computing the count outside the f-string also
# avoids reusing double quotes inside it, which is a SyntaxError before
# Python 3.12.
n_matched_utilities = ex21_non_filing_subs_df["utility_id_eia"].nunique()
logger.info(
    f"Ex. 21 subsidiary names matched to an EIA utility name: {n_matched_utilities}"
)
out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df])
return out_df

Expand Down

0 comments on commit fa9e52e

Please sign in to comment.