Skip to content

Commit

Permalink
Merge branch 'main' into add_new_ml_algs
Browse files Browse the repository at this point in the history
  • Loading branch information
riley-harper committed Dec 4, 2024
2 parents c5bf26e + 71c4fea commit 52d7721
Show file tree
Hide file tree
Showing 20 changed files with 229 additions and 10 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ on:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:

env:
HLINK_TAG: hlink:githubactions
Expand Down
1 change: 1 addition & 0 deletions docs/_sources/model_exploration.md.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Configuring Model Exploration
6 changes: 5 additions & 1 deletion docs/_sources/substitutions.md.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ You must supply a substitution file and either specify `regex_word_replace=true`

## 1:1 substitution by data table

Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.
Performs a 1:1 replacement on a filtered subset of the data table. If the
input column data equals a value in the second column of the substitution file,
it is replaced with the data in the first column of the substitution file.
Used to replace variant name forms with standardized name forms, filtering on
a column like sex which may affect common names.

* Attributes:
* `join_column` -- Type: `string`. Column to filter input data on.
Expand Down
1 change: 1 addition & 0 deletions docs/column_mappings.html
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/comparison_features.html
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/comparisons.html
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/config.html
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/feature_selection_transforms.html
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/installation.html
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/introduction.html
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/link_tasks.html
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
121 changes: 121 additions & 0 deletions docs/model_exploration.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
<!DOCTYPE html>

<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>Configuring Model Exploration &#8212; hlink 3.7.0 documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
<link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
<script src="_static/documentation_options.js?v=229cbe3b"></script>
<script src="_static/doctools.js?v=9bcbadda"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="prev" title="Models" href="models.html" />

<link rel="stylesheet" href="_static/custom.css" type="text/css" />





</head><body>


<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">


<div class="body" role="main">

<section id="configuring-model-exploration">
<h1>Configuring Model Exploration<a class="headerlink" href="#configuring-model-exploration" title="Link to this heading"></a></h1>
</section>


</div>

</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="Main">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="index.html">hlink</a></h1>









<search id="searchbox" style="display: none" role="search">
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
<input type="submit" value="Go" />
</form>
</div>
</search>
<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
<ul>
<li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="link_tasks.html">Link Tasks</a></li>
<li class="toctree-l1"><a class="reference internal" href="running_the_program.html">Running hlink</a></li>
<li class="toctree-l1"><a class="reference internal" href="use_examples.html">Advanced Workflows</a></li>
<li class="toctree-l1"><a class="reference internal" href="config.html">Configuration</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li>
<li class="toctree-l1"><a class="reference internal" href="comparisons.html">Comparisons</a></li>
<li class="toctree-l1"><a class="reference internal" href="comparison_features.html">Comparison Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Model Exploration</a></li>
</ul>

<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
<li>Previous: <a href="models.html" title="previous chapter">Models</a></li>
</ul></li>
</ul>
</div>








</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&#169;2019-2022, IPUMS.

|
Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
&amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>

|
<a href="_sources/model_exploration.md.txt"
rel="nofollow">Page source</a>
</div>




</body>
</html>
1 change: 1 addition & 0 deletions docs/pipeline_features.html
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
</li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
1 change: 1 addition & 0 deletions docs/running_the_program.html
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
2 changes: 1 addition & 1 deletion docs/searchindex.js

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion docs/substitutions.html
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@ <h1>Substitutions<a class="headerlink" href="#substitutions" title="Link to this
<p>You must supply a substitution file and either specify <code class="docutils literal notranslate"><span class="pre">regex_word_replace=true</span></code> or supply a join value.</p>
<section id="substitution-by-data-table">
<h2>1:1 substitution by data table<a class="headerlink" href="#substitution-by-data-table" title="Link to this heading"></a></h2>
<p>Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.</p>
<p>Performs a 1:1 replacement on a filtered subset of the data table. If the
input column data equals a value in the second column of the substitution file,
it is replaced with the data in the first column of the substitution file.
Used to replace variant name forms with standardized name forms, filtering on
a column like sex which may affect common names.</p>
<ul class="simple">
<li><p>Attributes:</p>
<ul>
Expand Down
1 change: 1 addition & 0 deletions docs/use_examples.html
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_exploration.html">Model Exploration</a></li>
</ul>

<div class="relations">
Expand Down
20 changes: 16 additions & 4 deletions hlink/linking/core/substitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
# https://github.com/ipums/hlink

from collections import namedtuple
from typing import Any

from pyspark import SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import concat_ws, lit, regexp_replace, split, when


def generate_substitutions(spark, df_selected, substitution_columns):
def generate_substitutions(
spark: SparkSession,
df_selected: DataFrame,
substitution_columns: list[dict[str, Any]],
) -> DataFrame:
for substitution_column in substitution_columns:
column_name = substitution_column["column_name"]
for substitution in substitution_column["substitutions"]:
Expand All @@ -29,7 +37,7 @@ def generate_substitutions(spark, df_selected, substitution_columns):
return df_selected


def _load_substitutions(file_name):
def _load_substitutions(file_name: str) -> tuple[list[str], list[str]]:
"""Reads in the substitution file and returns a 2-tuple representing it.
Parameters
Expand All @@ -51,7 +59,9 @@ def _load_substitutions(file_name):
return (sub_froms, sub_tos)


def _apply_substitution(df, column_name, substitution, sc):
def _apply_substitution(
df: DataFrame, column_name: str, substitution: dict[str, Any], sc: SparkContext
) -> DataFrame:
"""Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file."""
substitution_file = substitution["substitution_file"]
join_value = substitution["join_value"]
Expand Down Expand Up @@ -81,7 +91,9 @@ def _apply_substitution(df, column_name, substitution, sc):
return df_sub.select(df_sub_selects)


def _apply_regex_substitution(df, column_name, substitution, sc):
def _apply_regex_substitution(
df: DataFrame, column_name: str, substitution: dict[str, Any], sc: SparkContext
) -> DataFrame:
"""Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file."""

substitution_file = substitution["substitution_file"]
Expand Down
63 changes: 63 additions & 0 deletions hlink/tests/core/substitutions_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# This file is part of the ISRDI's hlink.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

from pathlib import Path

from pyspark.sql import Row, SparkSession

from hlink.linking.core.substitutions import generate_substitutions, _load_substitutions


def test_load_substitutions(tmp_path: Path) -> None:
    """Check the (froms, tos) pair that _load_substitutions() reads from a CSV.

    The second CSV column is expected to come back as the "from" values and
    the first column as the "to" values. The first line of the file is
    treated as data, not as a header.
    """
    csv_path = tmp_path / "substitutions.csv"
    csv_path.write_text("a,b\nto this,from this")

    froms, tos = _load_substitutions(str(csv_path))

    assert (froms, tos) == (["b", "from this"], ["a", "to this"])


def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None:
    """Check 1:1 name substitution restricted to rows where sex == 2.

    Each substitution file row is "<standard form>,<variant form>". Input
    names that match a variant form should be replaced with the standard
    form, but only on rows whose join_column ("sex") equals join_value (2).

    Parameters
    ----------
    spark:
        SparkSession fixture used to build the input DataFrame.
    tmp_path:
        pytest temporary directory in which the substitution file is written.
    """
    tmp_file = tmp_path / "substitutions.csv"
    tmp_file.write_text(
        "\n".join(
            [
                "rose,rosie",
                "sophia,sophy",
                "sophia,sofia",
                "amanda,mandy",
                "jane,jean",
            ]
        )
    )

    df = spark.createDataFrame(
        [("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)],
        schema=["first_name", "sex"],
    )

    substitution_columns = [
        {
            "column_name": "first_name",
            "substitutions": [
                {
                    "join_column": "sex",
                    "join_value": 2,
                    "substitution_file": str(tmp_file),
                }
            ],
        }
    ]

    subbed_df = generate_substitutions(spark, df, substitution_columns)
    rows = subbed_df.select("first_name", "sex").collect()

    expected = [
        Row(first_name="agnes", sex=2),
        Row(first_name="amanda", sex=2),
        Row(first_name="sophia", sex=2),
        Row(first_name="rose", sex=2),
        # Note that this name is not substituted because we join on sex=2
        Row(first_name="jean", sex=1),
    ]

    # The DataFrame is never explicitly ordered, so the order of collect()
    # is an implementation detail of the substitution plan. Compare the rows
    # order-insensitively (sorted() keeps duplicates, unlike a set).
    assert sorted(rows) == sorted(expected)
6 changes: 5 additions & 1 deletion sphinx-docs/substitutions.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ You must supply a substitution file and either specify `regex_word_replace=true`

## 1:1 substitution by data table

Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.
Performs a 1:1 replacement on a filtered subset of the data table. If the
input column data equals a value in the second column of the substitution file,
it is replaced with the data in the first column of the substitution file.
For example, the substitution file row `rose,rosie` replaces `rosie` with `rose`.
Used to replace variant name forms with standardized name forms, filtering on
a column such as sex, which may affect which names are common.

* Attributes:
* `join_column` -- Type: `string`. Column to filter input data on.
Expand Down

0 comments on commit 52d7721

Please sign in to comment.