-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into add_new_ml_algs
- Loading branch information
Showing
20 changed files
with
229 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Configuring Model Exploration |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
<!DOCTYPE html> | ||
|
||
<html lang="en" data-content_root="./"> | ||
<head> | ||
<meta charset="utf-8" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | ||
|
||
<title>Configuring Model Exploration — hlink 3.7.0 documentation</title> | ||
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" /> | ||
<link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" /> | ||
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" /> | ||
<script src="_static/documentation_options.js?v=229cbe3b"></script> | ||
<script src="_static/doctools.js?v=9bcbadda"></script> | ||
<script src="_static/sphinx_highlight.js?v=dc90522c"></script> | ||
<link rel="index" title="Index" href="genindex.html" /> | ||
<link rel="search" title="Search" href="search.html" /> | ||
<link rel="prev" title="Models" href="models.html" /> | ||
|
||
<link rel="stylesheet" href="_static/custom.css" type="text/css" /> | ||
|
||
|
||
|
||
|
||
|
||
</head><body> | ||
|
||
|
||
<div class="document"> | ||
<div class="documentwrapper"> | ||
<div class="bodywrapper"> | ||
|
||
|
||
<div class="body" role="main"> | ||
|
||
<section id="configuring-model-exploration"> | ||
<h1>Configuring Model Exploration<a class="headerlink" href="#configuring-model-exploration" title="Link to this heading">¶</a></h1> | ||
</section> | ||
|
||
|
||
</div> | ||
|
||
</div> | ||
</div> | ||
<div class="sphinxsidebar" role="navigation" aria-label="Main"> | ||
<div class="sphinxsidebarwrapper"> | ||
<h1 class="logo"><a href="index.html">hlink</a></h1> | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<search id="searchbox" style="display: none" role="search"> | ||
<div class="searchformwrapper"> | ||
<form class="search" action="search.html" method="get"> | ||
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/> | ||
<input type="submit" value="Go" /> | ||
</form> | ||
</div> | ||
</search> | ||
<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3> | ||
<ul> | ||
<li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="link_tasks.html">Link Tasks</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="running_the_program.html">Running hlink</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="use_examples.html">Advanced Workflows</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="config.html">Configuration</a></li> | ||
</ul> | ||
<p class="caption" role="heading"><span class="caption-text">Configuration API</span></p> | ||
<ul class="current"> | ||
<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="comparisons.html">Comparisons</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="comparison_features.html">Comparison Features</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li> | ||
<li class="toctree-l1 current"><a class="current reference internal" href="#">Model Exploration</a></li> | ||
</ul> | ||
|
||
<div class="relations"> | ||
<h3>Related Topics</h3> | ||
<ul> | ||
<li><a href="index.html">Documentation overview</a><ul> | ||
<li>Previous: <a href="models.html" title="previous chapter">Models</a></li> | ||
</ul></li> | ||
</ul> | ||
</div> | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</div> | ||
</div> | ||
<div class="clearer"></div> | ||
</div> | ||
<div class="footer"> | ||
©2019-2022, IPUMS. | ||
|
||
| | ||
Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a> | ||
& <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a> | ||
|
||
| | ||
<a href="_sources/model_exploration.md.txt" | ||
rel="nofollow">Page source</a> | ||
</div> | ||
|
||
|
||
|
||
|
||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# This file is part of the ISRDI's hlink. | ||
# For copyright and licensing information, see the NOTICE and LICENSE files | ||
# in this project's top-level directory, and also on-line at: | ||
# https://github.com/ipums/hlink | ||
|
||
from pathlib import Path | ||
|
||
from pyspark.sql import Row, SparkSession | ||
|
||
from hlink.linking.core.substitutions import generate_substitutions, _load_substitutions | ||
|
||
|
||
def test_load_substitutions(tmp_path: Path) -> None:
    """Check how _load_substitutions() splits a two-column CSV file.

    The file is written to pytest's temporary directory. The expectations
    below show that the first CSV column comes back as the "to" list and the
    second column as the "from" list, and that the first line of the file is
    returned like any other row rather than being skipped as a header.
    """
    csv_path = tmp_path / "substitutions.csv"
    csv_text = """a,b
to this,from this"""
    csv_path.write_text(csv_text)

    # The loader returns (froms, tos) in that order.
    loaded = _load_substitutions(str(csv_path))
    sub_froms, sub_tos = loaded

    assert sub_tos == ["a", "to this"]
    assert sub_froms == ["b", "from this"]
|
||
|
||
def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None:
    """Check that generate_substitutions() only rewrites rows matching the join.

    A substitution CSV is written to pytest's temporary directory; each line
    has the form ``replacement,original`` (e.g. ``amanda,mandy`` turns
    "mandy" into "amanda"). The substitution is configured for the
    ``first_name`` column and restricted to rows where ``sex == 2``, so the
    row with ``sex == 1`` must keep its original name even though the file
    contains an entry for it.

    The comparison is order-independent: the test never sorts the DataFrame,
    and Spark does not guarantee the row order of ``collect()`` without an
    explicit ordering, so both sides are compared in sorted order.
    """
    tmp_file = tmp_path / "substitutions.csv"
    tmp_file.write_text(
        """rose,rosie
sophia,sophy
sophia,sofia
amanda,mandy
jane,jean"""
    )

    df = spark.createDataFrame(
        [("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)],
        schema=["first_name", "sex"],
    )

    substitution_columns = [
        {
            "column_name": "first_name",
            "substitutions": [
                {
                    "join_column": "sex",
                    "join_value": 2,
                    "substitution_file": str(tmp_file),
                }
            ],
        }
    ]

    subbed_df = generate_substitutions(spark, df, substitution_columns)
    rows = subbed_df.select("first_name", "sex").collect()

    # Listed in sorted (first_name, sex) order to match sorted(rows) below.
    expected = [
        Row(first_name="agnes", sex=2),
        Row(first_name="amanda", sex=2),
        # Note that this name is not substituted because we join on sex=2
        Row(first_name="jean", sex=1),
        Row(first_name="rose", sex=2),
        Row(first_name="sophia", sex=2),
    ]

    # Row is a tuple subclass, so rows sort and compare field by field.
    assert sorted(rows) == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters