Skip to content

Commit

Permalink
Merge pull request #139 from ipums/unskip_tests
Browse files Browse the repository at this point in the history
Unskip several skipped tests
  • Loading branch information
riley-harper authored Jun 18, 2024
2 parents 0370afb + 4ea6298 commit 2c9c787
Show file tree
Hide file tree
Showing 9 changed files with 254 additions and 367 deletions.
62 changes: 31 additions & 31 deletions hlink/tests/input_data/potential_matches_agg.csv
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
namelast_clean_a,namelast_clean_b,histid_a,histid_b,bpl_a,bpl_b,namefrst_unstd_a,namefrst_unstd_b,sex_a,sex_b,namefrst_jw,namelast_jw,regionf,state_distance,exact,exact_all
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00
symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
eilbatt,eilbott,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,4700,4700,reginald,reginald,1,1,1.0,0.9428571428571428,6,0,1.00,0.00
knopke,knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,2100,andrew,andrew,1,1,1.0,1.0,6,0,1.00,1.00
caldwell,caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,15010,15010,daisy,daisy,2,2,1.0,1.0,99,0,1.00,1.00
sonnenschein,sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1700,1700,max,max,1,1,1.0,1.0,3,0,1.00,1.00
gibson,gebson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,5500,5500,dwight,dwight,1,1,1.0,0.9,3,0,1.00,0.00
hegewald,hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,5600,karl,karl,1,1,1.0,1.0,8,0,1.00,1.00
king,king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,3800,virgel,virgil,1,1,0.9333333333333333,1.0,4,0,0.00,0.00
looney,looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,4700,4700,sadie,sadye,2,2,0.9066666666666667,1.0,6,0,0.00,0.00
rydstrom,rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,1700,hubert,hubert,1,1,1.0,1.0,3,0,1.00,1.00
mugrdickian,mugrdichian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,3600,misak,misak,1,1,1.0,0.977961432506887,2,0,1.00,0.00
brightman,brightman,195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,3900,austin,anstin,1,1,0.9,1.0,3,0,0.00,0.00
harman,harman,74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,5400,eston,estan,1,1,0.9066666666666667,1.0,5,0,0.00,0.00
oglesby,oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,4000,stephen,stephen,1,1,1.0,1.0,7,0,1.00,1.00
kassik,kassek,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,5600,5600,james,james,1,1,1.0,0.9333333333333333,8,0,1.00,0.00
wood,wood,EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,1700,dudley,dudley,1,1,1.0,1.0,3,0,1.00,1.00
foulkrod,foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,4200,s,s,1,1,1.0,1.0,2,0,1.00,1.00
huges,hughes,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,100,keneth,kenneth,1,1,0.9666666666666667,0.9611111111111111,6,0,0.00,0.00
caldwell,caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,5000,nathan,nathan,1,1,1.0,1.0,1,0,1.00,1.00
platta,platts,E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,1200,norman,norman,1,1,1.0,0.9444444444444444,5,0,1.00,0.00
lipscomb,lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,1300,roy,roy,1,1,1.0,1.0,5,0,1.00,1.00
woodburne,woodburn,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,200,walter,walter,1,1,1.0,0.9925925925925926,9,0,1.00,0.00
histid_a,histid_b,namefrst_jw,namelast_jw,regionf,state_distance
0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0
095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,1.0,0.9428571428571428,6,0
EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,1.0,1.0,6,0
AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,1.0,1.0,99,0
8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1.0,1.0,3,0
F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,1.0,0.9,3,0
D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,1.0,1.0,8,0
CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,0.9333333333333333,1.0,4,0
4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,0.9066666666666667,1.0,6,0
CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1.0,1.0,3,0
2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,1.0,0.977961432506887,2,0
195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,0.9,1.0,3,0
74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,0.9066666666666667,1.0,5,0
F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,1.0,1.0,7,0
6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,1.0,0.9333333333333333,8,0
EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1.0,1.0,3,0
47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,1.0,1.0,2,0
7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,0.9666666666666667,0.9611111111111111,6,0
A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,1.0,1.0,1,0
E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1.0,0.9444444444444444,5,0
671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1.0,1.0,5,0
81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,1.0,0.9925925925925926,9,0
25 changes: 25 additions & 0 deletions hlink/tests/input_data/prepped_df_a_agg.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
histid,bpl,namelast_clean,namefrst_unstd,sex
0202928A-AC3E-48BB-8568-3372067F35C7,3100,cridlebaugh,gerald,1
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,3600,symonds,horace,1
095AD921-9B08-468E-817A-44879FBCADDE,60094,abrahams,isiah,1
6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,4700,eilbatt,reginald,1
EAD03D68-F21D-4A74-8C16-F9123F5288D7,2100,knopke,andrew,1
AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,15010,caldwell,daisy,2
8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,1700,sonnenschein,max,1
F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,5500,gibson,dwight,1
D30C40B9-2E7C-4933-84CE-CEAAB37E3209,5600,hegewald,karl,1
CCBA170F-93D0-42C3-A57B-CCABBF2772FB,3800,king,virgel,1
4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,4700,looney,sadie,2
CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,1700,rydstrom,hubert,1
2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,3600,mugrdickian,misak,1
195EA695-D047-4045-8757-E7A22F12E148,3900,brightman,austin,1
74941094-9737-40F0-BF3C-0C2380B08040,5400,harman,eston,1
F0F34E2F-49CC-4F06-8CC4-691CF3150244,4000,oglesby,stephen,1
6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,5600,kassik,james,1
EE22ED8E-9544-4C77-A689-75895376E3EB,1700,wood,dudley,1
47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,4200,foulkrod,s,1
7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,100,huges,keneth,1
A859D9BC-6106-43A2-8A47-B12D9D2C49C8,5000,caldwell,nathan,1
E19E5381-C68D-4E03-A688-597DF13311CE,1200,platta,norman,1
671DE512-479B-4EEB-85B4-93A848E6BDD7,1300,lipscomb,roy,1
81E992C0-3796-4BE7-B02E-9CAD0289C6EC,200,woodburne,walter,1
31 changes: 31 additions & 0 deletions hlink/tests/input_data/prepped_df_b_agg.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
histid,bpl,namelast_clean,namefrst_unstd,sex
001B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
002B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
003B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1
00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1
01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
00669345-C937-4405-A0F0-1FCA5204DF64,4700,eilbott,reginald,1
007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,knopke,andrew,1
00849961-E52F-42F2-9B70-052606223052,15010,caldwell,daisy,2
00C4291F-7064-4A81-8589-5854C367EEC4,1700,sonnenschein,max,1
010F244F-94D0-4295-82DB-0E172724358A,5500,gebson,dwight,1
01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,hegewald,karl,1
0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,king,virgil,1
016EF43B-E70F-440E-882E-E447663F682F,4700,looney,sadye,2
018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,rydstrom,hubert,1
019D26A0-0335-48B5-A6D6-1D499424BE84,3600,mugrdichian,misak,1
0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,brightman,anstin,1
0282109F-581C-4B8E-A99D-135CF0077C2E,5400,harman,estan,1
02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,oglesby,stephen,1
033FD0FA-C523-42B5-976A-751E830F7021,5600,kassek,james,1
0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,wood,dudley,1
03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,foulkrod,s,1
038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,hughes,kenneth,1
039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,caldwell,nathan,1
03B89FD5-872A-4504-9758-F5AA1607BA01,1200,platts,norman,1
03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,lipscomb,roy,1
03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,woodburn,walter,1
4 changes: 0 additions & 4 deletions hlink/tests/main_loop_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@ def test_do_get_steps(capsys, main, spark):
main.do_get_steps("")
output = capsys.readouterr().out
for step in steps:
if str(step) not in output:
print(type(step))
print(step)
print(output)
assert str(step) in output


Expand Down
84 changes: 0 additions & 84 deletions hlink/tests/matching_potential_matches_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,90 +4,6 @@
# https://github.com/ipums/hlink

from jinja2 import Environment, PackageLoader
import pytest


@pytest.mark.skip(
    reason="We still want to test that these aggregate features are being created correctly, but we need to refactor this test to account for the fact that aggregate features are now being created in a different step (step 4 doesn't exist anymore and the functionality was moved in the code)."
)
def test_step_4_aggregate_features(
    spark, matching_conf, matching, potential_matches_agg_path
):
    """Check the aggregate feature columns on the potential matches table.

    The CSV at ``potential_matches_agg_path`` is loaded and saved as the
    ``potential_matches`` table, ``matching.step_4_aggregate_features()`` is
    run, and the resulting table is pulled back as a pandas frame. The test
    then checks the overall frame shape and the values of ``exact``,
    ``exact_all``, ``hits``, ``hits2``, ``exact_mult``, ``exact_all_mult`` and
    ``exact_all_mult2`` for one specific cridlebaugh candidate pair.
    """
    # NOTE(review): currently skipped; per the skip reason above, the step
    # this test calls no longer exists, so the body needs refactoring before
    # the marker can be removed.
    matching_conf["id_column"] = "histid"

    jw_feature = {
        "alias": "namelast_jw",
        "column_name": "namelast",
        "comparison_type": "jaro_winkler",
    }
    matching_conf["comparison_features"] = [
        jw_feature,
        {"alias": "exact"},
        {"alias": "exact_all"},
    ]

    independent_vars = [
        "namelast_jw",
        "exact",
        "exact_all",
        "hits",
        "hits2",
        "exact_mult",
        "exact_all_mult",
        "exact_all_mult2",
    ]
    matching_conf["training"] = {"independent_vars": independent_vars}

    # Stage the fixture as the table the step operates on.
    fixture_df = matching.spark.read.csv(
        potential_matches_agg_path, header=True, inferSchema=True
    )
    fixture_df.write.mode("overwrite").saveAsTable("potential_matches")
    matching.step_4_aggregate_features()

    pm_df = matching.spark.table("potential_matches").toPandas()
    assert pm_df.shape == (30, 21)

    # Every value check below targets the same candidate pair, so select it
    # once and compare column by column.
    selected = pm_df.query(
        "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'"
    )
    expected_by_column = {
        "exact": 1,
        "exact_all": 1,
        "hits": 3,
        "hits2": 9,
        "exact_mult": 3,
        "exact_all_mult": 3,
        "exact_all_mult2": 9,
    }
    for column, expected in expected_by_column.items():
        assert selected[column].iloc[0] == expected


def test_potential_matches_sql_template() -> None:
Expand Down
Loading

0 comments on commit 2c9c787

Please sign in to comment.