diff --git a/hlink/tests/input_data/potential_matches_agg.csv b/hlink/tests/input_data/potential_matches_agg.csv index 1416be9..95f836f 100644 --- a/hlink/tests/input_data/potential_matches_agg.csv +++ b/hlink/tests/input_data/potential_matches_agg.csv @@ -1,31 +1,31 @@ -namelast_clean_a,namelast_clean_b,histid_a,histid_b,bpl_a,bpl_b,namefrst_unstd_a,namefrst_unstd_b,sex_a,sex_b,namefrst_jw,namelast_jw,regionf,state_distance,exact,exact_all -cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 -cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 -cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 -symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00 -symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00 -abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 -abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 -abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 -abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 -eilbatt,eilbott,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,4700,4700,reginald,reginald,1,1,1.0,0.9428571428571428,6,0,1.00,0.00 -knopke,knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,2100,andrew,andrew,1,1,1.0,1.0,6,0,1.00,1.00 -caldwell,caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,15010,15010,daisy,daisy,2,2,1.0,1.0,99,0,1.00,1.00 -sonnenschein,sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1700,1700,max,max,1,1,1.0,1.0,3,0,1.00,1.00 -gibson,gebson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,5500,5500,dwight,dwight,1,1,1.0,0.9,3,0,1.00,0.00 -hegewald,hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,5600,karl,karl,1,1,1.0,1.0,8,0,1.00,1.00 -king,king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,3800,virgel,virgil,1,1,0.9333333333333333,1.0,4,0,0.00,0.00 -looney,looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,4700,4700,sadie,sadye,2,2,0.9066666666666667,1.0,6,0,0.00,0.00 -rydstrom,rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,1700,hubert,hubert,1,1,1.0,1.0,3,0,1.00,1.00 -mugrdickian,mugrdichian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,3600,misak,misak,1,1,1.0,0.977961432506887,2,0,1.00,0.00 -brightman,brightman,195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,3900,austin,anstin,1,1,0.9,1.0,3,0,0.00,0.00 -harman,harman,74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,5400,eston,estan,1,1,0.9066666666666667,1.0,5,0,0.00,0.00 -oglesby,oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,4000,stephen,stephen,1,1,1.0,1.0,7,0,1.00,1.00 -kassik,kassek,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,5600,5600,james,james,1,1,1.0,0.9333333333333333,8,0,1.00,0.00 -wood,wood,EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,1700,dudley,dudley,1,1,1.0,1.0,3,0,1.00,1.00 -foulkrod,foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,4200,s,s,1,1,1.0,1.0,2,0,1.00,1.00 -huges,hughes,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,100,keneth,kenneth,1,1,0.9666666666666667,0.9611111111111111,6,0,0.00,0.00 -caldwell,caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,5000,nathan,nathan,1,1,1.0,1.0,1,0,1.00,1.00 -platta,platts,E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,1200,norman,norman,1,1,1.0,0.9444444444444444,5,0,1.00,0.00 -lipscomb,lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,1300,roy,roy,1,1,1.0,1.0,5,0,1.00,1.00 -woodburne,woodburn,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,200,walter,walter,1,1,1.0,0.9925925925925926,9,0,1.00,0.00 \ No newline at end of file +histid_a,histid_b,namefrst_jw,namelast_jw,regionf,state_distance +0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0 +0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0 +0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0 +1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0 +1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0 +095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0 +095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0 +095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0 +095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0 +6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,1.0,0.9428571428571428,6,0 +EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,1.0,1.0,6,0 +AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,1.0,1.0,99,0 +8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1.0,1.0,3,0 +F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,1.0,0.9,3,0 +D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,1.0,1.0,8,0 +CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,0.9333333333333333,1.0,4,0 +4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,0.9066666666666667,1.0,6,0 +CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1.0,1.0,3,0 +2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,1.0,0.977961432506887,2,0 +195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,0.9,1.0,3,0 +74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,0.9066666666666667,1.0,5,0 +F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,1.0,1.0,7,0 +6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,1.0,0.9333333333333333,8,0 +EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1.0,1.0,3,0 +47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,1.0,1.0,2,0 +7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,0.9666666666666667,0.9611111111111111,6,0 +A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,1.0,1.0,1,0 +E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1.0,0.9444444444444444,5,0 +671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1.0,1.0,5,0 +81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,1.0,0.9925925925925926,9,0 diff --git a/hlink/tests/input_data/prepped_df_a_agg.csv b/hlink/tests/input_data/prepped_df_a_agg.csv new file mode 100644 index 0000000..53a72ad --- /dev/null +++ b/hlink/tests/input_data/prepped_df_a_agg.csv @@ -0,0 +1,25 @@ +histid,bpl,namelast_clean,namefrst_unstd,sex +0202928A-AC3E-48BB-8568-3372067F35C7,3100,cridlebaugh,gerald,1 +1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,3600,symonds,horace,1 +095AD921-9B08-468E-817A-44879FBCADDE,60094,abrahams,isiah,1 +6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,4700,eilbatt,reginald,1 +EAD03D68-F21D-4A74-8C16-F9123F5288D7,2100,knopke,andrew,1 +AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,15010,caldwell,daisy,2 +8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,1700,sonnenschein,max,1 +F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,5500,gibson,dwight,1 +D30C40B9-2E7C-4933-84CE-CEAAB37E3209,5600,hegewald,karl,1 +CCBA170F-93D0-42C3-A57B-CCABBF2772FB,3800,king,virgel,1 +4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,4700,looney,sadie,2 +CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,1700,rydstrom,hubert,1 +2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,3600,mugrdickian,misak,1 +195EA695-D047-4045-8757-E7A22F12E148,3900,brightman,austin,1 +74941094-9737-40F0-BF3C-0C2380B08040,5400,harman,eston,1 +F0F34E2F-49CC-4F06-8CC4-691CF3150244,4000,oglesby,stephen,1 +6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,5600,kassik,james,1 +EE22ED8E-9544-4C77-A689-75895376E3EB,1700,wood,dudley,1 +47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,4200,foulkrod,s,1 +7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,100,huges,keneth,1 +A859D9BC-6106-43A2-8A47-B12D9D2C49C8,5000,caldwell,nathan,1 +E19E5381-C68D-4E03-A688-597DF13311CE,1200,platta,norman,1 +671DE512-479B-4EEB-85B4-93A848E6BDD7,1300,lipscomb,roy,1 +81E992C0-3796-4BE7-B02E-9CAD0289C6EC,200,woodburne,walter,1 diff --git a/hlink/tests/input_data/prepped_df_b_agg.csv b/hlink/tests/input_data/prepped_df_b_agg.csv new file mode 100644 index 0000000..245860a --- /dev/null +++ b/hlink/tests/input_data/prepped_df_b_agg.csv @@ -0,0 +1,31 @@ +histid,bpl,namelast_clean,namefrst_unstd,sex +001B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1 +002B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1 +003B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1 +00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1 +00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1 +01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1 +02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1 +03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1 +00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1 +00669345-C937-4405-A0F0-1FCA5204DF64,4700,eilbott,reginald,1 +007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,knopke,andrew,1 +00849961-E52F-42F2-9B70-052606223052,15010,caldwell,daisy,2 +00C4291F-7064-4A81-8589-5854C367EEC4,1700,sonnenschein,max,1 +010F244F-94D0-4295-82DB-0E172724358A,5500,gebson,dwight,1 +01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,hegewald,karl,1 +0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,king,virgil,1 +016EF43B-E70F-440E-882E-E447663F682F,4700,looney,sadye,2 +018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,rydstrom,hubert,1 +019D26A0-0335-48B5-A6D6-1D499424BE84,3600,mugrdichian,misak,1 +0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,brightman,anstin,1 +0282109F-581C-4B8E-A99D-135CF0077C2E,5400,harman,estan,1 +02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,oglesby,stephen,1 +033FD0FA-C523-42B5-976A-751E830F7021,5600,kassek,james,1 +0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,wood,dudley,1 +03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,foulkrod,s,1 +038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,hughes,kenneth,1 +039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,caldwell,nathan,1 +03B89FD5-872A-4504-9758-F5AA1607BA01,1200,platts,norman,1 +03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,lipscomb,roy,1 +03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,woodburn,walter,1 diff --git a/hlink/tests/main_loop_test.py b/hlink/tests/main_loop_test.py index f9ba9e5..8d16325 100755 --- a/hlink/tests/main_loop_test.py +++ b/hlink/tests/main_loop_test.py @@ -19,10 +19,6 @@ def test_do_get_steps(capsys, main, spark): main.do_get_steps("") output = capsys.readouterr().out for step in steps: - if str(step) not in output: - print(type(step)) - print(step) - print(output) assert str(step) in output diff --git a/hlink/tests/matching_potential_matches_test.py b/hlink/tests/matching_potential_matches_test.py index c626df2..ac1e552 100755 --- a/hlink/tests/matching_potential_matches_test.py +++ b/hlink/tests/matching_potential_matches_test.py @@ -4,90 +4,6 @@ # https://github.com/ipums/hlink from jinja2 import Environment, PackageLoader -import pytest - - -@pytest.mark.skip( - reason="We still want to test that these aggregate features are being created correctly, but we need to refactor this test to account for the fact that aggregate features are now being created in a different step (step 4 doesn't exist anymore and the functionality was moved in the code)." -) -def test_step_4_aggregate_features( - spark, matching_conf, matching, potential_matches_agg_path -): - """Test adding aggregate features (hits, hits2, exact_all_mult, etc.) to potential matches""" - matching_conf["id_column"] = "histid" - matching_conf["comparison_features"] = [ - { - "alias": "namelast_jw", - "column_name": "namelast", - "comparison_type": "jaro_winkler", - }, - {"alias": "exact"}, - {"alias": "exact_all"}, - ] - matching_conf["training"] = { - "independent_vars": [ - "namelast_jw", - "exact", - "exact_all", - "hits", - "hits2", - "exact_mult", - "exact_all_mult", - "exact_all_mult2", - ] - } - - potential_matches = matching.spark.read.csv( - potential_matches_agg_path, header=True, inferSchema=True - ) - potential_matches.write.mode("overwrite").saveAsTable("potential_matches") - matching.step_4_aggregate_features() - - pm_df = matching.spark.table("potential_matches").toPandas() - - assert pm_df.shape == (30, 21) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["exact"].iloc[0] - == 1 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["exact_all"].iloc[0] - == 1 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["hits"].iloc[0] - == 3 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["hits2"].iloc[0] - == 9 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["exact_mult"].iloc[0] - == 3 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["exact_all_mult"].iloc[0] - == 3 - ) - assert ( - pm_df.query( - "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" - )["exact_all_mult2"].iloc[0] - == 9 - ) def test_potential_matches_sql_template() -> None: diff --git a/hlink/tests/matching_scoring_test.py b/hlink/tests/matching_scoring_test.py index a624618..613e1f6 100755 --- a/hlink/tests/matching_scoring_test.py +++ b/hlink/tests/matching_scoring_test.py @@ -3,84 +3,13 @@ # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink -import hlink.tests import pandas as pd -import pytest import hlink.linking.core.threshold as threshold_core from hlink.linking.matching.link_step_score import LinkStepScore -@pytest.mark.skip( - reason="We still want to test that whatever 'secondary_threshold' became is being applied correctly, but we need to refactor this test to account for the fact that this was totally renamed and is now being carried out in a different step (step 3 doesn't exist anymore)." -) -def test_step_3_uniq_and_secondary_threshold(spark, matching_conf, matching): - """Test a secondary threshold with uniqueness""" - matching_conf["comparison_features"] = [ - { - "alias": "namefrst_jw", - "column_name": "namefrst", - "comparison_type": "jaro_winkler", - }, - { - "alias": "namelast_jw", - "column_name": "namelast", - "comparison_type": "jaro_winkler", - }, - ] - - matching_conf["comparisons"] = { - "comp_a": { - "feature_name": "namefrst_jw", - "threshold": 0.8, - "comparison_type": "threshold", - }, - "comp_b": { - "feature_name": "namelast_jw", - "comparison_type": "threshold", - "threshold": 0.8, - }, - "operator": "AND", - } - - matching_conf["secondary_threshold"] = { - "threshold_a": { - "feature_name": "namefrst_jw", - "comparison_type": "threshold", - "threshold": 0.9, - }, - "threshold_b": { - "feature_name": "namelast_jw", - "comparison_type": "threshold", - "threshold": 0.9, - }, - "unique_true": {"id_a": "id_a", "id_b": "id_b"}, - "operator": "AND", - "secondary": True, - } - - matching.step_0_explode() - matching.step_1_match() - hlink.linking.matching._step_2_score.__create_features(matching, matching_conf) - - # Create pandas DFs of the step_2 potential matches table - potential_matches_df = spark.table("potential_matches_prepped").toPandas() - - # matching.step_3_secondary_threshold() - # unique_matches_df = spark.table("potential_matches").toPandas() - unique_high_matches_df = spark.table("potential_matches_prepped").toPandas() - - assert len(potential_matches_df.id_a) == 5 - # assert (len(unique_matches_df.id_a) == 1) - # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] > 0.8) - # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] < 0.9) - # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.8) - # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.9) - assert unique_high_matches_df.empty - - -# TODO: is there a step 3 anymore? -def test_step_3_skip_on_no_conf(spark, matching_conf, matching, capsys): - """Test matching step 3 doesn't run if no training config""" +def test_step_2_skip_on_no_conf(spark, matching_conf, matching, capsys): + """Test matching step 2 doesn't run if no training config""" matching_conf["comparison_features"] = [ { @@ -102,11 +31,10 @@ def test_step_3_skip_on_no_conf(spark, matching_conf, matching, capsys): ) -# TODO: is there a step 3 any more? -def test_step_3_alpha_beta_thresholds( +def test_step_2_alpha_beta_thresholds( spark, matching, matching_conf, threshold_ratio_data_path_2 ): - """Test matching step 3 with both probability and ratio thresholds""" + """Test matching step 2 with both probability and ratio thresholds""" matching.spark.read.csv( threshold_ratio_data_path_2, header=True, inferSchema=True @@ -170,3 +98,74 @@ def test_step_3_alpha_beta_thresholds( assert tp.query("histid_a == '5a' and histid_b == '7b'")["prediction"].iloc[0] == 1 assert tp.query("histid_a == '5a' and histid_b == '6b'")["prediction"].iloc[0] == 0 + + +def test_step_2_aggregate_features( + spark, matching_conf, matching, agg_features_datasources +): + matching_conf["id_column"] = "histid" + matching_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "exact", + "column_names": ["namefrst_unstd", "namelast_clean"], + "comparison_type": "all_equals", + }, + { + "alias": "exact_all", + "column_names": ["namefrst_unstd", "namelast_clean", "bpl"], + "comparison_type": "all_equals", + }, + ] + matching_conf["training"] = { + "independent_vars": [ + "namelast_jw", + "exact", + "exact_all", + "hits", + "hits2", + "exact_mult", + "exact_all_mult", + "exact_all_mult2", + ], + "chosen_model": { + "type": "probit", + "threshold": 0.5, + }, + "dependent_var": "match", + } + + potential_matches_path, prepped_df_a_path, prepped_df_b_path = ( + agg_features_datasources + ) + spark.read.csv(potential_matches_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("potential_matches") + + spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + + link_step_score = LinkStepScore(matching) + link_step_score._create_features(matching_conf) + + pm_prepped = spark.table("potential_matches_prepped").toPandas() + + filtered = pm_prepped.query( + "histid_a == '0202928A-AC3E-48BB-8568-3372067F35C7' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + ) + + assert filtered["exact"].item() + assert filtered["exact_all"].item() + assert filtered["hits"].item() == 3 + assert filtered["hits2"].item() == 9 + assert filtered["exact_mult"].item() + assert filtered["exact_all_mult"].item() == 3 + assert filtered["exact_all_mult2"].item() == 9 diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 8fb1e17..e0cf593 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -552,169 +552,3 @@ def test_step_2_split_by_id_a( assert splits[1][1].toPandas()["id_a"].unique().tolist() == ["30"] main.do_drop_all("") - - -@pytest.mark.skip( - reason="Need to get tests working for new version of feature importances" -) -def test_step_3_get_feature_importances_random_forest( - spark, - training_conf, - training, - state_dist_path, - datasource_training_input, - potential_matches_path, - spark_test_tmp_dir_path, - model_exploration, -): - """Test running the chosen model on potential matches dataset""" - td_path, pa_path, pb_path = datasource_training_input - - training_conf["comparison_features"] = [ - { - "alias": "regionf", - "column_name": "region", - "comparison_type": "fetch_a", - "categorical": True, - }, - { - "alias": "namelast_jw", - "column_name": "namelast", - "comparison_type": "jaro_winkler", - }, - { - "alias": "state_distance", - "column_name": "bpl", - "key_count": 1, - "comparison_type": "geo_distance", - "loc_a": "statecode1", - "loc_b": "statecode2", - "distance_col": "dist", - "table_name": "state_distances_lookup", - "distances_file": state_dist_path, - }, - ] - - training_conf["training"]["dataset"] = td_path - training_conf["training"]["dependent_var"] = "match" - training_conf["training"]["independent_vars"] = [ - "namelast_jw", - "regionf", - "state_distance", - ] - training_conf["training"]["chosen_model"] = { - "type": "random_forest", - "maxDepth": 6, - "numTrees": 100, - "featureSubsetStrategy": "sqrt", - } - - # training_conf["training"]["use_potential_matches_features"] = True - training_conf["training"]["score_with_model"] = True - training_conf["training"]["feature_importances"] = True - training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path - training_conf["drop_data_from_scored_matches"] = True - - training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( - "overwrite" - ).saveAsTable("prepped_df_a") - training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( - "overwrite" - ).saveAsTable("prepped_df_b") - training.spark.read.csv( - potential_matches_path, header=True, inferSchema=True - ).write.mode("overwrite").saveAsTable("potential_matches") - - training.run_step(0) - training.run_step(1) - training.run_step(2) - - model_exploration.run_step(3) - - fi_df = training.spark.table("feature_importances").toPandas() - - assert fi_df.shape == (6, 3) - assert 1 > fi_df.query("idx == 0")["score"].iloc()[0] >= 0 - assert "regionf_onehotencoded_2" in list(fi_df["name"]) - assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"]) - - -@pytest.mark.skip( - reason="Need to get tests working for new version of feature importances" -) -def test_step_3_get_feature_importances_probit( - spark, - training_conf, - training, - state_dist_path, - datasource_training_input, - potential_matches_path, - spark_test_tmp_dir_path, - matching, -): - """Test running the chosen model on potential matches dataset""" - td_path, pa_path, pb_path = datasource_training_input - - training_conf["comparison_features"] = [ - { - "alias": "regionf", - "column_name": "region", - "comparison_type": "fetch_a", - "categorical": True, - }, - { - "alias": "namelast_jw", - "column_name": "namelast", - "comparison_type": "jaro_winkler", - }, - { - "alias": "state_distance", - "key_count": 1, - "column_name": "bpl", - "comparison_type": "geo_distance", - "loc_a": "statecode1", - "loc_b": "statecode2", - "distance_col": "dist", - "table_name": "state_distances_lookup", - "distances_file": state_dist_path, - }, - ] - - training_conf["training"]["dataset"] = td_path - training_conf["training"]["dependent_var"] = "match" - training_conf["training"]["independent_vars"] = [ - "namelast_jw", - "regionf", - "state_distance", - ] - - training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5} - - # training_conf["training"]["use_potential_matches_features"] = True - training_conf["training"]["score_with_model"] = True - training_conf["training"]["feature_importances"] = True - training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path - training_conf["drop_data_from_scored_matches"] = True - - training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( - "overwrite" - ).saveAsTable("prepped_df_a") - training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( - "overwrite" - ).saveAsTable("prepped_df_b") - training.spark.read.csv( - potential_matches_path, header=True, inferSchema=True - ).write.mode("overwrite").saveAsTable("potential_matches") - - training.run_step(0) - training.run_step(1) - training.run_step(2) - matching.run_step(2) - training.run_step(3) - - fi_df = training.spark.table("feature_importances").toPandas() - - assert fi_df.shape == (6, 3) - assert 25 > fi_df.query("idx == 0")["score"].iloc()[0] >= -5 - assert "regionf_onehotencoded_2" in list(fi_df["name"]) - assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"]) diff --git a/hlink/tests/plugins/external_data_paths.py b/hlink/tests/plugins/external_data_paths.py index a57947f..f04c525 100755 --- a/hlink/tests/plugins/external_data_paths.py +++ b/hlink/tests/plugins/external_data_paths.py @@ -148,15 +148,18 @@ def potential_matches_path_ids_only(spark): @pytest.fixture(scope="module") -def potential_matches_agg_path(spark): - """Create a fixture with the path to the test potential_matches csv file""" - - path = "input_data/potential_matches_agg.csv" +def agg_features_datasources() -> tuple[str, str, str]: + """Return the path to the potential_matches, prepped_df_a, and prepped_df_b csv data files.""" + potential_matches_path = "input_data/potential_matches_agg.csv" + prepped_df_a_path = "input_data/prepped_df_a_agg.csv" + prepped_df_b_path = "input_data/prepped_df_b_agg.csv" package_path = os.path.dirname(hlink.tests.__file__) - full_path = os.path.join(package_path, path) + full_pm_path = os.path.join(package_path, potential_matches_path) + full_prepped_a_path = os.path.join(package_path, prepped_df_a_path) + full_prepped_b_path = os.path.join(package_path, prepped_df_b_path) - return full_path + return full_pm_path, full_prepped_a_path, full_prepped_b_path @pytest.fixture(scope="module") diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py index 3730c02..0fbdb0a 100644 --- a/hlink/tests/training_test.py +++ b/hlink/tests/training_test.py @@ -349,6 +349,89 @@ def test_step_3_interacted_categorical_features( ) +def test_step_3_with_probit_model( + spark, training_conf, training, state_dist_path, datasource_training_input +): + training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input + """Run training step 3 with a probit ML model.""" + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + + training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5} + training_conf["training"]["score_with_model"] = True + training_conf["training"]["feature_importances"] = True + + spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + training.run_step(3) + + tfi = spark.table("training_feature_importances").toPandas() + assert ( + 8.9 + <= tfi.query("feature_name == 'namelast_jw'")[ + "coefficient_or_importance" + ].item() + <= 9.0 + ) + assert ( + tfi.query("feature_name == 'regionf' and category == 0")[ + "coefficient_or_importance" + ].item() + == 0 + ) + assert ( + -7.6 + <= tfi.query("feature_name == 'regionf' and category == 1")[ + "coefficient_or_importance" + ].item() + <= -7.5 + ) + assert ( + 6.4 + <= tfi.query("feature_name == 'regionf' and category == 99")[ + "coefficient_or_importance" + ].item() + <= 6.5 + ) + + def test_step_3_requires_table(training_conf, training): training_conf["training"]["feature_importances"] = True with pytest.raises(RuntimeError, match="Missing input tables"):