diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7806e3e --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +**/__pycache__ +**/*.egg_info +**/*.pyc diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8af6669 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +extend-exclude = conda_env/, working_dir/ +ignore = C901, E203, E266, E501, E722, E731, F401, F403, F405, W503 +max-line-length = 88 +max-complexity = 40 +select = B,C,E,F,W,T4,B9 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d82a7b0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +derby.log +*.egg-info +metastore_db +__pycache__ +activate-env.bash +build +conda_env/ +dist +scala_jar/data +scala_jar/target +scala_jar/project/target +output_data/ +.ipynb_checkpoints +.idea/ +*.class +*.cache +.DS_Store +hlink/tests/output_data +hlink/linking/reporting/output_reports +hlink_config/derby +hlink_config/spark_tmp_dir +hlink_config/warehouse +hlink_config/run.log +*.pyc +sphinx-docs/_* +run_test.sh +venv/ +working_dir/ +.coverage +coverage_* +*_coverage.xlsx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4d370f1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: +- repo: https://github.com/ambv/black + rev: 20.8b1 + hooks: + - id: black + language_version: python3.6 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v1.2.3 + hooks: + - id: flake8 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8f08ab3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,96 @@ +FROM python:3.6.5 + +# --------------------- JAVA INSTALL START --------------------- # +# Taken from the java openjdk 8 docker image: https://github.com/docker-library/openjdk/blob/7a33416016b60c045cf0ba99e82617ed6c130595/8/jre/slim/Dockerfile +RUN apt-get update && apt-get install -y --no-install-recommends \ + bzip2 \ + unzip \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* + +# Default to UTF-8 file.encoding +ENV LANG C.UTF-8 + +# add a simple script that can auto-detect the appropriate JAVA_HOME value +# based on whether the JDK or only the JRE is installed +RUN { \ + echo '#!/bin/sh'; \ + echo 'set -e'; \ + echo; \ + echo 'dirname "$(dirname "$(readlink -f "$(which javac || which java)")")"'; \ + } > /usr/local/bin/docker-java-home \ + && chmod +x /usr/local/bin/docker-java-home + +# do some fancy footwork to create a JAVA_HOME that's cross-architecture-safe +RUN ln -svT "/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)" /docker-java-home +ENV JAVA_HOME /docker-java-home/jre + +ENV JAVA_VERSION 8u212 +ENV JAVA_DEBIAN_VERSION 8u212-b01-1~deb9u1 + +# see https://bugs.debian.org/775775 +# and https://github.com/docker-library/java/issues/19#issuecomment-70546872 +# ENV CA_CERTIFICATES_JAVA_VERSION 20170531+nmu1 + +RUN set -ex; \ + \ +# deal with slim variants not having man page directories (which causes "update-alternatives" to fail) + if [ ! 
-d /usr/share/man/man1 ]; then \ + mkdir -p /usr/share/man/man1; \ + fi; \ + \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + openjdk-8-jre-headless \ + # ca-certificates-java="$CA_CERTIFICATES_JAVA_VERSION" \ + ; \ + rm -rf /var/lib/apt/lists/*; \ + \ +# verify that "docker-java-home" returns what we expect + [ "$(readlink -f "$JAVA_HOME")" = "$(docker-java-home)" ]; \ + \ +# update-alternatives so that future installs of other OpenJDK versions don't change /usr/bin/java + update-alternatives --get-selections | awk -v home="$(readlink -f "$JAVA_HOME")" 'index($3, home) == 1 { $2 = "manual"; print | "update-alternatives --set-selections" }'; \ +# ... and verify that it actually worked for one of the alternatives we care about + update-alternatives --query java | grep -q 'Status: manual' + +# see CA_CERTIFICATES_JAVA_VERSION notes above +RUN /var/lib/dpkg/info/ca-certificates-java.postinst configure + +# --------------------- JAVA INSTALL END --------------------- # + +# --------------------- SBT INSTALL --------------------- # +ENV SCALA_VERSION 2.11.12 +ENV SBT_VERSION 1.1.6 + +# Scala expects this file +RUN touch /usr/lib/jvm/java-8-openjdk-amd64/release + +# Install Scala +## Piping curl directly in tar +RUN \ + curl -fsL https://downloads.typesafe.com/scala/$SCALA_VERSION/scala-$SCALA_VERSION.tgz | tar xfz - -C /root/ && \ + echo >> /root/.bashrc && \ + echo "export PATH=~/scala-$SCALA_VERSION/bin:$PATH" >> /root/.bashrc + +# Install sbt +RUN \ + curl -L -o sbt-$SBT_VERSION.deb "https://scala.jfrog.io/ui/api/v1/download?repoKey=debian&path=/sbt-$SBT_VERSION.deb" && \ + dpkg -i sbt-$SBT_VERSION.deb && \ + rm sbt-$SBT_VERSION.deb && \ + apt-get update && \ + apt-get install sbt && \ + sbt sbtVersion + +# --------------------- SBT INSTALL END --------------------- # + + +RUN mkdir /hlink +WORKDIR /hlink + +COPY scala_jar scala_jar +RUN cd scala_jar && sbt assembly + +COPY . . +RUN mv scala_jar/target/scala-2.11/*.jar hlink/spark/jars/ +RUN pip install . 
diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100755 index 0000000..c860f87 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,36 @@ +node { + def image_id = "hlink:${env.BUILD_TAG}" + stage("Checkout") { + deleteDir() + checkout scm + } + stage("Deploy") { + host = "gp1.pop.umn.edu" + deploy_target = "/pkg/ipums/programming/linking/hlink/deploys/${env.BRANCH_NAME}" + def target_exists = sh script: "ssh ${host} 'test -d ${deploy_target}'", returnStatus:true + if (target_exists != 0) { + sh "ssh ${host} 'mkdir ${deploy_target} && cd ${deploy_target} && mkdir scripts'" + sh "ssh ${host} 'cd ${deploy_target} && git clone git@github.umn.edu:mpc/hlink.git'" + sh "ssh ${host} 'cd ${deploy_target} && /pkg/ipums/programming/conda/v4.8/envs/hlink/bin/virtualenv -p 3.6.5 venv'" + } + sh "ssh ${host} 'cd ${deploy_target}/hlink && git checkout ${env.BRANCH_NAME} && git pull origin ${env.BRANCH_NAME}'" + sh "rsync -av ./deploy/hlink ${host}:${deploy_target}/scripts/hlink" + sh "rsync -av ./deploy/global_conf.json ${host}:${deploy_target}/global_conf.json" + sh "ssh ${host} 'cd ${deploy_target} && sed -i \'s/XXX_BRANCH/${env.BRANCH_NAME}/g\' global_conf.json && sed -i \'s/XXX_BRANCH/${env.BRANCH_NAME}/g\' scripts/hlink'" + //sh "ssh ${host} 'cd ${deploy_target}/hlink/scala_jar && rm -rf target && /pkg/mpctools/bin/sbt assembly && cp ./target/scala-2.11/hlink_lib-assembly-1.0.jar ../hlink/spark/jars'" + sh "ssh ${host} 'cd ${deploy_target} && venv/bin/pip install ./hlink'" + } + + /*stage("Build") { + docker.build(image_id) + } + stage("Black") { + sh "docker run ${image_id} black --check ." + } + stage("Flake8") { + sh "docker run ${image_id} flake8 --count ." + } + stage("Test") { + sh "docker run ${image_id} pytest hlink/tests/" + }*/ +} diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..14e2f77 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. 
"Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. 
You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. 
No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. 
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+  This Source Code Form is subject to the terms of the Mozilla Public
+  License, v. 2.0. If a copy of the MPL was not distributed with this
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+  This Source Code Form is "Incompatible With Secondary Licenses", as
+  defined by the Mozilla Public License, v. 2.0.
diff --git a/NOTICE.txt b/NOTICE.txt
new file mode 100644
index 0000000..fe3ff4f
--- /dev/null
+++ b/NOTICE.txt
@@ -0,0 +1,16 @@
+hlink
+https://github.com/ipums/hlink
+
+Copyright 2019-2022 Regents of the University of Minnesota
+
+Contributors:
+    Jacob Wellington, ISRDI, University of Minnesota
+    Kelly Thompson, ISRDI, University of Minnesota
+    Jonas Helgertz, ISRDI, University of Minnesota
+    Colin Davis, ISRDI, University of Minnesota
+    Jimm Domingo, ISRDI, University of Minnesota
+    Riley Harper, ISRDI, University of Minnesota
+
+This project is licensed under the Mozilla Public License, version 2.0 (the
+"License"). A copy of the License is in the project file "LICENSE.txt",
+and is also available at https://www.mozilla.org/en-US/MPL/2.0/.
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..217dfcc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+# hlink: historical record linkage
+
+A working paper on the creation and applications of this program can be found at . A publication on the same topic is forthcoming.
+
+## Docs
+
+The documentation site can be found [here](https://pages.github.umn.edu/mpc/hlink).
+This includes information about installation and setting up your configuration files.
+
+An example script and config file can be found in the `examples` directory.
+
+## Overview
+
+Hlink is designed to link two datasets. It allows for probabilistic and deterministic record linkage. It provides functionality for the following tasks:
+
+1. Preprocessing: preprocess each dataset to clean and/or transform it in preparation for linking.
+2. Training: train ML models on a set of features and compare results between models.
+3. Matching: match two datasets using a model created in training or with deterministic rules.
+4. Household linking: using the results from an individual linking process, compare household members of linked records to generate additional links.
+5. Reporting: generate summarized information on linked datasets.
+6. Model Exploration: compare various models and hyperparameter matrices to choose production model specs.
+
diff --git a/deploy/global_conf.json b/deploy/global_conf.json
new file mode 100644
index 0000000..8e50e74
--- /dev/null
+++ b/deploy/global_conf.json
@@ -0,0 +1,5 @@
+{
+    "users_dir": "/pkg/ipums/programming/linking/hlink/users",
+    "users_dir_fast": "/mnt/nas-af/linking/hlink/users",
+    "python": "/pkg/ipums/programming/linking/hlink/deploys/XXX_BRANCH/venv/bin/python"
+}
diff --git a/deploy/hlink b/deploy/hlink
new file mode 100755
index 0000000..eeda408
--- /dev/null
+++ b/deploy/hlink
@@ -0,0 +1,4 @@
+#! /bin/bash
+
+/pkg/ipumsi-programming/perl_ipums_git/src/_log_from_wrapper /pkg/ipums/programming/linking/hlink/deploys/XXX_BRANCH/venv/bin/hlink "$@"
+SPARK_HOME=/pkg/ipums/programming/linking/hlink/spark/spark-2.3.1-bin-hadoop2.7 HLINK_CONF=/pkg/ipums/programming/linking/hlink/deploys/XXX_BRANCH/global_conf.json /pkg/ipums/programming/linking/hlink/deploys/XXX_BRANCH/venv/bin/hlink "$@"
diff --git a/doc/developer.md b/doc/developer.md
new file mode 100644
index 0000000..fbedf47
--- /dev/null
+++ b/doc/developer.md
@@ -0,0 +1,41 @@
+
+## Program Structure
+
+There are 4 modules in the program. See the documentation in each specific class for more information.
+
+1) `scripts` -- This contains the code for the CLI (command line interface). It contains the entrypoint into the program as well as all of the commands the user can run.
+2) `configs` -- This contains the code for reading and parsing the program configurations.
+3) `spark` -- This contains the code for the Spark initialization and connection.
+4) `linking` -- This contains the code for all of the linking tasks. There is a separate README.md file in this module that describes it further.
+
+In addition to these 4 modules, the `setup.py` file at the top level of the repo contains the configuration for packaging up the program with pip.
+
+## Developing Code
+To set up a copy of this project for development:
+
+1. Clone the repository.
+2. Run `pip install -e .[dev]` in the root project directory. This should install all dependencies.
+
+## Running Tests
+
+To run the project's test suite, run `pytest hlink/tests` in the root project directory.
+
+## Building the Scala Jar
+
+To build the Scala jar, do
+
+```
+cd scala_jar
+sbt assembly
+```
+
+Then move the Scala jar over to the hlink directory with `mv target/scala-2.11/*.jar ../hlink/spark/jars`.
+
+## Creating Sphinx Docs
+
+To write out the Sphinx docs to the `docs` folder for the GitHub Pages site, run
+
+```
+cd sphinx-docs
+make github
+```
diff --git a/docs/.buildinfo b/docs/.buildinfo
new file mode 100644
index 0000000..86e0496
--- /dev/null
+++ b/docs/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 3a4451c59e16408b6b28773f8fac2c58 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/.doctrees/column_mapping_transforms.doctree b/docs/.doctrees/column_mapping_transforms.doctree new file mode 100644 index 0000000..45a71aa Binary files /dev/null and b/docs/.doctrees/column_mapping_transforms.doctree differ diff --git a/docs/.doctrees/comparison_types.doctree b/docs/.doctrees/comparison_types.doctree new file mode 100644 index 0000000..e19c9b0 Binary files /dev/null and b/docs/.doctrees/comparison_types.doctree differ diff --git a/docs/.doctrees/config.doctree b/docs/.doctrees/config.doctree new file mode 100644 index 0000000..ff816c4 Binary files /dev/null and b/docs/.doctrees/config.doctree differ diff --git a/docs/.doctrees/environment.pickle b/docs/.doctrees/environment.pickle new file mode 100644 index 0000000..79d9d34 Binary files /dev/null and b/docs/.doctrees/environment.pickle differ diff --git a/docs/.doctrees/example_workflow.doctree b/docs/.doctrees/example_workflow.doctree new file mode 100644 index 0000000..bec571f Binary files /dev/null and b/docs/.doctrees/example_workflow.doctree differ diff --git a/docs/.doctrees/feature_selection_transforms.doctree b/docs/.doctrees/feature_selection_transforms.doctree new file mode 100644 index 0000000..d2e7348 Binary files /dev/null and b/docs/.doctrees/feature_selection_transforms.doctree differ diff --git a/docs/.doctrees/index.doctree b/docs/.doctrees/index.doctree new file mode 100644 index 0000000..bc355fa Binary files /dev/null and b/docs/.doctrees/index.doctree differ diff --git a/docs/.doctrees/installation.doctree b/docs/.doctrees/installation.doctree new file mode 100644 index 0000000..a084932 Binary files /dev/null and b/docs/.doctrees/installation.doctree differ diff --git a/docs/.doctrees/introduction.doctree b/docs/.doctrees/introduction.doctree new file mode 100644 index 0000000..0a39253 Binary files /dev/null and b/docs/.doctrees/introduction.doctree differ diff --git a/docs/.doctrees/link_tasks.doctree b/docs/.doctrees/link_tasks.doctree new file mode 100644 index 0000000..93000e4 Binary files /dev/null and b/docs/.doctrees/link_tasks.doctree differ diff --git a/docs/.doctrees/models.doctree b/docs/.doctrees/models.doctree new file mode 100644 index 0000000..4b0f173 Binary files /dev/null and b/docs/.doctrees/models.doctree differ diff --git a/docs/.doctrees/pipeline_features.doctree b/docs/.doctrees/pipeline_features.doctree new file mode 100644 index 0000000..9e7c426 Binary files /dev/null and b/docs/.doctrees/pipeline_features.doctree differ diff --git a/docs/.doctrees/running_the_program.doctree b/docs/.doctrees/running_the_program.doctree new file mode 100644 index 0000000..da02bf0 Binary files /dev/null and b/docs/.doctrees/running_the_program.doctree differ diff --git a/docs/.doctrees/substitutions.doctree b/docs/.doctrees/substitutions.doctree new file mode 100644 index 0000000..2b232e2 Binary files /dev/null and b/docs/.doctrees/substitutions.doctree differ diff --git a/docs/.doctrees/use_examples.doctree b/docs/.doctrees/use_examples.doctree new file mode 100644 index 0000000..5874769 Binary files /dev/null and b/docs/.doctrees/use_examples.doctree differ diff --git a/docs/.nojekyll b/docs/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/docs/_sources/column_mapping_transforms.md.txt b/docs/_sources/column_mapping_transforms.md.txt new file mode 100644 index 0000000..bea728c --- /dev/null +++ b/docs/_sources/column_mapping_transforms.md.txt @@ -0,0 +1,274 @@ +# Column mapping 
transforms
+
+Each header below represents a column mapping transform type. Transforms are used in the context of `column_mappings`.
+
+Some transforms refer to "a" or "b". These mean the transforms apply to columns from only one of the two datasets to be linked (we're trying to link people in dataset "a" with people in dataset "b").
+
+More than one transform can be applied to a column. Transforms apply in the order they're listed, so the output of one transform may be the input of another.
+
+Each transform applies to the column specified by the `column_name` attribute in the config under the `[[column_mappings]]` section. The `transforms` attribute
+indicates the type of the transform, which is one of the ones listed below. Along with `type`, there can be additional attributes used by the transform.
+These may vary by type, and additional information is given for each type of transform below. Often an additional attribute is just named `value` or `values`.
+
+```
+[[column_mappings]]
+alias = "namefrst_split"
+column_name = "namefrst_clean"
+transforms = [ { type = "split" } ]
+```
+
+## add_to_a
+
+Add a value to a column from dataset "a".
+
+```
+transforms = [ { type = "add_to_a", value = 11 } ]
+```
+
+## concat_to_a
+
+Concatenate the string value to the end of a column in dataset "a".
+
+```
+transforms = [ { type = "concat_to_a", value = " " } ]
+```
+
+## concat_to_b
+
+Concatenate the string value to the end of a column in dataset "b".
+
+```
+transforms = [ { type = "concat_to_b", value = " " } ]
+```
+
+## lowercase_strip
+
+Used in name cleaning.
+
+Convert alphabetical characters to lower-case and strip white space characters from the start and end of the strings in the column.
+
+```
+transforms = [ { type = "lowercase_strip" } ]
+```
+
+## rationalize_name_words
+
+Used in name cleaning.
+
+Replace '?', '\*', and '-' with spaces. Since people's names in raw census data can contain these
+characters, replacing these characters can lead to better matching.
+
+```
+transforms = [ { type = "rationalize_name_words" } ]
+```
+
+## remove_qmark_hyphen
+
+Used in name cleaning.
+
+Remove the sequence '?-' from words, replacing it with nothing.
+
+```
+transforms = [ { type = "remove_qmark_hyphen" } ]
+```
+
+## remove_punctuation
+
+Remove most punctuation, replacing it with nothing.
+
+Removes:
+```
+? - \ / " ' : , . [ ] { }
+```
+
+```
+transforms = [ { type = "remove_punctuation" } ]
+```
+
+## replace_apostrophe
+
+Used in name cleaning.
+
+Replace each apostrophe "'" with a space.
+
+```
+transforms = [ { type = "replace_apostrophe" } ]
+```
+
+## remove_alternate_names
+
+Used in name cleaning.
+
+Remove any names following the string 'or'.
+
+```
+transforms = [ { type = "remove_alternate_names" } ]
+```
+
+## remove_suffixes
+
+Used in name cleaning.
+
+Given a list of suffixes, remove them from the names in the column.
+
+```
+transforms = [ { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii"] } ]
+```
+
+## remove_stop_words
+
+Used in name cleaning.
+
+Given a list of stop words (such as street types and directions), remove them from the values in the column.
+
+```
+transforms = [
+{ type = "remove_stop_words", values = ['alley','ally','aly','anex','annex','av','ave','aven','avenu','avenue','avn','avnue','avanue','avaneu','bg','blvd','boul','boulevard','brg','bridge','burg','camp','circle','cor', 'corner', 'corners','cors', 'court', 'courts', 'cp', 'cres', 'crescent', 'ct', 'cts', 'dr','driv', 'drive', 'est', 'estate', 'express', 'expressway', 'ext', 'extension', 'ferry', 'fort', 'frt', 'fry', 'ft', 'heights', 'ht', 'hts', 'is', 'island', 'key', 'ky', 'ldg', 'lodge', 'mill', 'mills', 'ml', 'mls', 'mount', 'mountain', 'mountin', 'mt', 'mtn', 'park', 'parkway','pike', 'pikes','pkwy', 'pl', 'place', 'point', 'points', 'pr', 'prairie', 'prk', 'pt', 'pts', 'rad', 'radial', 'rd', 'rds', 'rest', 'riv', 'river', 'road', 'roads', 'rst', 'spgs', 'springs', 'sq', 'square', 'st', 'sta', 'station', 'str', 'street', 'streets', 'strt', 'sts', 'ter', 'terrace', 'track', 'tracks', 'trail', 'trails', 'trnpk', 'turnpike', 'un', 'union', 'valley', 'vally', 'via', 'viaduct', 'vill', 'villag', 'village', 'villiage', 'well', 'wl', 'and','of','.',',','-','/','&','south','north','east','west','s','n','e','w','block'] }
+]
+```
+
+## remove_prefixes
+
+Used in name cleaning.
+
+Remove prefixes like "Ms.", "Mr.", or "Mrs." from names.
+
+In some census data, "ah" is such a prefix in Chinese names.
+
+```
+transforms = [ { type = "remove_prefixes", values = ["ah"] } ]
+```
+
+## condense_strip_whitespace
+
+Used in name cleaning.
+
+Take white space that may be more than one character or contain non-space characters and replace it with a single space.
+
+```
+transforms = [ { type = "condense_strip_whitespace" } ]
+```
+
+## remove_one_letter_names
+
+Used in name cleaning.
+
+If a name is a single character, remove it and leave the white space behind.
+
+```
+transforms = [ { type = "remove_one_letter_names" } ]
+```
+
+## split
+
+Split the column value on space characters (" ").
+
+```
+[[column_mappings]]
+alias = "namefrst_split"
+column_name = "namefrst_clean"
+transforms = [ { type = "split" } ]
+```
+
+## array_index
+
+If the column contains an array, select the element at the given position.
+
+This can be used as the input to another transform. In the example below, the first transform selects the second (index 1) item from the "namefrst_split" column that contains a set of names split on white space. Then the substring 0,1 is selected, which gives the first initial of the person's probable middle name.
+
+```
+alias = "namefrst_mid_init"
+column_name = "namefrst_split"
+transforms = [
+    { type = "array_index", value = 1 },
+    { type = "substring", values = [0, 1] }
+]
+```
+
+## mapping
+
+Map single or multiple values to a single output value, otherwise known as a "recoding."
+
+```
+[[column_mappings]]
+column_name = "birthyr"
+alias = "clean_birthyr"
+transforms = [
+{ type = "mapping",
+  values = [
+    { "from" = [9999, 1999], "to" = "" },
+    { "from" = -9998, "to" = 9999 }
+  ]
+}
+]
+```
+
+## substring
+
+Replace a column with a substring of the data in the column.
+
+```
+transforms = [
+    { type = "substring", values = [0, 1] }
+]
+```
+
+## divide_by_int
+
+Divide data in a column by an integer value. It may leave a non-integer result.
+
+For instance, this transform takes the birthplace variable and converts it from the detailed version to the general version. The two least significant digits are detailed birthplace information; to make the more general version, we simply drop them by dividing by 100 and rounding down to the nearest whole number (floor function).
+
+```
+[[column_mappings]]
+column_name = "bpl"
+alias = "bpl_root"
+transforms = [
+    { type = "divide_by_int", value = 100 },
+    { type = "get_floor" }
+]
+```
+
+## when_value
+
+Apply conditional logic to the replacement of values in a column. Works like the SQL if() or case() expressions in a SQL "select" clause.
+
+When the value of the column equals "value", replace it with "if_value"; otherwise replace it with "else_value".
+
+This example replaces all "race" IPUMS codes with 0 (white) or 1 (non-white). An IPUMS code of 100 is the "white" race category.
+
+```
+column_name = "race"
+transforms = [
+    { type = "when_value", value = 100, if_value = 0, else_value = 1 }
+]
+```
+
+## get_floor
+
+Round down to the nearest whole number.
+
+This example produces the general version of the IPUMS "relate" variable. The variable is coded such that detailed categories are between the hundreds (300 is child of household head, 301 is simply 'child', 302 is adopted child, 303 is step-child, for instance). The general categories are usually all that's needed (1 == household head, 2 == spouse, 3 == child, 4 == child-in-law, 5 == parent, 6 == parent-in-law, 7 == sibling, 12 == not related to head).
+
+```
+[[column_mappings]]
+alias = "relate_div_100"
+column_name = "relate"
+transforms = [
+    { type = "divide_by_int", value = 100 },
+    { type = "get_floor" }
+]
+```
diff --git a/docs/_sources/comparison_types.md.txt b/docs/_sources/comparison_types.md.txt
new file mode 100644
index 0000000..06a93b8
--- /dev/null
+++ b/docs/_sources/comparison_types.md.txt
@@ -0,0 +1,836 @@
+# Comparison types, transform add-ons, aggregate features, and household aggregate features
+
+This page has information on the different comparison types available for the `[[comparison_features]]`
+section, along with some attributes available to all of the comparison types and some aggregate features
+that are not configurable.
+
+## Comparison types
+Each header below represents a comparison type. These features are used in the context of `comparison_features`.
+
+```
+[[comparison_features]]
+alias = "relatematch"
+column_name = "relate_div_100"
+comparison_type = "equals"
+categorical = true
+```
+
+### maximum_jaro_winkler
+Finds the greatest Jaro-Winkler value among the cartesian product of multiple columns. For example, given an input of `column_names = ['namefrst', 'namelast']`, it would return the maximum Jaro-Winkler name comparison value among the following four comparisons:
+```
+[('namefrst_a', 'namefrst_b'),
+ ('namefrst_a', 'namelast_b'),
+ ('namelast_a', 'namefrst_b'),
+ ('namelast_a', 'namelast_b')]
+```
+* Attributes:
+  * `column_names` -- Type: list of strings. Required. The list of columns used as input for the set of comparisons generated by taking the cartesian product.
+
+```
+[[comparison_features]]
+alias = "maximum_jw"
+column_names = ["namelast", "namefrst"]
+comparison_type = "maximum_jaro_winkler"
+```
+
+### jaro_winkler
+
+Returns the Jaro-Winkler comparison score for a given column.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The column to compare using the Jaro-Winkler score.
+```
+[[comparison_features]]
+alias = "namefrst_jw"
+column_name = "namefrst"
+comparison_type = "jaro_winkler"
+```
+
+### jaro_winkler_street
+Uses an additional geographic column value to filter for major location changes before comparing street names. If boundary column A is not equal to boundary column B, a Jaro-Winkler score of zero is returned.
+If boundary columns A and B are equal, the Jaro-Winkler comparison score of the street columns is returned.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The input street column.
+  * `boundary` -- Type: `string`. Required. An input column to match on before comparing street name values.
+```
+[[comparison_features]]
+alias = "jw_street"
+column_name = "street"
+boundary = "enum_dist"
+comparison_type = "jaro_winkler_street"
+```
+
+### max_jaro_winkler
+
+Returns the greatest Jaro-Winkler value from the comparisons of a list of names.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input column containing a list of names to compare (such as related household members, or neighborhood surnames).
+```
+[[comparison_features]]
+alias = "related_individual_max_jw"
+column_name = "namefrst_related"
+comparison_type = "max_jaro_winkler"
+```
+
+### equals
+
+Asserts that values are the same for both compared columns using SQL: `a.{column_name} IS NOT DISTINCT FROM b.{column_name}`
+
+```
+[[comparison_features]]
+alias = "relatematch"
+column_name = "relate_div_100"
+comparison_type = "equals"
+categorical = true
+```
+
+### f1_match
+Evaluates whether the first initial of first name A matches the first initial of first name B or either the first or second middle initial of B. If so, returns 1. Otherwise, returns 2.
+
+1 = First initial of A's first first name matches the first initial of any of potential match B's first names
+
+2 = mismatch
+
+Uses the following SQL query:
+```
+"CASE WHEN (
+  (a.{fi} IS NOT DISTINCT FROM b.{fi}) OR
+  (a.{fi} IS NOT DISTINCT FROM b.{mi0}) OR
+  (a.{fi} IS NOT DISTINCT FROM b.{mi1})
+) THEN 1 ELSE 2 END"
+```
+
+```
+[[comparison_features]]
+alias = "f1_match"
+first_init_col = "namefrst_init"
+mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"]
+comparison_type = "f1_match"
+categorical = true
+```
+
+### f2_match
+Evaluates whether first middle initial A is empty/null. If so, returns 0.
+Otherwise, if either the first or second middle initial A is not null and matches first name initial B, or the first or second middle initial B, returns 1.
+Otherwise, returns 2.
+
+1 = First initial of A's second first name matches the first initial of any of potential match B's first names
+
+2 = mismatch
+
+0 = no second first name A
+
+Uses the following SQL:
+```
+CASE WHEN ((a.{mi0} == '') OR (a.{mi0} IS NULL)) THEN 0 WHEN (
+  (a.{mi0} IS NOT DISTINCT FROM b.{fi}) OR
+  ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{fi})) OR
+  (a.{mi0} IS NOT DISTINCT FROM b.{mi0}) OR
+  (a.{mi0} IS NOT DISTINCT FROM b.{mi1}) OR
+  ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi0})) OR
+  ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi1}))
+) THEN 1 ELSE 2 END
+```
+* Attributes:
+  * `first_init_col` -- Type: `string`. Required. First name initial input column.
+  * `mid_init_cols` -- Type: list of strings. Required. List of first and second middle initial input columns.
+```
+[[comparison_features]]
+alias = "f2_match"
+first_init_col = "namefrst_init"
+mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"]
+comparison_type = "f2_match"
+categorical = true
+```
+
+### not_equals
+Asserts that values are distinct between compared individuals using SQL: `a.{column_name} IS DISTINCT FROM b.{column_name}`. Used mainly in caution flag features (f_caution, m_caution, sp_caution). In the example below, `not_equals` appears as the `comp_a` sub-comparison of a `caution_comp_4` feature.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input column to compare.
+
+```
+[[comparison_features]]
+alias = "m_caution"
+column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "mbpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "mother_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "stepmom"
+comparison_type = "parent_step_change"
+[comparison_features.comp_d]
+column_name = "momloc"
+comparison_type = "present_both_years"
+```
+
+### equals_as_int
+Checks for equality using the equals sign and returns the boolean result in integer form. Uses SQL: `CAST(a.{col} = b.{col} as INT)`
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input column to compare.
+
+```
+[[comparison_features]]
+alias = "namelast_equal_as_int"
+column_name = "namelast_clean"
+comparison_type = "equals_as_int"
+```
+
+### all_equals
+Asserts that the values in all given columns match. Uses a SQL expression generated by joining `a.{col} = b.{col}` clauses with `AND` for each given column.
+* Attributes:
+  * `column_names` -- Type: list of strings. Required. List of the columns to evaluate for equality across the records being compared.
+```
+[[comparison_features]]
+alias = "exact"
+column_names = ["namefrst_unstd", "namelast_clean"]
+comparison_type = "all_equals"
+```
+
+### or
+Allows for the combination of up to four comparison features into one feature using a SQL `OR` between the generated clause for each sub-comparison.
+* Attributes:
+  * `column_names` -- Type: list of strings. Required. A list of all input columns used by the sub-comparisons.
+  * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section.
+  * `comp_c`, `comp_d` -- Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section.
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr"]
+comparison_type = "or"
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+lower_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+```
+
+### and
+
+Allows for the combination of up to four comparison features into one feature using a SQL `AND` between the generated clause for each sub-comparison.
+* Attributes:
+  * `column_names` -- Type: list of strings. Required. A list of all input columns used by the sub-comparisons.
+  * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section.
+  * `comp_c`, `comp_d` -- Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section.
+
+In this example, the `and` comparison appears in `[comparison_features.comp_b]`.
+ +``` +[[comparison_features]] +alias = "street_jw" +comparison_type = "times" +column_names = ["street","county", "statefip"] +[comparison_features.comp_a] +column_name = "street" +comparison_type = "jaro_winkler" +lower_threshold = 0.9 +[comparison_features.comp_b] +comparison_type = "and" +column_names = ["county", "statefip"] +[comparison_features.comp_b.comp_a] +column_name = "county" +comparison_type = "equals" +[comparison_features.comp_b.comp_b] +column_name = "statefip" +comparison_type = "equals" +``` + +### times +Takes the output of two sub-comparisons and multiplies them together after casting as floats. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a` and `comp_b` can also have sub-comparisons, as in the given example. +``` +[[comparison_features]] +alias = "street_jw" +comparison_type = "times" +column_names = ["street","county", "statefip"] +[comparison_features.comp_a] +column_name = "street" +comparison_type = "jaro_winkler" +lower_threshold = 0.9 +[comparison_features.comp_b] +comparison_type = "and" +column_names = ["county", "statefip"] +[comparison_features.comp_b.comp_a] +column_name = "county" +comparison_type = "equals" +[comparison_features.comp_b.comp_b] +column_name = "statefip" +comparison_type = "equals" +``` + +### caution_comp_3 +Generates an SQL expression in the form `(({expr_a} OR {expr_b}) AND {expr_c})`. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b`, `comp_c` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a`, `comp_b`, and `comp_c` can also have sub-comparisons. +``` +[[comparison_features]] +alias = "sp_caution" +column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"] +comparison_type = "caution_comp_3" +categorical = true +[comparison_features.comp_a] +column_name = "spouse_bpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "spouse_birthyr" +comparison_type = "abs_diff" +gt_threshold = 5 +[comparison_features.comp_c] +column_name = "durmarr" +comparison_type = "new_marr" +upper_threshold = 7 +``` + +### caution_comp_4 +Generates an SQL expression in the form `(({expr_a} OR {expr_b} OR {expr_c}) AND {expr_d})`. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b`, `comp_c`, `comp_d` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a`, `comp_b`, `comp_c`, and `comp_d` can also have sub-comparisons. + +``` +[[comparison_features]] +alias = "m_caution" +column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"] +comparison_type = "caution_comp_4" +categorical = true +[comparison_features.comp_a] +column_name = "mbpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "mother_birthyr" +comparison_type = "abs_diff" +gt_threshold = 5 +[comparison_features.comp_c] +column_name = "stepmom" +comparison_type = "parent_step_change" +[comparison_features.comp_d] +column_name = "momloc" +comparison_type = "present_both_years" +``` + +### any_equals +Used to compare middle initials and first names under specific circumstances. 
+Asserts that middle initial A is not empty/null and is the same as either middle initial B or first name B,
+or that first name A is not empty/null and is the same as middle initial B.
+* Attributes:
+  * `column_names` -- Type: list of strings. Required. The first input column should be the middle initial column, and the second input column should be the first name column.
+```
+[[comparison_features]]
+alias = "mid_init_match"
+column_names = ["namefrst_mid_init", "namefrst_unstd"]
+comparison_type = "any_equals"
+```
+
+### either_are_1
+Checks if the column value for either A or B is equal to 1.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input column to compare to 1.
+
+```
+[[comparison_features]]
+alias = "either_1"
+column_name = "nativity"
+comparison_type = "either_are_1"
+categorical = true
+```
+
+### either_are_0
+Checks if the column value for either A or B is equal to 0.
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input column to compare to 0.
+
+```
+[[comparison_features]]
+alias = "either_0"
+column_name = "nativity"
+comparison_type = "either_are_0"
+categorical = true
+```
+
+### second_gen_imm
+Checks if individual A is a second-generation immigrant by looking for a `nativity` value of 2, 3, or 4 (one or both parents foreign-born).
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Input should be the name of the nativity column.
+```
+[[comparison_features]]
+alias = "sgen"
+column_name = "nativity"
+comparison_type = "second_gen_imm"
+categorical = true
+```
+
+### rel_jaro_winkler
+Uses a Scala function to determine the number of people in the input column with a name similarity score (Jaro-Winkler) greater than or equal to the given `jw_threshold`, an age difference less than or equal to the given `age_threshold`, and matching sex for the sample A individual and the sample B potential match. Takes a column generated with the feature selection transform `related_individual_rows` as input (list of person data objects to compare). Can be used for related or unrelated individuals, depending on the input column specified.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects.
+  * `name_col` -- Type: `string`. The name of the column containing the first name for comparison.
+  * `birthyr_col` -- Type: `string`. The name of the column containing the birth year.
+  * `jw_threshold` -- Type: `float`. The minimum acceptable Jaro-Winkler score to consider a name match.
+  * `age_threshold` -- Type: `int`. The maximum acceptable age difference to consider a match.
+
+```
+[[comparison_features]]
+alias = "rel"
+column_name = "namefrst_related_rows"
+name_col = "namefrst_unstd"
+birthyr_col = "replaced_birthyr"
+comparison_type = "rel_jaro_winkler"
+jw_threshold = 0.9
+age_threshold = 5
+```
+
+### extra_children
+Using a Scala function, checks to see if there are children present in sample B who are not present in sample A, but who, based on relate codes, age, sex, and name, we would have expected to be present in A. Returns a count of suspected "extra" children. Takes a column generated with the feature selection transform `related_individual_rows` as input (list of person data objects to compare).
+* Attributes:
+  * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects.
+  * `relate_col` -- Type: `string`. The name of the column with the `relate` code.
+  * `histid_col` -- Type: `string`. The name of the id column.
+  * `name_col` -- Type: `string`. The name of the column containing the first name for comparison.
+  * `birthyr_col` -- Type: `string`. The name of the column containing the birth year.
+  * `year_b` -- Type: `int`. The year that sample B was taken.
  * `jw_threshold` -- Type: `float`. The minimum acceptable Jaro-Winkler score to consider a match.
+  * `age_threshold` -- Type: `int`. The maximum acceptable age difference to consider a match.
+
+```
+[[comparison_features]]
+alias = "extra_children"
+column_name = "namefrst_related_rows"
+relate_col = "relate"
+histid_col = "histid"
+name_col = "namefrst_unstd"
+birthyr_col = "replaced_birthyr"
+year_b = 1910
+comparison_type = "extra_children"
+jw_threshold = 0.8
+age_threshold = 2
+```
+
+### jaro_winkler_rate
+Uses a Scala function to calculate the percentage of individuals who have a Jaro-Winkler score greater than or equal to the given threshold. The rate is returned as a percentage, as a float data type.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects. The input column seen below ("namelast_neighbors") was generated using a "neighbor_aggregate" feature selection.
+  * `jw_threshold` -- Type: `float`. The minimum Jaro-Winkler threshold to consider an acceptable match.
+
+In the following example, a `lower_threshold` feature add-on is used to convert the returned rate to a boolean asserting whether it meets the given minimum threshold (>= 5% of neighbors have a Jaro-Winkler score >= 0.95).
+```
+[[comparison_features]]
+alias = "nbors"
+comparison_type = "times"
+column_names = ["namelast_neighbors", "county", "statefip"]
+[comparison_features.comp_a]
+column_name = "namelast_neighbors"
+comparison_type = "jaro_winkler_rate"
+jw_threshold = 0.95
+lower_threshold = 0.05
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+```
+
+### sum
+Adds the column values for A and B together (takes the sum).
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to be added.
+
+```
+[[comparison_features]]
+alias = "namelast_popularity_sum"
+column_name = "namelast_popularity"
+comparison_type = "sum"
+```
+
+### length_b
+Returns the length of the column value in record B using the SQL `size()` function. (See the example sketch at the end of this page.)
+* Attributes:
+  * `column_name` -- Type: `string`. The name of the input column to take the length of in dataset B.
+
+### abs_diff
+Takes the absolute value of the difference between the values of the given column in datasets A and B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression returns the value -1 instead of an absolute difference. For example, if you are evaluating the difference in marriage duration values, and "99" is a placeholder value for "unknown" in the data, you can exclude those values from consideration using this attribute.
+
+```
+[[comparison_features]]
+alias = "byrdiff"
+column_name = "replaced_birthyr"
+comparison_type = "abs_diff"
+
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "abs_diff"
+btwn_threshold = [9, 14]
+categorical = true
+```
+
+### b_minus_a
+Returns the value of subtracting the value of column A from the value of column B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL.
+
+### abs_diff
+Takes the absolute value of the difference between the values of the given column in datasets A and B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression returns the value -1 instead of an absolute difference. For example, if you are evaluating the difference in marriage duration values, and "99" is a placeholder value for "unknown" in the data, you can exclude those values from consideration using this attribute.
+
+```
+[[comparison_features]]
+alias = "byrdiff"
+column_name = "replaced_birthyr"
+comparison_type = "abs_diff"
+
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "abs_diff"
+btwn_threshold = [9, 14]
+categorical = true
+```
+
+### b_minus_a
+Returns the value of subtracting the value of column A from the value of column B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression returns the value -1 instead of the difference. For example, if you are evaluating the difference in marriage duration values, and "99" is a placeholder value for "unknown" in the data, you can exclude those values from consideration using this attribute.
+```
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "b_minus_a"
+btwn_threshold = [5,14]
+categorical = true
+```
+
+### geo_distance
+Uses a lookup table to find the geographic distance between locations. The SQL expression is generated by `hlink/linking/core/dist_table.py`. There are several ways to configure this feature. You can look up distances in the given file using one or two keys (specified with the `key_count` attribute). You can also optionally have a secondary look-up table that serves as a back-up value in the case that the primary look-up does not contain a value for the locations given. This is particularly useful for county distance: you can set the primary join to be across counties, but set up a secondary join on state, which has far fewer combinations and thus less risk of nulls, to fill in when the counties specified aren't in the look-up.
+
+* Attributes:
+  * `key_count` -- Type: `int`. The number of keys used to join on the primary (or only) look-up table. Acceptable values are 1 or 2. Ex: for state and county, key_count = 2. For just state, key_count = 1 even though there are both county_a and county_b values.
+  * `distances_file` -- Type: `string` of path. Path to the distances look-up file.
+  * `table_name` -- Type: `string`. What to name the table that will be generated from the distances file. If you are doing multiple look-ups and the `table_name` is the same across all feature specifications, the file will only be read in once.
+
+  * Attributes for `key_count = 1`:
+    * `column_name` -- Type: `string`. The column in the input data that you want to use as a key to look up the geographic distance.
+    * `loc_a` -- Type: `string`. First column to join on in the look-up table (where to find the value coming from the `column_name` column A).
+    * `loc_b` -- Type: `string`. Second column to join on in the look-up table (where to find the value coming from the `column_name` column B).
+    * `distance_col` -- Type: `string`. Name of the column containing the geographic distance in the look-up table.
+
+  * Attributes for `key_count = 2`:
+    * `column_names` -- Type: list of strings. The two columns you want to use as keys to look up the geographic distance.
+    * `source_column_a` -- Type: `string`. First column to join on in the source data.
+    * `source_column_b` -- Type: `string`. Second column to join on in the source data.
+    * `loc_a_0` -- Type: `string`. Column to join on in the look-up table for the record A value of the first source column.
+    * `loc_a_1` -- Type: `string`. Column to join on in the look-up table for the record B value of the first source column.
+    * `loc_b_0` -- Type: `string`. Column to join on in the look-up table for the record A value of the second source column.
+    * `loc_b_1` -- Type: `string`. Column to join on in the look-up table for the record B value of the second source column.
+    * `distance_col` -- Type: `string`. Name of the column containing the geographic distance in the look-up table.
+
+  * Attributes if using a secondary join:
+    * `secondary_key_count` -- Type: `int`. The number of keys used to join on the secondary (backup) look-up table. Acceptable values are 1 or 2.
+    * `secondary_table_name` -- Type: `string`. What to name the table that will be generated from the `secondary_distances_file`. If you are doing multiple look-ups and the table name is the same across all feature specifications, the file will only be read in once.
+    * `secondary_distances_file` -- Type: `string` of path. Path to the secondary distances look-up file.
+    * `secondary_source_column` -- Type: `string`. The column in the input data that you want to use as a key in the secondary geographic distance look-up.
+    * `secondary_loc_a` -- Type: `string`. First column to join on in the secondary look-up table.
+    * `secondary_loc_b` -- Type: `string`. Second column to join on in the secondary look-up table.
+    * `secondary_distance_col` -- Type: `string`. Name of the column containing the geographic distance in the secondary look-up table.
+
+```
+[[comparison_features]]
+alias = "state_distance"
+comparison_type = "geo_distance"
+key_count = 1
+table_name = "state_distance_lookup"
+distances_file = "/path/to/county_state_distance.csv"
+column_name = "bpl"
+loc_a = "statecode1"
+loc_b = "statecode2"
+distance_col = "dist"
+
+
+[[comparison_features]]
+alias = "county_distance"
+comparison_type = "geo_distance"
+column_names = ["county", "statefip"]
+key_count = 2
+table_name = "county_distance_lookup"
+distances_file = "/path/to/county_1900_1910_distances_km.csv"
+# columns to join on in the data
+source_column_a = "county"
+source_column_b = "statefip"
+
+# column names from the csv lookup file
+loc_a_0 = "from_icpsrctyi"
+loc_a_1 = "to_icpsrctyi"
+loc_b_0 = "from_statefip"
+loc_b_1 = "to_statefip"
+distance_col = "distance_km"
+
+# SECONDARY JOIN
+secondary_key_count = 1
+secondary_table_name = "state_distance_lookup"
+secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
+secondary_source_column = "statefip"
+secondary_loc_a = "from_statefip"
+secondary_loc_b = "to_statefip"
+secondary_distance_col = "distance_km"
+```
+
+### fetch_a
+
+Gets the value of column A.
+
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The column to get the value from.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "fetch_a"
+categorical = true
+```
+
+
+### fetch_b
+
+Gets the value of column B.
+
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The column to get the value from.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "fetch_b"
+categorical = true
+```
+
+### present_both_years
+
+Checks whether both column A and column B are present.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
+
+In the following example, `present_both_years` is used as one component (`comp_d`) of a `caution_comp_4` comparison:
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### neither_are_null
+
+Checks that neither column A nor column B is null.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
+
+
+### present_and_not_equal
+
+Checks that column A and column B are both present but are not equal.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
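+
+Neither of these last two comparison types requires attributes beyond `column_name`, so a minimal sketch looks like the following (the alias and input column here are illustrative, not from a tested configuration); `neither_are_null` is configured the same way with its own `comparison_type`:
+```
+[[comparison_features]]
+# illustrative names -- not from a tested configuration
+alias = "street_present_and_changed"
+column_name = "street"
+comparison_type = "present_and_not_equal"
+categorical = true
+```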
+
+## Feature add-ons
+These attributes can be added to most comparison feature types above to extend the type of output returned beyond the standard comparison feature.
+
+### alias
+* Attributes:
+  * `alias` -- Type: `string`. The name for the output column. Should be specified at the top level of every comparison feature.
+```
+[[comparison_features]]
+alias = "jw_f"
+column_name = "father_namefrst"
+comparison_type = "jaro_winkler"
+```
+
+### power
+Raises a comparison feature to a given exponential power.
+* Attributes:
+  * `power` -- Type: `int`. The power to raise the comparison output to. For example, `power = 2` will square the output.
+```
+[[comparison_features]]
+alias = "county_distance_squared"
+comparison_type = "geo_distance"
+column_names = ["county", "statefip"]
+# PRIMARY JOIN
+# key count: the number of keys used for the join per source file. Ex: for state and county, key_count = 2. For just state, key_count = 1 even though there are county_a and county_b
+key_count = 2
+table_name = "county_distance_lookup"
+distances_file = "/path/to/county_1900_1910_distances_km.csv"
+# columns to join on in the data
+source_column_a = "county"
+source_column_b = "statefip"
+# column names from the csv lookup file
+loc_a_0 = "from_icpsrctyi"
+loc_a_1 = "to_icpsrctyi"
+loc_b_0 = "from_statefip"
+loc_b_1 = "to_statefip"
+distance_col = "distance_km"
+# SECONDARY JOIN
+secondary_key_count = 1
+secondary_table_name = "state_distance_lookup"
+secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
+secondary_source_column = "statefip"
+secondary_loc_a = "from_statefip"
+secondary_loc_b = "to_statefip"
+secondary_distance_col = "distance_km"
+power = 2
+```
+
+### threshold
+* Attributes:
+  * `threshold` -- Type: numeric types. Returns true if the comparison feature output is not null and is greater than or equal to (`>=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "imm"
+column_name = "nativity"
+comparison_type = "fetch_a"
+threshold = 5
+categorical = true
+```
+
+### lower_threshold
+* Attributes:
+  * `lower_threshold` -- Type: numeric types. Returns true if the comparison feature output is not null and is greater than or equal to (`>=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "street_jw"
+comparison_type = "times"
+column_names = ["street","county", "statefip"]
+[comparison_features.comp_a]
+column_name = "street"
+comparison_type = "jaro_winkler"
+lower_threshold = 0.9
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+```
+
+### upper_threshold
+* Attributes:
+  * `upper_threshold` -- Type: numeric types. Returns true if the comparison feature output is not null and is less than or equal to (`<=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### gt_threshold
+* Attributes:
+  * `gt_threshold` -- Type: numeric types. Returns true if the comparison feature output is not null and is greater than (`>`) the given threshold value.
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### btwn_threshold
+* Attributes:
+  * `btwn_threshold` -- Type: list of numeric values. Returns true if the comparison feature output is greater than or equal to (`>=`) the first threshold value and less than or equal to (`<=`) the second threshold value.
+```
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "b_minus_a"
+btwn_threshold = [5,14]
+categorical = true
+```
+
+### look_at_addl_var
+* Attributes:
+  * `look_at_addl_var` -- Type: boolean. Flags the program to consider an additional column value before reporting the comparison feature value.
+  * `addl_var` -- Type: `string`. The additional column to consider.
+  * `datasource` -- Type: `string`. The dataset (`a` or `b`) in which to evaluate the additional column.
+  * `check_val_expr` -- Type: expression. The expression to use to evaluate the additional column. For example, `check_val_expr = "= 5"`.
+  * `else_val` -- Type: same type as the comparison feature output. The value to return when the additional column value does not meet the `check_val_expr` specification.
+
+In the following example, the generated SQL expression for the column would be: `CASE WHEN {datasource}.nativity = 5 then {yrimmig abs_diff value} else -1 END`.
+```
+[[comparison_features]]
+alias = "immyear_diff"
+column_name = "yrimmig"
+comparison_type = "abs_diff"
+look_at_addl_var = true
+addl_var = "nativity"
+datasource = "a"
+check_val_expr = "= 5"
+else_val = -1
+```
+
+## Aggregate Features
+These features are not configurable. To include them in the generated comparison features, they just need to be included in the `[training][independent_vars]` section of the config. They are generated using the "aggregate_features" SQL template.
+
+### hits
+The number of potential matches generated for the given individual (counted by aggregating on `{id_column}_a`).
+
+### hits2
+`hits` squared.
+
+### exact_mult
+Indicator for the existence of multiple potential matches with the exact same first and last name as the A sample individual within the B data. Returns a numeric boolean (0 or 1).
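+
+For example, to generate these aggregate features, list them in the training independent variables alongside your other features (a partial sketch; `namelast_jw` stands in for whatever other features your config defines):
+```
+[training]
+independent_vars = ["namelast_jw", "hits", "hits2", "exact_mult"]
+```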
+
+## Household Aggregate Features
+These features are not configurable. To include them in the generated comparison features, they just need to be included in the `[hh_training][independent_vars]` section of the config. They are generated using the "hh_aggregate_features" SQL template.
+
+### jw_max_a
+The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where the birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns `0` if no other individuals are in the household for comparison.
+
+### jw_max_b
+The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where sex matches and the birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns `0` if no other individuals are in the household for comparison.
diff --git a/docs/_sources/config.md.txt b/docs/_sources/config.md.txt
new file mode 100644
index 0000000..df9ce24
--- /dev/null
+++ b/docs/_sources/config.md.txt
@@ -0,0 +1,749 @@
+# Configuration
+1. [Basic Example Config File](#basic-config-file)
+2. [Advanced Example Config File](#advanced-config-file)
+3. [Top level configs](#top-level-configs)
+4. [Data sources](#data-sources)
+5. [Filter](#filter)
+6. [Column mappings](#column-mappings)
+7. [Substitution columns](#substitution-columns)
+8. [Feature selections](#feature-selections)
+9. [Potential matches universe](#potential-matches-universe)
+10. [Blocking](#blocking)
+11. [Comparisons](#comparisons)
+12. [Household comparisons](#household-comparisons)
+13. [Comparison features](#comparison-features)
+14. [Pipeline-generated features](#pipeline-generated-features)
+15. [Training and models](#training-and-models)
+16. [Household training and models](#household-training-and-models)
+
+## Basic Config File
+
+The config file tells the hlink program what to link and how to link it. A description of the different sections of
+a configuration file is below. For reference, here is an example of a relatively basic config file. This config file
+is used by the `examples/tutorial/tutorial.py` script for linking, and there is a more detailed discussion of the config
+file in the README in `examples/tutorial`.
+
+Note that this config is written in TOML, but hlink is also able to work with JSON config files.
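+
+For instance, the first few settings of the TOML example below translate to JSON as follows (a partial sketch using the standard TOML-to-JSON mapping, not a complete config):
+```
+{
+  "id_column": "id",
+  "feature_selections": [],
+  "datasource_a": {"alias": "a", "file": "data/A.csv"},
+  "datasource_b": {"alias": "b", "file": "data/B.csv"}
+}
+```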
+ +``` +id_column = "id" +feature_selections = [] + +[datasource_a] +alias = "a" +file = "data/A.csv" + +[datasource_b] +alias = "b" +file = "data/B.csv" + +[[column_mappings]] +column_name = "NAMEFRST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "NAMELAST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "AGE" +transforms = [ + {type = "add_to_a", value = 10} +] + +[[column_mappings]] +column_name = "SEX" + +[[blocking]] +column_name = "SEX" + +[[blocking]] +column_name = "AGE_2" +dataset = "a" +derived_from = "AGE" +expand_length = 2 +explode = true + +[[comparison_features]] +alias = "NAMEFRST_JW" +column_name = "NAMEFRST" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "NAMELAST_JW" +column_name = "NAMELAST" +comparison_type = "jaro_winkler" + +[comparisons] +operator = "AND" + +[comparisons.comp_a] +comparison_type = "threshold" +feature_name = "NAMEFRST_JW" +threshold = 0.79 + +[comparisons.comp_b] +comparison_type = "threshold" +feature_name = "NAMELAST_JW" +threshold = 0.84 +``` + +## Advanced Config File + +Here is an example of a more complex config file that makes use of more of hlink's features. +It uses machine learning to probabilistically link the two datasets. + +``` +id_column = "histid" +drop_data_from_scored_matches = false + +# --------- DATASOURCES -------------- +[datasource_a] +alias = "us1900" +file = "/path/to/us1900m_usa.P.parquet" + +[datasource_b] +alias = "us1910" +file = "/path/to/us1910m_usa.P.parquet" + +# --------- FILTERS -------------- + +[[filter]] +expression = "NAMELAST is not null and NAMELAST != ''" + +[[filter]] +training_data_subset = true +datasource = "a" + +[[filter]] +expression = "age >= 5" +datasource = "b" + +# --------- COLUMN MAPPINGS -------------- + +[[column_mappings]] +column_name = "serialp" + +[[column_mappings]] +column_name = "sex" + +[[column_mappings]] +column_name = "age" + +[[column_mappings]] +column_name = "namelast" + +[[column_mappings]] +alias = "namefrst_clean" +column_name = "namefrst" +transforms = [ + { type = "lowercase_strip" }, + { type = "rationalize_name_words" }, + { type = "remove_qmark_hyphen"}, + { type = "replace_apostrophe"}, + { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii"] }, + { type = "remove_alternate_names"}, + { type = "condense_strip_whitespace"}, +] + +[[column_mappings]] +alias = "namefrst_split" +column_name = "namefrst_clean" +transforms = [ { type = "split" } ] + +[[column_mappings]] +alias = "namefrst_std" +column_name = "namefrst_split" +transforms = [ + { type = "array_index", value = 0 } +] + +[[column_mappings]] +alias = "bpl_orig" +column_name = "bpl" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] + +[[column_mappings]] +alias = "statefip" +column_name = "statefip_h" + +[[column_mappings]] +column_name = "birthyr" +alias = "clean_birthyr" +[[column_mappings.transforms]] +type = "mapping" +mappings = {9999 = "", 1999 = ""} +output_type = "int" + +[[column_mappings]] +alias = "relate_div_100" +column_name = "relate" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] + +# --------- SUBSTITUTIONS -------------- + +[[substitution_columns]] +column_name = "namefrst_std" + +[[substitution_columns.substitutions]] +join_column = "sex" +join_value = "1" +substitution_file = "/path/to/name_std/male.csv" + +[[substitution_columns.substitutions]] +join_column = "sex" +join_value = "2" +substitution_file = 
"/path/to/name_std/female.csv" + +# --------- FEATURE SELECTIONS -------------- + +[[feature_selections]] +input_column = "clean_birthyr" +output_column = "replaced_birthyr" +condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end" +transform = "sql_condition" + +[[feature_selections]] +input_column = "namelast" +output_column = "namelast_bigrams" +transform = "bigrams" + +[[feature_selections]] +input_column = "bpl_orig" +output_column = "bpl_clean" +condition = "case when bpl_str == 'washington' and bpl2_str=='washington' then 53 when (bpl_str is null or bpl_str == '') and bpl2_str=='washington' then 53 when bpl_str == 'washington' and (bpl2_str=='' or bpl2_str is null) then 53 else bpl_orig end" +transform = "sql_condition" + +[[feature_selections]] +input_column = "bpl_clean" +output_column = "region" +transform = "attach_variable" +region_dict = "/path/to/region.csv" +col_to_join_on = "bpl" +col_to_add = "region" +null_filler = 99 +col_type = "float" + +# --------- POTENTIAL MATCHES UNIVERSE ------------- + +[[potential_matches_universe]] +expression = "sex == 1" + +# --------- BLOCKING -------------- + +[[blocking]] +column_name = "sex" + +[[blocking]] +column_name = "birthyr_3" +dataset = "a" +derived_from = "replaced_birthyr" +expand_length = 3 +explode = true + +[[blocking]] +column_name = "namelast_bigrams" +explode = true + +# --------- COMPARISONS -------------- + +[comparisons] +operator = "AND" + +[comparisons.comp_a] +comparison_type = "threshold" +feature_name = "namefrst_std_jw" +threshold = 0.8 + +[comparisons.comp_b] +comparison_type = "threshold" +feature_name = "namelast_jw" +threshold = 0.75 + +# --------- HOUSEHOLD COMPARISIONS (post-blocking filters) ------------- + +[hh_comparisons] +comparison_type = "threshold" +feature_name = "byrdiff" +threshold_expr = "<= 10" + +# --------- COMPARISON FEATURES -------------- + +[[comparison_features]] +alias = "region" +column_name = "region" +comparison_type = "fetch_a" +categorical = true + +[[comparison_features]] +alias = "namefrst_std_jw" +column_name = "namefrst_std" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "namelast_jw" +column_name = "namelast" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "sex_equals" +column_name = "sex" +comparison_type = "equals" +categorical = true + +[[comparison_features]] +alias = "relate_a" +column_name = "relate_div_100" +comparison_type = "fetch_a" + +# --------- PIPELINE-GENERATED FEATURES ------------ + +[[pipeline_features]] +input_columns = ["sex_equals", "region"] +output_column = "sex_region_interaction" +transformer_type = "interaction" + +[[pipeline_features]] +input_column = "relate_a" +output_column = "relatetype" +transformer_type = "bucketizer" +categorical = true +splits = [1,3,5,9999] + +# --------- TRAINING -------------- + +[training] + +independent_vars = [ "namelast_jw", "region", "hits", "sex_region_interaction", "relatetype"] +scale_data = false + +dataset = "/path/to/training_data.csv" +dependent_var = "match" +score_with_model = true +use_training_data_features = false +split_by_id_a = true +decision = "drop_duplicate_with_threshold_ratio" + +n_training_iterations = 2 +output_suspicious_TD = true +param_grid = true +model_parameters = [ + { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] }, + { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] } +] + 
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+
+# --------- HOUSEHOLD TRAINING --------------
+
+[hh_training]
+
+prediction_col = "prediction"
+hh_col = "serialp"
+
+independent_vars = ["namelast_jw", "namefrst_std_jw", "relatetype", "sex_equals"]
+scale_data = false
+
+dataset = "/path/to/hh_training_data_1900_1910.csv"
+dependent_var = "match"
+score_with_model = true
+use_training_data_features = false
+split_by_id_a = true
+decision = "drop_duplicate_with_threshold_ratio"
+
+n_training_iterations = 10
+output_suspicious_TD = true
+param_grid = false
+model_parameters = [
+    { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
+    { type = "probit", threshold = 0.5, threshold_ratio = 1.0 }
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+
+```
+
+## Top level configs
+
+These configs should go at the top of your config file under no header:
+
+*id_column*
+
+Required. Specify the id column that uniquely identifies a record in each dataset.
+```
+id_column = "id"
+```
+
+*drop_data_from_scored_matches*
+
+Optional. Whether the scored potential matches should be output with full feature data or with just ids and match information.
+```
+drop_data_from_scored_matches = false
+```
+
+## Data sources
+
+* Header names: `datasource_a`, `datasource_b`
+* Description: Specifies your input data.
+* Required: True
+* Type: Object
+* Attributes:
+  * `alias` -- Type: `string`. The short name for the datasource. Must be alphanumeric with no spaces.
+  * `file` -- Type: `string`. Required. The path to the input file. The file can be `csv` or `parquet`.
+
+```
+[datasource_a]
+alias = "us1900"
+file = "/path/to/my_file.csv"
+```
+
+## Filter
+
+* Header name: `filter`
+* Description: Specifies filters to apply to your input data.
+* Required: False
+* Type: List
+* Attributes:
+  * `expression` -- Type: `string`. SQL expression to apply to your input datasets. Cannot be combined with `training_data_subset` in a single filter.
+  * `training_data_subset` -- Type: `boolean`. If set to true, will subset your input data to only include records that are also in your training data. Cannot be combined with `expression` in a single filter.
+  * `datasource` -- Type: `string`. If you want to limit the filter to operate only on dataset a or b, you can specify that with this attribute.
+
+```
+[[filter]]
+training_data_subset = true
+datasource = "a"
+
+[[filter]]
+expression = "NAMELAST is not null and NAMELAST != ''"
+
+[[filter]]
+expression = "age >= 5"
+datasource = "b"
+```
+
+
+## [Column Mappings](column_mapping_transforms)
+
+* Header name: `column_mappings`
+* Description: Base column mappings and transformations to extract from your input datasets.
+* Required: True
+* Type: List
+* Attributes:
+  * `alias` -- Type: `string`. Optional. The new name for the column. If not specified, the output column name defaults to `column_name`.
+  * `column_name` -- Type: `string`. Name of the column in the input data. Used as the name of the output column if `alias` is not specified.
+  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in order, to the input data. See the [column mapping transforms](column_mapping_transforms) section for more information.
+
+```
+[[column_mappings]]
+column_name = "age"
+
+[[column_mappings]]
+alias = "namefrst_clean"
+column_name = "namefrst"
+transforms = [
+    { type = "lowercase_strip" },
+    { type = "rationalize_name_words" },
+    { type = "remove_qmark_hyphen"},
+    { type = "replace_apostrophe"},
+    { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"] },
+    { type = "remove_alternate_names"},
+    { type = "condense_strip_whitespace"}
+]
+```
+
+## [Substitution Columns](substitutions)
+
+* Header name: `substitution_columns`
+* Description: Substitutions to apply to data after column mappings.
+* Required: False
+* Type: List
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Column to apply substitutions to.
+  * `substitutions` -- Type: `list`. A list of substitutions to apply. See the [substitutions](substitutions) section for more information.
+
+```
+[[substitution_columns]]
+column_name = "namefrst_std"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "1"
+substitution_file = "/path/to/name_std/male.csv"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "2"
+substitution_file = "/path/to/name_std/female.csv"
+```
+
+
+## [Feature Selections](feature_selection_transforms)
+
+* Header name: `feature_selections`
+* Description: A list of feature selections to apply to the input data after substitutions and column mappings. See the [feature selection transforms](feature_selection_transforms) section for more information, including information on the specific transforms available.
+
+* Required: False
+* Type: List
+* Attributes:
+  * `input_column` -- Type: `string`. Required. The name of the input column.
+  * `output_column` -- Type: `string`. Required. The name of the output column.
+  * `transform` -- Type: `string`. The name of the transform to apply to the column.
+  * Other attributes vary depending on transform type.
+
+```
+[[feature_selections]]
+input_column = "namelast_clean"
+output_column = "namelast_clean_bigrams"
+transform = "bigrams"
+
+[[feature_selections]]
+input_column = "bpl_clean"
+output_column = "region"
+transform = "attach_variable"
+region_dict = "/path/to/region.csv"
+col_to_join_on = "bpl"
+col_to_add = "region"
+null_filler = 99
+col_type = "float"
+```
+
+## Potential Matches Universe
+
+* Header name: `potential_matches_universe`
+* Description: Limits the universe of potential matches generated, using an expression fed to a SQL query.
+* Required: False
+* Type: List
+* Attributes:
+  * `expression` -- Type: `string`. Required. The expression to use to filter prepped_df_(a/b) before generating potential matches.
+
+```
+[[potential_matches_universe]]
+# limits potential matches created to only men
+expression = "sex == 1"
+```
+
+## Blocking
+
+* Header name: `blocking`
+* Description: Describes what columns to block on and how to create the blocks for the potential matches.
+* Required: True
+* Type: List
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The name of the column in the existing data to block on if not exploded; the name of the newly exploded column if `explode = true`.
+  * `explode` -- Type: `boolean`. Optional. If true, will attempt to "explode" the column by creating duplicate rows for each value in the column. Only works on columns that are arrays of values or when `expand_length` is set.
+  * `dataset` -- Type: `string`. Optional. Must be `a` or `b` and used in conjunction with `explode`. Will only explode the column from the `a` or `b` dataset when specified.
+  * `derived_from` -- Type: `string`. Used in conjunction with `explode = true`. Specifies an input column from the existing dataset to be exploded.
+  * `expand_length` -- Type: `integer`. When `explode` is used on a column that is an integer, this can be specified to create an array with a range of integer values from (`original_value` minus `expand_length`) to (`original_value` plus `expand_length`). For example, if the input column value for birthyr is 1870, explode is true, and the expand_length is 3, the exploded column birthyr_3 value would be the array [1867, 1868, 1869, 1870, 1871, 1872, 1873].
+
+
+```
+[[blocking]]
+column_name = "bpl"
+
+[[blocking]]
+column_name = "birthyr_3"
+dataset = "a"
+derived_from = "birthyr"
+expand_length = 3
+explode = true
+```
+
+## [Comparisons](comparison_types)
+
+* Header name: `comparisons`
+* Description: A list of comparisons to threshold the potential matches on. Only potential matches that pass the thresholds will be created. See [comparison types](comparison_types) for more information.
+* Required: True
+* Type: Object
+* Attributes:
+  * `comparison_type` -- Type: `string`. Required. See [comparison types](comparison_types) for more information.
+  * `feature_name` -- Type: `string`. Required. The `comparison_feature` to use for the comparison threshold. A `comparison_feature` column by this name must be specified in the `comparison_features` section.
+  * `operator` -- Type: `string`. Optional. How to logically combine two sub-comparisons (`comp_a` and `comp_b`), as in the example below.
+
+```
+[comparisons]
+operator = "AND"
+
+[comparisons.comp_a]
+comparison_type = "threshold"
+feature_name = "namefrst_jw"
+threshold = 0.79
+
+[comparisons.comp_b]
+comparison_type = "threshold"
+feature_name = "namelast_jw"
+threshold = 0.79
+```
+
+## [Household Comparisons](comparison_types)
+
+* Header name: `hh_comparisons`
+* Description: A list of comparisons to threshold the household potential matches on. Also referred to as post-blocking filters: all household potential matches are created first, and then only the potential matches that pass these filters are kept for scoring. See [comparison types](comparison_types) for more information.
+* Required: False
+* Type: Object
+* Attributes:
+  * `comparison_type` -- Type: `string`. Required. See [comparison types](comparison_types) for more information.
+  * `feature_name` -- Type: `string`. Required. The `comparison_feature` to use for the comparison threshold. A `comparison_feature` column by this name must be specified in the `comparison_features` section.
+  * `threshold_expr` -- Type: `string`. The threshold expression to apply to the comparison feature, as in the example below.
+
+```
+[hh_comparisons]
+# only keep household potential matches with an age difference less than or equal to ten years
+comparison_type = "threshold"
+feature_name = "byrdiff"
+threshold_expr = "<= 10"
+```
+
+## [Comparison Features](comparison_types)
+
+* Header name: `comparison_features`
+* Description: A list of comparison features to create when comparing records. Comparisons for individual and household linking rounds are both represented here -- there is no need to duplicate comparisons if they are used in both rounds; simply list the feature in the appropriate `training` or `hh_training` section of the config. See the [comparison types](comparison_types) section for more information.
+* Required: True
+* Type: List
+* Attributes:
+  * `alias` -- Type: `string`. Optional. The name of the comparison feature column to be generated. If not specified, the output column will default to `column_name`.
+  * `column_name` -- Type: `string`. The name of the column to compare.
+  * `comparison_type` -- Type: `string`. The name of the comparison type to use. See the [comparison types](comparison_types) section for more information.
+  * `categorical` -- Type: `boolean`. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
+  * Other attributes may be included as well depending on `comparison_type`. See the [comparison types](comparison_types) section for details on each comparison type.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "equals"
+categorical = true
+
+[[comparison_features]]
+alias = "namefrst_jw"
+column_name = "namefrst_unstd"
+comparison_type = "jaro_winkler"
+
+[[comparison_features]]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+```
+
+## [Pipeline-generated Features](pipeline_features)
+
+* Header name: `pipeline_features`
+* Description: Features to be added in the model pipeline created for scoring a dataset. These features cannot be used in the `comparisons` section of the config and are for creating more robust ML models. They typically leverage code available in the Spark Pipeline API.
+* Required: False
+* Type: List
+* Attributes:
+  * `transformer_type` -- Type: `string`. Required. See [pipeline features](pipeline_features) for more information on the available transformer types.
+  * `input_column` -- Type: `string`. Either use `input_column` or `input_columns`. Used if a single input column is needed for the pipeline feature.
+  * `input_columns` -- Type: List of strings. Either use `input_column` or `input_columns`. Used if a list of input columns is needed for the pipeline feature.
+  * `output_column` -- Type: `string`. The name of the new pipeline feature column to be generated.
+  * `categorical` -- Type: `boolean`. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
+  * Other attributes may be included as well depending on the particular pipeline feature `transformer_type`.
+
+```
+[[pipeline_features]]
+input_columns = ["sex_equals", "regionf"]
+output_column = "sex_regionf_interaction"
+transformer_type = "interaction"
+
+[[pipeline_features]]
+input_column = "immyear_diff"
+output_column = "immyear_caution"
+transformer_type = "bucketizer"
+categorical = true
+splits = [-1,0,6,11,9999]
+```
+
+## Training and [models](models)
+
+* Header name: `training`
+* Description: Specifies the training data set as well as a myriad of attributes related to training a model, including the dependent variable within that dataset, the independent variables created from the `comparison_features` section, and the different models you want to use for either model exploration or scoring.
+* Required: False
+* Type: Object
+* Attributes:
+  * `dataset` -- Type: `string`. Location of the training dataset. Must be a csv file.
+  * `dependent_var` -- Type: `string`. Name of the dependent variable in the training dataset.
+  * `independent_vars` -- Type: `list`. List of independent variables to use in the model. These must be either part of `pipeline_features` or `comparison_features`.
+  * `chosen_model` -- Type: `object`. The model to train with in the `training` task and score with in the `matching` task. See the [models](models) section for more information on model specifications.
+  * `threshold` -- Type: `float`. The threshold at which to accept model probability values as true predictions. Can be used to specify a threshold to use for all models, or can be specified within each `chosen_model` and `model_parameters` specification.
+  * `decision` -- Type: `string`. Optional. Specifies which decision function to use to create the final prediction. The first option is `drop_duplicate_a`, which drops any links for which a record in the `a` data set has a predicted match more than one time. The second option is `drop_duplicate_with_threshold_ratio`, which only takes links for which the `a` record has the highest probability out of any of its potential links, and for which the ratio between the best and second-best link probabilities for the `a` record meets the `threshold_ratio`.
+  * `threshold_ratio` -- Type: `float`. Optional. For use when `decision` is `drop_duplicate_with_threshold_ratio`. Specifies the smallest possible ratio to accept between a best and second best link for a given record. Can be used to specify a threshold ratio (beta threshold) to use for all models. Alternatively, unique threshold ratios can be specified in each individual `chosen_model` and `model_parameters` specification.
+  * `model_parameters` -- Type: `list`. Specifies models to test out in the `model_exploration` task. See the [models](models) section for more information on model specifications.
+  * `param_grid` -- Type: `boolean`. Optional. If you would like to evaluate multiple hyper-parameters for a single model type in your `model_parameters` specification, set this to true and give the hyper-parameter inputs as arrays of length >= 1 instead of single values; each model specification row then produces one model evaluation per hyper-parameter combination.
+  * `score_with_model` -- Type: `boolean`. If set to false, will skip the `apply_model` step of the matching task. Use this if you want to use the `run_all_steps` command and are just trying to generate potential links, such as for the creation of training data.
+  * `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task.
+  * `scale_data` -- Type: `boolean`. Optional. Whether to scale the data as part of the machine learning pipeline.
+  * `use_training_data_features` -- Type: `boolean`. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to `true`, or training features will not be able to be generated, giving null column errors. For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to `true` or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data. If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to `false`, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven't changed, you could set it to `true` to save a small amount of processing time.
+  * `output_suspicious_TD` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data. Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.
+  * `split_by_id_a` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a "A304BT" has three potential matches in the training data, one each to histid_b "B200", "C201", and "D425", all of those potential matches would either end up in the "train" split or the "test" split when evaluating the model performance.
+  * `feature_importances` -- Type: `boolean`. Optional, and currently not functional. Whether to record feature importances for the training features when training or evaluating an ML model.
+
+
+```
+[training]
+independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"]
+scale_data = false
+dataset = "/path/to/1900_1910_training_data_20191023.csv"
+dependent_var = "match"
+use_training_data_features = false
+output_suspicious_TD = true
+split_by_id_a = true
+
+score_with_model = true
+feature_importances = true
+
+decision = "drop_duplicate_with_threshold_ratio"
+
+n_training_iterations = 10
+param_grid = false
+model_parameters = [
+    { type = "random_forest", maxDepth = 6, numTrees = 50 },
+    { type = "probit", threshold = 0.5}
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+```
+
+## Household training and models
+
+* Header name: `hh_training`
+* Description: Specifies the household training data set as well as a myriad of attributes related to training a model, including the dependent variable within that data set, the independent variables created from the `comparison_features` section, and the different models you want to use.
+* Required: False
+* Type: Object
+* Attributes:
+  * All of the attributes and [models](models) available in [training](#training-and-models) may also be used here.
+  * `prediction_col` -- Type: `string`. Required. The name of the column in which the final prediction value from the individual linking round scoring step is recorded.
+  * `hh_col` -- Type: `string`. Required. The name of the column with the household identifier.
+
+```
+[hh_training]
+prediction_col = "prediction"
+hh_col = "serialp"
+
+independent_vars = ["namelast_jw","namefrst_jw","namefrst_std_jw", "jw_max_a", "jw_max_b", "f1_match", "f2_match", "byrdifcat", "racematch", "imm", "bplmatch", "imm_interacted_bplmatch", "sexmatch", "mardurmatch", "relatetype", "relatematch", "relatetype_interacted_relatematch"]
+
+scale_data = false
+dataset = "/path/to/hh_training_data_1900_1910.csv"
+dependent_var = "match"
+use_training_data_features = false
+output_suspicious_TD = true
+split_by_id_a = true
+score_with_model = true
+feature_importances = true
+decision = "drop_duplicate_with_threshold_ratio"
+
+param_grid = true
+n_training_iterations = 10
+model_parameters = [
+    { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]},
+    { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]}
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+```
diff --git a/docs/_sources/feature_selection_transforms.md.txt b/docs/_sources/feature_selection_transforms.md.txt
new file mode 100644
index 0000000..0e7e332
--- /dev/null
+++ b/docs/_sources/feature_selection_transforms.md.txt
@@ -0,0 +1,102 @@
+# Feature Selection transforms
+
+Each header below represents a feature selection transform. These transforms are used in the context of `feature_selections`.
+
+```
+[[feature_selections]]
+input_column = "clean_birthyr"
+output_column = "replaced_birthyr"
+condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
+transform = "sql_condition"
+```
+
+There are some additional attributes available for all transforms: `checkpoint`, `override_column_a`, `override_column_b`, `set_value_column_a`, `set_value_column_b`.
+
+## bigrams
+
+Split the given string column into [bigrams](https://en.wikipedia.org/wiki/Bigram).
+
+* Attributes:
+  * `input_column` - Type: `string`. Required.
+  * `output_column` - Type: `string`. Required.
+  * `no_first_pad` - Type: boolean. Optional. If set to true, don't prepend a space " " to the column before splitting into bigrams. If false or not provided, do prepend the space.
+
+```
+[[feature_selections]]
+input_column = "namelast_clean"
+output_column = "namelast_clean_bigrams"
+transform = "bigrams"
+```
+
+## sql_condition
+
+Apply the given SQL condition.
+
+* Attributes:
+  * `condition` - Type: `string`. Required. The SQL condition to apply.
+  * `output_column` - Type: `string`. Required.
+
+```
+[[feature_selections]]
+input_column = "clean_birthyr"
+output_column = "replaced_birthyr"
+condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
+transform = "sql_condition"
+```
+
+## array
+
+Combine two input columns into an array output column.
+
+* Attributes:
+  * `input_columns` - Type: list of strings. Required. The two input columns.
+  * `output_column` - Type: `string`. Required.
+
+```
+[[feature_selections]]
+input_columns = ["namelast_clean_bigrams", "namefrst_unstd_bigrams"]
+output_column = "namelast_frst_bigrams"
+transform = "array"
+```
+
+## union
+
+Take the set union of two columns that are arrays of strings, returning another
+array of strings.
+
+* Attributes:
+  * `input_columns` - Type: list of strings. Required.
+  * `output_column` - Type: `string`. Required.
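+
+For reference, a minimal sketch of a `union` selection (the input columns reuse the bigram columns from the `array` example above; the output column name is hypothetical):
+```
+[[feature_selections]]
+input_columns = ["namelast_clean_bigrams", "namefrst_unstd_bigrams"]
+output_column = "namelast_frst_bigrams_union"
+transform = "union"
+```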
+
+## soundex
+
+Compute the [soundex](https://en.wikipedia.org/wiki/Soundex) encoding of the input column.
+
+* Attributes:
+  * `input_column` - Type: `string`. Required.
+  * `output_column` - Type: `string`. Required.
+
+```
+[[feature_selections]]
+input_column = "namelast_clean"
+output_column = "namelast_clean_soundex"
+transform = "soundex"
+```
+
+## power
+
+Raise the input column to a given power.
+
+* Attributes:
+  * `input_col` - Type: `string`. Required.
+  * `output_col` - Type: `string`. Required.
+  * `exponent` - Type: `int`. Required. The power to which to raise the input column.
+
+```
+[[feature_selections]]
+input_col = "ncount"
+output_col = "ncount2"
+transform = "power"
+exponent = 2
+```
+
diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
new file mode 100644
index 0000000..1f903fd
--- /dev/null
+++ b/docs/_sources/index.rst.txt
@@ -0,0 +1,31 @@
+.. hlink documentation master file, created by
+   sphinx-quickstart on Mon Jul 1 14:30:23 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to hlink's documentation!
+=================================
+
+.. toctree::
+   :maxdepth: 2
+
+   introduction
+   installation
+   link_tasks
+   running_the_program
+   Advanced Workflows <use_examples>
+   config
+
+Configuration API
+=================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Configuration API
+
+   Column Mapping <column_mapping_transforms>
+   Comparison Types <comparison_types>
+   Feature Selection <feature_selection_transforms>
+   Pipeline Features <pipeline_features>
+   substitutions
+   models
diff --git a/docs/_sources/installation.md.txt b/docs/_sources/installation.md.txt
new file mode 100644
index 0000000..ff82cd9
--- /dev/null
+++ b/docs/_sources/installation.md.txt
@@ -0,0 +1,15 @@
+# Installation
+
+## Requirements
+Make sure that you have each of these installed on your system.
+
+- [Java 8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)
+- Python >= 3.6
+
+## Installing the program
+
+In the root project directory, run `pip install .`
+
+To install hlink for development work, run `pip install -e .[dev]`. This will install additional
+development dependencies and install hlink in editable mode so that any changes made to the source
+code are automatically built.
diff --git a/docs/_sources/introduction.md.txt b/docs/_sources/introduction.md.txt
new file mode 100644
index 0000000..e1627b5
--- /dev/null
+++ b/docs/_sources/introduction.md.txt
@@ -0,0 +1,15 @@
+# Introduction
+
+## Overview
+
+`hlink` is designed to link two datasets. It allows for probabilistic and deterministic record linkage. It provides functionality for the following production tasks:
+
+1. [Preprocessing](link_tasks.html#preprocessing): preprocess each dataset to clean and transform it in preparation for linking.
+2. [Training](link_tasks.html#training-and-household-training): train machine learning models on a set of features and compare results between models.
+3. [Matching](link_tasks.html#matching): match two datasets using a model created in training or with deterministic rules.
+4. [Household Training](link_tasks.html#training-and-household-training): train machine learning models on a set of features for households and compare results between models.
+5. [Household Matching](link_tasks.html#household-matching): match households between two datasets.
+
+In addition, it provides functionality for the following research/development tasks:
+1. [Model Exploration and Household Model Exploration](link_tasks.html#model-exploration-and-household-model-exploration): use a matrix of models and hyper-parameters to evaluate model performance and select a model to be used in the production run. Also generates reports of suspected false positives and false negatives in the specified training data set if the appropriate config flag is set.
+2. [Reporting](link_tasks.html#reporting): generate reports on the linked data.
diff --git a/docs/_sources/link_tasks.md.txt b/docs/_sources/link_tasks.md.txt
new file mode 100644
index 0000000..6fd8a3d
--- /dev/null
+++ b/docs/_sources/link_tasks.md.txt
@@ -0,0 +1,73 @@
+# Link Tasks
+
+## Preprocessing
+
+### Overview
+
+Read in raw data and prepare it for linking.
+
+### Task steps
+
+* Step 0: Register raw dataframes with the program. Read raw data in from .parquet or .csv files.
+* Step 1: Prepare the dataframes for linking. Perform substitutions, transformations, and column mappings as requested.
+
+## Training and Household Training
+
+### Overview
+
+Train a machine learning model to use for classification of potential links.
+
+### Task steps
+
+The steps in each of these tasks are the same:
+* Step 0: Ingest the training data from a .csv file.
+* Step 1: Create comparison features.
+* Step 2: Train and save the model.
+
+## Matching
+
+### Overview
+
+Run the linking algorithm, generating a table with potential matches between individuals in the two datasets.
+
+### Task steps
+
+* Step 0: Perform blocking, exploding any columns that need it.
+* Step 1: Run the matching algorithm, outputting potential matches to a `potential_matches` table.
+* Step 2: Score the potential matches with the trained model. This step will be automatically skipped if machine learning is not being used.
+
+## Household Matching
+
+### Overview
+
+Generate a table with potential matches between households in the two datasets.
+
+### Task steps
+
+* Step 0: Block on households.
+* Step 1: Filter households based on `hh_comparisons` configuration settings.
+* Step 2: Score the potential matches with the trained model. This step will be automatically skipped if machine learning is not being used.
+
+## Model Exploration and Household Model Exploration
+
+### Overview
+
+There are two dedicated linking tasks for model exploration. `model_exploration` uses configuration settings from the Training section of the config file. `hh_model_exploration` uses configuration settings from the Household Training section of the config file. See documentation of the [`[training]`](config.html#training-and-models) and [`[hh_training]`](config.html#household-training-and-models) config sections for more details.
+
+### Task steps
+The steps in each of these tasks are the same:
+ * Step 0: Ingest the training data file specified in the config with the `dataset` tag.
+ * Step 1: Create training features on the training data, or use those in the training data file (specified in the respective config section with the `use_training_data_features` flag).
+ * Step 2: Run `n_training_iterations` number of train-test splits on each of the models in the config `model_parameters`.
+
+## Reporting
+
+### Overview
+
+Report on characteristics of the linked data.
+
+### Task steps
+
+* Step 0: For households with anyone linked in round 1, report the percent of remaining household members linked in round 2.
+* Step 1: Report on the representativeness of the linked data compared to the source populations.
+* Step 2: Pull in key demographic data for linked individuals and export a fixed-width crosswalk file. diff --git a/docs/_sources/models.md.txt b/docs/_sources/models.md.txt new file mode 100644 index 0000000..6631f5c --- /dev/null +++ b/docs/_sources/models.md.txt @@ -0,0 +1,60 @@ +# Models + +These are models available to be used in the model evaluation, training, and household training link tasks. + +* Attributes for all models: + * `threshold` -- Type: `float`. Alpha threshold (model hyperparameter). + * `threshold_ratio` -- Type: `float`. Beta threshold (de-duplication distance ratio). + * Any parameters available in the model as defined in the Spark documentation can be passed as params using the label given in the Spark docs. Commonly used parameters are listed below with descriptive explanations from the Spark docs. + +## random_forest + +Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier). Returns probability as an array. +* Parameters: + * `maxDepth` -- Type: `int`. Maximum depth of the tree. Spark default value is 5. + * `numTrees` -- Type: `int`. The number of trees to train. Spark default value is 20, must be >= 1. + * `featureSubsetStrategy` -- Type: `string`. Per the Spark docs: "The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]." + +``` +model_parameters = { type = "random_forest", maxDepth = 5, numTrees = 75, featureSubsetStrategy = "sqrt", threshold = 0.15, threshold_ratio = 1.0 } +``` + +## probit + +Uses [pyspark.ml.regression.GeneralizedLinearRegression](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.regression.GeneralizedLinearRegression) with `family="binomial"` and `link="probit"`. + +``` +model_parameters = { type = "probit", threshold = 0.85, threshold_ratio = 1.2 } +``` + +## logistic_regression + +Uses [pyspark.ml.classification.LogisticRegression](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression) + +``` +chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 } +``` + +## decision_tree + +Uses [pyspark.ml.classification.DecisionTreeClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier). +* Parameters: + * `maxDepth` -- Type: `int`. Maximum depth of the tree. + * `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1." + * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature." + +``` +chosen_model = { type = "decision_tree", maxDepth = 6, minInstancesPerNode = 2, maxBins = 4} +``` + +## gradient_boosted_trees + +Uses [pyspark.ml.classification.GBTClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier). +* Parameters: + * `maxDepth` -- Type: `int`. Maximum depth of the tree. + * `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. 
If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1." + * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature." + +``` +chosen_model = { type = "gradient_boosted_trees", maxDepth = 4, minInstancesPerNode = 1, maxBins = 6, threshold = 0.7, threshold_ratio = 1.3 } +``` diff --git a/docs/_sources/pipeline_features.md.txt b/docs/_sources/pipeline_features.md.txt new file mode 100644 index 0000000..5e07829 --- /dev/null +++ b/docs/_sources/pipeline_features.md.txt @@ -0,0 +1,48 @@ +# Pipeline generated features + +## Transformer types + +Each header below represents a feature created using a transformation available through the Spark Pipeline API. These transforms are used in the context of `pipeline_features`. + +``` +[[pipeline_features]] +input_column = "immyear_diff" +output_column = "immyear_caution" +transformer_type = "bucketizer" +categorical = true +splits = [-1,0,6,11,9999] + +[[pipeline_features]] +input_columns = ["race","srace"] +output_column = "race_interacted_srace" +transformer_type = "interaction" + +``` + +### interaction + +Interact two or more features, creating a vectorized result. + +``` +[[pipeline_features]] +# interact the categorical features for mother caution flag, mother present flag, and mother jaro-winkler score +input_columns = ["m_caution", "m_pres", "jw_m"] +output_column = "m_interacted_jw_m" +transformer_type = "interaction" +``` + +### bucketizer + +From the `pyspark.ml.feature.Bucketizer()` docs: "Maps a column of continuous features to a column of feature buckets." + +* Attributes: + * `splits` -- Type: Array of integers. Required for this transformer_type. Per the `pyspark.ml.feature.Bucketizer()` docs: "Split points for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. The splits should be of length >= 3 and strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors." + +``` +[[pipeline_features]] +input_column = "relate_a" +output_column = "relatetype" +transformer_type = "bucketizer" +categorical = true +splits = [1,3,5,9999] +``` diff --git a/docs/_sources/running_the_program.md.txt b/docs/_sources/running_the_program.md.txt new file mode 100644 index 0000000..3402d38 --- /dev/null +++ b/docs/_sources/running_the_program.md.txt @@ -0,0 +1,256 @@ +# Running hlink + +## Using hlink as a Library + +hlink can be used as a Python library for scripting linking jobs. It provides some high-level classes and +functions for interacting with Spark, handling configuration, and running linking tasks and steps. + +The main class in the library is `LinkRun`, which represents a complete linking job. It provides access +to each of the link tasks and their steps. Here is an example script that uses `LinkRun` to do some linking. +Below we go into more detail on each of the important aspects of the script. + +```python +from hlink.linking.link_run import LinkRun +from hlink.spark.factory import SparkFactory +from hlink.configs.load_config import load_conf_file + +# First we create a SparkSession with all default configuration settings. 
+factory = SparkFactory()
+spark = factory.create()
+
+# Now let's load in our config file.
+config = load_conf_file("./my_conf")
+
+lr = LinkRun(spark, config)
+
+# Get some information about each of the steps in the
+# preprocessing task.
+prep_steps = lr.preprocessing.get_steps()
+for (i, step) in enumerate(prep_steps):
+    print(f"Step {i}:", step)
+    print("Required input tables:", step.input_table_names)
+    print("Generated output tables:", step.output_table_names)
+
+# Run all of the steps in the preprocessing task.
+lr.preprocessing.run_all_steps()
+
+# Run the first two steps in the matching task.
+lr.matching.run_step(0)
+lr.matching.run_step(1)
+
+# Get the potential_matches table.
+matches = lr.get_table("potential_matches")
+
+assert matches.exists()
+
+# Get the Spark DataFrame for the potential_matches table.
+matches_df = matches.df()
+```
+
+Each link task can be accessed through the `LinkRun` as an attribute like `lr.preprocessing` or `lr.hh_model_exploration`.
+Link steps for each task can be run with `task.run_all_steps()` or `task.run_step(i)`. The easiest way to
+access Spark tables is through `lr.get_table()`. This method returns an `hlink.linking.table.Table` object, which provides
+an interface to easily check if the table exists, get its Spark DataFrame, or drop it.
+
+To create a `LinkRun`, we need to set up a `pyspark.sql.SparkSession` object. The most convenient way to do this is through
+the `hlink.spark.factory.SparkFactory` class. `SparkFactory` defines many default configuration values which can be adjusted as needed.
+
+```python
+from hlink.spark.factory import SparkFactory
+
+factory = SparkFactory()
+spark = factory.set_local().set_num_cores(8).set_executor_memory("5G").create()
+```
+
+We'll also need to load in a config to get the `LinkRun` up and running. A config is
+a dictionary with string keys, often read in from a TOML or JSON file. The
+`hlink.configs.load_config.load_conf_file` function is helpful for reading in config files,
+as are the `json` and `toml` Python modules. For more information on writing config files,
+please see the [Configuration](config) page.
+
+In the `examples/tutorial` directory there is an example script that uses hlink as a library to
+link people between two datasets. The example includes a working config file.
+
+## Interactive Mode
+
+In addition to being used as a library, hlink provides an interactive command-line interface, which can be started
+with the `hlink` command.
+
+### Starting the program
+
+The program takes as input a TOML or JSON configuration file, described in the [Configuration](config) page. Parameters described in the config include paths to input data files, paths to training data files, instructions for generating machine learning features, and model parameters. The configuration enables reproducible runs that should produce the same results on the same input data.
+
+All input flags can be printed to the console by running `hlink --help`.
+
+```
+cpu ~$ hlink --help
+usage: hlink [-h] [--mesos] [--user USER] [--cores CORES]
+             [--executor_memory EXECUTOR_MEMORY] [--task TASK]
+             [--execute_tasks EXECUTE_TASKS [EXECUTE_TASKS ...]]
+             [--execute_command EXECUTE_COMMAND [EXECUTE_COMMAND ...]]
+             [--conf CONF]
+
+Historical linking program.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --mesos               run on mesos at isrdi. Must be on isrdi machines to
+                        work.
+  --user USER           run as a specific user
+  --cores CORES         the max number of cores to use on mesos
+  --executor_memory EXECUTOR_MEMORY
+                        the memory per executor to use
+  --task TASK           The initial task to begin processing.
+  --execute_tasks EXECUTE_TASKS [EXECUTE_TASKS ...]
+                        Execute a series of tasks then exit the program.
+  --execute_command EXECUTE_COMMAND [EXECUTE_COMMAND ...]
+                        Execute a single command then exit the program.
+  --conf CONF, --run CONF
+                        Specify a filepath where your config file for the run
+                        is located.
+```
+
+To run the program in interactive mode using a configuration file at a specified path, say `./full_count_1870_1880.toml`, run a command following this pattern:
+
+```bash
+hlink --conf=./full_count_1870_1880.toml
+```
+
+After the program has started, you will see a prompt that looks like this:
+
+```
+hlink $
+```
+
+Type `help` or `?` and hit enter to see a list of commands; type `help <command>` to see the help text of a specific command.
+Commands that start with "x_" are experimental. They may be unstable or missing some documentation.
+
+```
+hlink $ ?
+
+Documented commands (type help <topic>):
+========================================
+analyze        get_steps      set_preexisting_tables  x_persist
+borrow_tables  get_tasks      set_print_sql           x_sql
+count          help           show                    x_sqlf
+csv            ipython        showf                   x_summary
+desc           list           x_crosswalk             x_tab
+drop           q              x_hh_tfam               x_tfam
+drop_all       reload         x_hh_tfam_2a            x_tfam_raw
+drop_all_prc   run_all_steps  x_hh_tfam_2b            x_union
+drop_all_temp  run_step       x_load
+get_settings   set_link_task  x_parquet_from_csv
+```
+
+### Running Linking Tasks and Steps
+
+The program is organized into a hierarchy of tasks and steps. The five major tasks are `preprocessing`, `training`, `matching`, `hh_training`, and `hh_matching`, and within each task are multiple steps.
+To see all linking tasks, run the command `get_tasks`. You should see something like this:
+
+```
+hlink $ get_tasks
+Current link task: Preprocessing
+Linking task choices are:
+preprocessing :: Preprocessing
+     Requires no preexisting tables.
+     Produces tables: {'prepped_df_a', 'prepped_df_b', 'raw_df_b', 'raw_df_a'}
+training :: Training
+     Requires tables: {'prepped_df_a', 'prepped_df_b'}
+     Produces tables: {'training_data', 'training_features'}
+matching :: Matching
+     Requires tables: {'prepped_df_a', 'prepped_df_b'}
+     Produces tables: {'scored_potential_matches', 'potential_matches_prepped', 'potential_matches', 'exploded_df_b', 'exploded_df_a', 'predicted_matches'}
+hh_training :: Household Training
+     Requires tables: {'prepped_df_a', 'prepped_df_b'}
+     Produces tables: {'hh_training_features', 'hh_training_data'}
+hh_matching :: Household Matching
+     Requires tables: {'prepped_df_a', 'predicted_matches', 'prepped_df_b'}
+     Produces tables: {'hh_predicted_matches', 'hh_scored_potential_matches', 'hh_potential_matches', 'hh_blocked_matches', 'hh_potential_matches_prepped'}
+model_exploration :: Model Exploration
+     Requires tables: {'prepped_df_a', 'prepped_df_b'}
+     Produces tables: {'model_eval_training_vectorized', 'model_eval_training_data', 'model_eval_repeat_FPs', 'model_eval_training_features', 'model_eval_training_results', 'model_eval_repeat_FNs'}
+hh_model_exploration :: Household Model Exploration
+     Requires tables: {'prepped_df_a', 'prepped_df_b'}
+     Produces tables: {'hh_model_eval_training_vectorized', 'hh_model_eval_repeat_FPs', 'hh_model_eval_repeat_FNs', 'hh_model_eval_training_results', 'hh_model_eval_training_features', 'hh_model_eval_training_data'}
+reporting :: Reporting
+     Requires tables: {'prepped_df_a', 'hh_predicted_matches', 'prepped_df_b', 'predicted_matches', 'raw_df_b', 'raw_df_a'}
+     Produces no persistent tables.
+```
+
+Each linking task will interact with Spark tables within the program. To see a list of tables, run the command `list`. To also see hidden intermediate tables, run `list all`. If you have just started the program for the first time, you should see no tables created yet:
+
+```
+hlink $ list
++--------+---------+-----------+
+|database|tableName|isTemporary|
++--------+---------+-----------+
++--------+---------+-----------+
+```
+
+To see information about the steps of the task you are currently on, run `get_steps`. You should see something that looks like this:
+
+```txt
+Link task: Preprocessing
+step 0: register raw dataframes
+     Tables used:
+     Tables created:
+          Table 'raw_df_a' <- Preprocessing: Raw data read in from datasource A
+          Table 'raw_df_b' <- Preprocessing: Raw data read in from datasource B
+step 1: prepare dataframes
+     Tables used:
+          Table 'raw_df_a' <- Preprocessing: Raw data read in from datasource A
+          Table 'raw_df_b' <- Preprocessing: Raw data read in from datasource B
+     Tables created:
+          Table 'prepped_df_a' <- Preprocessing: Preprocessed data from source A with selected columns and features
+          Table 'prepped_df_b' <- Preprocessing: Preprocessed data from source B with selected columns and features
+```
+
+To change your current link task, run `set_link_task <task_name>`, where `<task_name>` is the name of the link task.
+
+Once you are sure that you are on the right task, you can use the `run_step <step_num>` command to run a step.
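+
+If you are scripting hlink as a library instead of using the interactive shell, the same operation is a method call on the link task. A minimal sketch, assuming the example config file from earlier on this page:
+
+```python
+from hlink.linking.link_run import LinkRun
+from hlink.spark.factory import SparkFactory
+from hlink.configs.load_config import load_conf_file
+
+spark = SparkFactory().create()
+lr = LinkRun(spark, load_conf_file("./full_count_1870_1880.toml"))
+
+# The library equivalent of `set_link_task preprocessing` followed by `run_step 0`.
+lr.preprocessing.run_step(0)
+```
+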
+For example, if you run `run_step 0` in the shell, you should see something like this:
+
+```
+hlink $ run_step 0
+Link task: Preprocessing
+Running step 0: register raw dataframes
+Finished step 0: register raw dataframes in 5.85s
+```
+
+After the step is complete, you can run `list` to see what tables it created:
+
+```
+hlink $ list
++--------+---------+-----------+-------------------------------------------------+
+|database|tableName|isTemporary|description                                      |
++--------+---------+-----------+-------------------------------------------------+
+|linking |raw_df_a |false      |Preprocessing: Raw data read in from datasource A|
+|linking |raw_df_b |false      |Preprocessing: Raw data read in from datasource B|
++--------+---------+-----------+-------------------------------------------------+
+```
+
+To run all steps in a task, use the `run_all_steps <tasks>` command, where `<tasks>` is a list of tasks you want to run all the steps for. By default this command will run all the steps for the current task.
+
+### Example interactive mode workflow
+
+1) Create a config file and put it in your hlink config directory.
+   For example:
+   ```
+   /path/to/conf/full_count_1870_1880.toml
+   ```
+
+2) Launch the hlink program in interactive mode:
+   ```bash
+   hlink --conf=/path/to/conf/full_count_1870_1880
+   ```
+3) Run the tasks you want to complete:
+   ```
+   hlink $ run_all_steps preprocessing training matching
+   ```
+4) List the created tables:
+   ```
+   hlink $ list
+   ```
+5) Export the results:
+   ```
+   hlink $ csv predicted_matches /my/output/file.csv
+   ```
diff --git a/docs/_sources/substitutions.md.txt b/docs/_sources/substitutions.md.txt
new file mode 100644
index 0000000..93c9947
--- /dev/null
+++ b/docs/_sources/substitutions.md.txt
@@ -0,0 +1,49 @@
+# Substitutions
+* Parent header: `substitution_columns`
+* Subheader name: `substitutions`
+* Type: List
+* Attributes:
+  * `substitution_file` -- Type: `string`. Required. Path to the file containing the look-up table to join against for replacement values.
+
+You must supply a substitution file and either specify `regex_word_replace=true` or supply a join value.
+
+## 1:1 substitution by data table
+
+Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.
+
+* Attributes:
+  * `join_column` -- Type: `string`. Column to filter input data on.
+  * `join_value` -- Type: `string`. Value to filter for in the input data.
+
+```
+[[substitution_columns]]
+column_name = "namefrst_std"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "1"
+substitution_file = "/path/to/name_std/male.csv"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "2"
+substitution_file = "/path/to/name_std/female.csv"
+```
+
+## Substitution by regex word replace
+
+Performs word replacement within a column's data string (such as replacing the abbreviation `Ave.` in the string `7th Ave.` with `Avenue` to create `7th Avenue`).
+
+* Attributes:
+  * `regex_word_replace` -- Type: `boolean`. Whether or not to use regex matching on the input data to perform replacement.
+    If `true`, the swap value will still be replaced if it is anywhere in the column data, as long as it is:
+      * at the start of the column data string, or preceded by a space
+      * at the end of the column data string, or followed by a space
+
+```
+[[substitution_columns]]
+column_name = "street_unstd"
+
+[[substitution_columns.substitutions]]
+regex_word_replace = true
+substitution_file = "/path/to/dir/substitutions_street_abbrevs.csv"
+```
diff --git a/docs/_sources/use_examples.md.txt b/docs/_sources/use_examples.md.txt
new file mode 100644
index 0000000..e781202
--- /dev/null
+++ b/docs/_sources/use_examples.md.txt
@@ -0,0 +1,137 @@
+# Advanced Workflow Examples
+
+
+## Export training data after generating features to reuse in different linking years
+
+It is common to have a single training data set that spans two linked years, which is then used to train a model that is applied to a different set of linked years. For example, we have a training data set that spans linked individuals from the 1900 census to the 1910 census. We use this training data to predict links in the full count 1900-1910 linking run, but we also use this training data to link year pairs 1910-1920, 1920-1930, and 1930-1940.
+
+When this training data set is used for the years it was derived from, the only columns necessary are the HISTIDs identifying the individuals in the data and the dependent variable (usually a boolean `match` column) for the model training. Features for the machine learning model generation are created from the source data available in the full count run.
+
+However, when this training data set is used for other years, the program does not have access to the source full count files and can't generate the ML features from the given data. In this scenario, you would need to save a copy of the `training_features` and `hh_training_features` Spark tables to .csv so you can point to those files in the other year pair runs, and set the `use_training_data_features = true` flag in both the `training` and `hh_training` sections of the configuration.
+
+### Example training data export with generated ML features
+
+1) Create a config file and put it in your hlink config directory.
+
+2) Launch the hlink program in interactive mode:
+
+   ```bash
+   hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G
+   ```
+
+3) Run the preprocessing and training link tasks:
+
+   ```
+   hlink $ run_all_steps preprocessing training
+   ```
+
+4) Ask the program what the arguments for the `csv` command are:
+
+   ```
+   hlink $ ? csv
+   Writes a dataframe out to csv.
+     Arg 1: dataframe
+     Arg 2: path
+     Arg 3 (optional): # of partitions
+   ```
+
+5) Export the results using the `csv` command:
+
+   ```
+   hlink $ csv training_features /my/output/training_data_1900_1910_HLINK_FEATURES.csv
+   ```
+
+6) Continue with other linking work you might need to do with this year pair; otherwise, shut down the hlink framework for this pair of linking years:
+
+   ```
+   hlink $ q
+   ```
+
+7) In the config file for the new year pairs (1910-1920, 1920-1930, etc.), point to this new file as your dataset, and set the `use_training_data_features` flag to `true`:
+
+   ```
+   # config file for 1910-1920 linking run using the 1900-1910 training data with hlink-generated features
+   [training]
+
+   # more configs here...
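+   # (Note: `dataset` below points at the feature file exported with the
+   # `csv` command in step 5, and `use_training_data_features = true`
+   # tells hlink to read those pre-generated feature columns instead of
+   # rebuilding them from source data it doesn't have for these years.)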
+ + dataset = "/path/to/training_data_1900_1910_HLINK_FEATURES.csv" + dependent_var = "match" + + # This needs to be changed to `true` to use the features we just generated + use_training_data_features = true + + # configs continue here... + ``` + +8) Launch the hlink program using your new config for the new year pair you want to link. Run your link tasks and export relevant data. + +## ML model exploration and export of lists of potential false positives/negatives in training data +`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models. You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation. + +The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data. This is calculated when running the train/test splits in the regular model exploration tasks if the `output_suspicious_TD` flag is true. + +### Example model exploration and FP/FN export workflow + +1) Create a config file that has a `training` and/or `hh_training` section with model parameters to explore. For example: + + ``` + [training] + + independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"] + + scale_data = false + dataset = "/path/to/training_data_1900_1910.csv" + dependent_var = "match" + + # This would need to be changed to `true` in a run between other years if your + # source data years weren't identical to the linked years of your training data. + use_training_data_features = false + + # VERY IMPORTANT if you want to output FPs/FNs + output_suspicious_TD = true + + split_by_id_a = true + score_with_model = true + feature_importances = false + decision = "drop_duplicate_with_threshold_ratio" + param_grid = true + n_training_iterations = 10 + model_parameters = [ + { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.0, 1.1]}, + { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]} + ] + + # The chosen_model is the final selected model to use in the full count production + # run. This is where you would manually update your config after running model + # exploration and making decisions about your models and hyperparameters. This + # section isn't used by the model exploration task. + chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 } + ``` + +2) Launch the hlink program in interactive mode: + + ```bash + hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G + ``` + +3) Run the preprocessing and model exploration link tasks: + + ``` + hlink $ run_all_steps preprocessing model_exploration + ``` + +4) Export the results of the train/test split runs to csv for further analysis. 
For `training` params, the results will be in the `training_results` table, and for `hh_training` in the `hh_training_results` table. + + ``` + hlink $ csv training_results /my/output/1900_1910_training_results.csv + ``` + +5) Export the potential FPs and FNs to csv. For `training` params, the results will be in the `repeat_FPs` and `repeat_FNs` tables, and for `hh_training` in the `hh_repeat_FPs` and `hh_repeat_FNs` tables. + + ``` + hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv + hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv + ``` + +6) Use your preferred methods to analyze the data you've just exported. Update the `chosen_model` in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs. diff --git a/docs/_static/alabaster.css b/docs/_static/alabaster.css new file mode 100644 index 0000000..0eddaeb --- /dev/null +++ b/docs/_static/alabaster.css @@ -0,0 +1,701 @@ +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +body { + font-family: Georgia, serif; + font-size: 17px; + background-color: #fff; + color: #000; + margin: 0; + padding: 0; +} + + +div.document { + width: 940px; + margin: 30px auto 0 auto; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 0 0 220px; +} + +div.sphinxsidebar { + width: 220px; + font-size: 14px; + line-height: 1.5; +} + +hr { + border: 1px solid #B1B4B6; +} + +div.body { + background-color: #fff; + color: #3E4349; + padding: 0 30px 0 30px; +} + +div.body > .section { + text-align: left; +} + +div.footer { + width: 940px; + margin: 20px auto 30px auto; + font-size: 14px; + color: #888; + text-align: right; +} + +div.footer a { + color: #888; +} + +p.caption { + font-family: inherit; + font-size: inherit; +} + + +div.relations { + display: none; +} + + +div.sphinxsidebar a { + color: #444; + text-decoration: none; + border-bottom: 1px dotted #999; +} + +div.sphinxsidebar a:hover { + border-bottom: 1px solid #999; +} + +div.sphinxsidebarwrapper { + padding: 18px 10px; +} + +div.sphinxsidebarwrapper p.logo { + padding: 0; + margin: -10px 0 0 0px; + text-align: center; +} + +div.sphinxsidebarwrapper h1.logo { + margin-top: -10px; + text-align: center; + margin-bottom: 5px; + text-align: left; +} + +div.sphinxsidebarwrapper h1.logo-name { + margin-top: 0px; +} + +div.sphinxsidebarwrapper p.blurb { + margin-top: 0; + font-style: normal; +} + +div.sphinxsidebar h3, +div.sphinxsidebar h4 { + font-family: Georgia, serif; + color: #444; + font-size: 24px; + font-weight: normal; + margin: 0 0 5px 0; + padding: 0; +} + +div.sphinxsidebar h4 { + font-size: 20px; +} + +div.sphinxsidebar h3 a { + color: #444; +} + +div.sphinxsidebar p.logo a, +div.sphinxsidebar h3 a, +div.sphinxsidebar p.logo a:hover, +div.sphinxsidebar h3 a:hover { + border: none; +} + +div.sphinxsidebar p { + color: #555; + margin: 10px 0; +} + +div.sphinxsidebar ul { + margin: 10px 0; + padding: 0; + color: #000; +} + +div.sphinxsidebar ul li.toctree-l1 > a { + font-size: 120%; +} + +div.sphinxsidebar ul li.toctree-l2 > a { + font-size: 110%; +} + +div.sphinxsidebar input { + border: 1px solid #CCC; + font-family: Georgia, serif; + font-size: 1em; +} + +div.sphinxsidebar hr { + border: none; + height: 1px; + color: #AAA; + background: #AAA; + + text-align: left; + margin-left: 0; + width: 50%; +} + +div.sphinxsidebar .badge { + border-bottom: none; +} + +div.sphinxsidebar 
.badge:hover { + border-bottom: none; +} + +/* To address an issue with donation coming after search */ +div.sphinxsidebar h3.donation { + margin-top: 10px; +} + +/* -- body styles ----------------------------------------------------------- */ + +a { + color: #004B6B; + text-decoration: underline; +} + +a:hover { + color: #6D4100; + text-decoration: underline; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: Georgia, serif; + font-weight: normal; + margin: 30px 0px 10px 0px; + padding: 0; +} + +div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; } +div.body h2 { font-size: 180%; } +div.body h3 { font-size: 150%; } +div.body h4 { font-size: 130%; } +div.body h5 { font-size: 100%; } +div.body h6 { font-size: 100%; } + +a.headerlink { + color: #DDD; + padding: 0 4px; + text-decoration: none; +} + +a.headerlink:hover { + color: #444; + background: #EAEAEA; +} + +div.body p, div.body dd, div.body li { + line-height: 1.4em; +} + +div.admonition { + margin: 20px 0px; + padding: 10px 30px; + background-color: #EEE; + border: 1px solid #CCC; +} + +div.admonition tt.xref, div.admonition code.xref, div.admonition a tt { + background-color: #FBFBFB; + border-bottom: 1px solid #fafafa; +} + +div.admonition p.admonition-title { + font-family: Georgia, serif; + font-weight: normal; + font-size: 24px; + margin: 0 0 10px 0; + padding: 0; + line-height: 1; +} + +div.admonition p.last { + margin-bottom: 0; +} + +div.highlight { + background-color: #fff; +} + +dt:target, .highlight { + background: #FAF3E8; +} + +div.warning { + background-color: #FCC; + border: 1px solid #FAA; +} + +div.danger { + background-color: #FCC; + border: 1px solid #FAA; + -moz-box-shadow: 2px 2px 4px #D52C2C; + -webkit-box-shadow: 2px 2px 4px #D52C2C; + box-shadow: 2px 2px 4px #D52C2C; +} + +div.error { + background-color: #FCC; + border: 1px solid #FAA; + -moz-box-shadow: 2px 2px 4px #D52C2C; + -webkit-box-shadow: 2px 2px 4px #D52C2C; + box-shadow: 2px 2px 4px #D52C2C; +} + +div.caution { + background-color: #FCC; + border: 1px solid #FAA; +} + +div.attention { + background-color: #FCC; + border: 1px solid #FAA; +} + +div.important { + background-color: #EEE; + border: 1px solid #CCC; +} + +div.note { + background-color: #EEE; + border: 1px solid #CCC; +} + +div.tip { + background-color: #EEE; + border: 1px solid #CCC; +} + +div.hint { + background-color: #EEE; + border: 1px solid #CCC; +} + +div.seealso { + background-color: #EEE; + border: 1px solid #CCC; +} + +div.topic { + background-color: #EEE; +} + +p.admonition-title { + display: inline; +} + +p.admonition-title:after { + content: ":"; +} + +pre, tt, code { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; + font-size: 0.9em; +} + +.hll { + background-color: #FFC; + margin: 0 -12px; + padding: 0 12px; + display: block; +} + +img.screenshot { +} + +tt.descname, tt.descclassname, code.descname, code.descclassname { + font-size: 0.95em; +} + +tt.descname, code.descname { + padding-right: 0.08em; +} + +img.screenshot { + -moz-box-shadow: 2px 2px 4px #EEE; + -webkit-box-shadow: 2px 2px 4px #EEE; + box-shadow: 2px 2px 4px #EEE; +} + +table.docutils { + border: 1px solid #888; + -moz-box-shadow: 2px 2px 4px #EEE; + -webkit-box-shadow: 2px 2px 4px #EEE; + box-shadow: 2px 2px 4px #EEE; +} + +table.docutils td, table.docutils th { + border: 1px solid #888; + padding: 0.25em 0.7em; +} + +table.field-list, table.footnote { + border: none; + -moz-box-shadow: none; + 
-webkit-box-shadow: none; + box-shadow: none; +} + +table.footnote { + margin: 15px 0; + width: 100%; + border: 1px solid #EEE; + background: #FDFDFD; + font-size: 0.9em; +} + +table.footnote + table.footnote { + margin-top: -15px; + border-top: none; +} + +table.field-list th { + padding: 0 0.8em 0 0; +} + +table.field-list td { + padding: 0; +} + +table.field-list p { + margin-bottom: 0.8em; +} + +/* Cloned from + * https://github.com/sphinx-doc/sphinx/commit/ef60dbfce09286b20b7385333d63a60321784e68 + */ +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +table.footnote td.label { + width: .1px; + padding: 0.3em 0 0.3em 0.5em; +} + +table.footnote td { + padding: 0.3em 0.5em; +} + +dl { + margin: 0; + padding: 0; +} + +dl dd { + margin-left: 30px; +} + +blockquote { + margin: 0 0 0 30px; + padding: 0; +} + +ul, ol { + /* Matches the 30px from the narrow-screen "li > ul" selector below */ + margin: 10px 0 10px 30px; + padding: 0; +} + +pre { + background: #EEE; + padding: 7px 30px; + margin: 15px 0px; + line-height: 1.3em; +} + +div.viewcode-block:target { + background: #ffd; +} + +dl pre, blockquote pre, li pre { + margin-left: 0; + padding-left: 30px; +} + +tt, code { + background-color: #ecf0f3; + color: #222; + /* padding: 1px 2px; */ +} + +tt.xref, code.xref, a tt { + background-color: #FBFBFB; + border-bottom: 1px solid #fff; +} + +a.reference { + text-decoration: none; + border-bottom: 1px dotted #004B6B; +} + +/* Don't put an underline on images */ +a.image-reference, a.image-reference:hover { + border-bottom: none; +} + +a.reference:hover { + border-bottom: 1px solid #6D4100; +} + +a.footnote-reference { + text-decoration: none; + font-size: 0.7em; + vertical-align: top; + border-bottom: 1px dotted #004B6B; +} + +a.footnote-reference:hover { + border-bottom: 1px solid #6D4100; +} + +a:hover tt, a:hover code { + background: #EEE; +} + + +@media screen and (max-width: 870px) { + + div.sphinxsidebar { + display: none; + } + + div.document { + width: 100%; + + } + + div.documentwrapper { + margin-left: 0; + margin-top: 0; + margin-right: 0; + margin-bottom: 0; + } + + div.bodywrapper { + margin-top: 0; + margin-right: 0; + margin-bottom: 0; + margin-left: 0; + } + + ul { + margin-left: 0; + } + + li > ul { + /* Matches the 30px from the "ul, ol" selector above */ + margin-left: 30px; + } + + .document { + width: auto; + } + + .footer { + width: auto; + } + + .bodywrapper { + margin: 0; + } + + .footer { + width: auto; + } + + .github { + display: none; + } + + + +} + + + +@media screen and (max-width: 875px) { + + body { + margin: 0; + padding: 20px 30px; + } + + div.documentwrapper { + float: none; + background: #fff; + } + + div.sphinxsidebar { + display: block; + float: none; + width: 102.5%; + margin: 50px -30px -20px -30px; + padding: 10px 20px; + background: #333; + color: #FFF; + } + + div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, + div.sphinxsidebar h3 a { + color: #fff; + } + + div.sphinxsidebar a { + color: #AAA; + } + + div.sphinxsidebar p.logo { + display: none; + } + + div.document { + width: 100%; + margin: 0; + } + + div.footer { + display: none; + } + + div.bodywrapper { + margin: 0; + } + + div.body { + min-height: 0; + padding: 0; + } + + .rtd_doc_footer { + display: none; + } + + .document { + width: auto; + } + + .footer { + width: auto; + } + + .footer { + width: auto; + } + + .github { + display: none; + } +} + + +/* misc. 
*/ + +.revsys-inline { + display: none!important; +} + +/* Make nested-list/multi-paragraph items look better in Releases changelog + * pages. Without this, docutils' magical list fuckery causes inconsistent + * formatting between different release sub-lists. + */ +div#changelog > div.section > ul > li > p:only-child { + margin-bottom: 0; +} + +/* Hide fugly table cell borders in ..bibliography:: directive output */ +table.docutils.citation, table.docutils.citation td, table.docutils.citation th { + border: none; + /* Below needed in some edge cases; if not applied, bottom shadows appear */ + -moz-box-shadow: none; + -webkit-box-shadow: none; + box-shadow: none; +} + + +/* relbar */ + +.related { + line-height: 30px; + width: 100%; + font-size: 0.9rem; +} + +.related.top { + border-bottom: 1px solid #EEE; + margin-bottom: 20px; +} + +.related.bottom { + border-top: 1px solid #EEE; +} + +.related ul { + padding: 0; + margin: 0; + list-style: none; +} + +.related li { + display: inline; +} + +nav#rellinks { + float: right; +} + +nav#rellinks li+li:before { + content: "|"; +} + +nav#breadcrumbs li+li:before { + content: "\00BB"; +} + +/* Hide certain items when printing */ +@media print { + div.related { + display: none; + } +} \ No newline at end of file diff --git a/docs/_static/basic.css b/docs/_static/basic.css new file mode 100644 index 0000000..bf18350 --- /dev/null +++ b/docs/_static/basic.css @@ -0,0 +1,906 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 
7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 450px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a.brackets:before, +span.brackets > a:before{ + content: "["; +} + +a.brackets:after, +span.brackets > a:after { + content: "]"; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + 
font-weight: bold; +} + +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +table.footnote td, table.footnote th { + border: 0 !important; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + 
+.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +dl.footnote > dt, +dl.citation > dt { + float: left; + margin-right: 0.5em; +} + +dl.footnote > dd, +dl.citation > dd { + margin-bottom: 0em; +} + +dl.footnote > dd:after, +dl.citation > dd:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dt:after { + content: ":"; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable 
td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 0000000..2a924f1 --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1 @@ +/* This file intentionally left blank. */ diff --git a/docs/_static/doctools.js b/docs/_static/doctools.js new file mode 100644 index 0000000..e1bfd70 --- /dev/null +++ b/docs/_static/doctools.js @@ -0,0 +1,358 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for all documentation. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + +/** + * make the code below compatible with browsers without + * an installed firebug like debugger +if (!window.console || !console.firebug) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", + "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", + "profile", "profileEnd"]; + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +} + */ + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. + */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. 
+ */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} + +/** + * Small JavaScript module for the documentation. + */ +var Documentation = { + + init : function() { + this.fixFirefoxAnchorBug(); + this.highlightSearchWords(); + this.initIndexTable(); + this.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS : {}, + PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, + LOCALE : 'unknown', + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext : function(string) { + var translated = Documentation.TRANSLATIONS[string]; + if (typeof translated === 'undefined') + return string; + return (typeof translated === 'string') ? translated : translated[0]; + }, + + ngettext : function(singular, plural, n) { + var translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated === 'undefined') + return (n == 1) ? singular : plural; + return translated[Documentation.PLURALEXPR(n)]; + }, + + addTranslations : function(catalog) { + for (var key in catalog.messages) + this.TRANSLATIONS[key] = catalog.messages[key]; + this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); + this.LOCALE = catalog.locale; + }, + + /** + * add context elements like header anchor links + */ + addContextElements : function() { + $('div[id] > :header:first').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this headline')). + appendTo(this); + }); + $('dt[id]').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this definition')). + appendTo(this); + }); + }, + + /** + * workaround a firefox stupidity + * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + */ + fixFirefoxAnchorBug : function() { + if (document.location.hash && $.browser.mozilla) + window.setTimeout(function() { + document.location.href += ''; + }, 10); + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords : function() { + var params = $.getQueryParameters(); + var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; + if (terms.length) { + var body = $('div.body'); + if (!body.length) { + body = $('body'); + } + window.setTimeout(function() { + $.each(terms, function() { + body.highlightText(this.toLowerCase(), 'highlighted'); + }); + }, 10); + $('') + .appendTo($('#searchbox')); + } + }, + + /** + * init the domain index toggle buttons + */ + initIndexTable : function() { + var togglers = $('img.toggler').click(function() { + var src = $(this).attr('src'); + var idnum = $(this).attr('id').substr(7); + $('tr.cg-' + idnum).toggle(); + if (src.substr(-9) === 'minus.png') + $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); + else + $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); + }).css('display', ''); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { + togglers.click(); + } + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords : function() { + $('#searchbox .highlight-link').fadeOut(300); + $('span.highlighted').removeClass('highlighted'); + var url = new URL(window.location); + url.searchParams.delete('highlight'); + window.history.replaceState({}, '', url); + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar : function() { + $('input[name=q]').first().focus(); + }, + + /** + * make the url absolute + */ + makeURL : function(relativeURL) { + return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; + }, + + /** + * get the current relative url + */ + getCurrentURL : function() { + var path = document.location.pathname; + var parts = path.split(/\//); + $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { + if (this === '..') + parts.pop(); + }); + var url = parts.join('/'); + return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + }, + + initOnKeyListeners: function() { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) + return; + + $(document).keydown(function(event) { + var activeElementType = document.activeElement.tagName; + // don't navigate when in search box, textarea, dropdown or button + if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT' + && activeElementType !== 'BUTTON') { + if (event.altKey || event.ctrlKey || event.metaKey) + return; + + if (!event.shiftKey) { + switch (event.key) { + case 'ArrowLeft': + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) + break; + var prevHref = $('link[rel="prev"]').prop('href'); + if (prevHref) { + window.location.href = prevHref; + return false; + } + break; + case 'ArrowRight': + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) + break; + var nextHref = $('link[rel="next"]').prop('href'); + if (nextHref) { + window.location.href = nextHref; + return false; + } + break; + case 'Escape': + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) + break; + Documentation.hideSearchWords(); + return false; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case '/': + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) + break; + Documentation.focusSearchBar(); + return false; + } + } + }); + } +}; + +// quick alias for translations +_ = Documentation.gettext; + +$(document).ready(function() { + Documentation.init(); +}); diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js new file mode 100644 index 0000000..0dc9af0 --- /dev/null +++ b/docs/_static/documentation_options.js @@ -0,0 +1,14 
@@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '2.0.0', + LANGUAGE: 'None', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/docs/_static/file.png b/docs/_static/file.png new file mode 100644 index 0000000..a858a41 Binary files /dev/null and b/docs/_static/file.png differ diff --git a/docs/_static/jquery-3.5.1.js b/docs/_static/jquery-3.5.1.js new file mode 100644 index 0000000..5093733 --- /dev/null +++ b/docs/_static/jquery-3.5.1.js @@ -0,0 +1,10872 @@ +/*! + * jQuery JavaScript Library v3.5.1 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2020-05-04T22:49Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML elements + // (i.e., `typeof document.createElement( "object" ) === "function"`). + // We don't want to classify *any* DOM node as a function. 
+ return typeof obj === "function" && typeof obj.nodeType !== "number"; + }; + + +var isWindow = function isWindow( obj ) { + return obj != null && obj === obj.window; + }; + + +var document = window.document; + + + + var preservedScriptAttributes = { + type: true, + src: true, + nonce: true, + noModule: true + }; + + function DOMEval( code, node, doc ) { + doc = doc || document; + + var i, val, + script = doc.createElement( "script" ); + + script.text = code; + if ( node ) { + for ( i in preservedScriptAttributes ) { + + // Support: Firefox 64+, Edge 18+ + // Some browsers don't support the "nonce" property on scripts. + // On the other hand, just using `getAttribute` is not enough as + // the `nonce` attribute is reset to an empty string whenever it + // becomes browsing-context connected. + // See https://github.com/whatwg/html/issues/2369 + // See https://html.spec.whatwg.org/#nonce-attributes + // The `node.getAttribute` check was added for the sake of + // `jQuery.globalEval` so that it can fake a nonce-containing node + // via an object. + val = node[ i ] || node.getAttribute && node.getAttribute( i ); + if ( val ) { + script.setAttribute( i, val ); + } + } + } + doc.head.appendChild( script ).parentNode.removeChild( script ); + } + + +function toType( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? + class2type[ toString.call( obj ) ] || "object" : + typeof obj; +} +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.5.1", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + } + + // Return just the one element from the set + return num < 0 ? this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. 
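+ // Illustrative usage, not jQuery source: pushStack() above records the set a
+ // derived set came from, which is what .end() (defined below) unwinds:
+ //     var items = jQuery( "li" );          // set A
+ //     var firstTwo = items.slice( 0, 2 );  // set B; B.prevObject === A
+ //     firstTwo.end() === items;            // true -- back to set A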
+ each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + even: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return ( i + 1 ) % 2; + } ) ); + }, + + odd: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return i % 2; + } ) ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. + push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + copy = options[ name ]; + + // Prevent Object.prototype pollution + // Prevent never-ending loop + if ( name === "__proto__" || target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + src = target[ name ]; + + // Ensure proper type for the source value + if ( copyIsArray && !Array.isArray( src ) ) { + clone = []; + } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { + clone = {}; + } else { + clone = src; + } + copyIsArray = false; + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new Error( msg ); + }, + + noop: function() {}, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && 
proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + // Evaluates a script in a provided context; falls back to the global one + // if not specified. + globalEval: function( code, options, doc ) { + DOMEval( code, { nonce: options && options.nonce }, doc ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? -1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return flat( ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. 
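+ // Illustrative usage, not jQuery source, of the collection helpers above:
+ //     jQuery.grep( [ 1, 2, 3, 4 ], function( n ) { return n % 2; } ); // [ 1, 3 ]
+ //     jQuery.map( [ 1, 2, 3 ], function( n ) { return [ n, n * 2 ]; } );
+ //         // [ 1, 2, 2, 4, 3, 6 ] -- map() flattens returned arrays and
+ //         // drops null/undefined, which is why it ends with flat( ret )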
+ support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), +function( _i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); +} ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = toType( obj ); + + if ( isFunction( obj ) || isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! + * Sizzle CSS Selector Engine v2.3.5 + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://js.foundation/ + * + * Date: 2020-03-14 + */ +( function( window ) { +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + nonnativeSelectorCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ( {} ).hasOwnProperty, + arr = [], + pop = arr.pop, + pushNative = arr.push, + push = arr.push, + slice = arr.slice, + + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[ i ] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + + "ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram + identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + + "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + + // "Attribute values must be CSS identifiers [capture 5] + // or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + + whitespace + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + + // 3. 
anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + + "*" ), + rdescend = new RegExp( whitespace + "|>" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + + whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + + whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rhtml = /HTML$/i, + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), + funescape = function( escape, nonHex ) { + var high = "0x" + escape.slice( 1 ) - 0x10000; + + return nonHex ? + + // Strip the backslash prefix from a non-hex escape sequence + nonHex : + + // Replace a hexadecimal escape sequence with the encoded Unicode code point + // Support: IE <=11+ + // For values outside the Basic Multilingual Plane (BMP), manually construct a + // surrogate pair + high < 0 ? 
+ String.fromCharCode( high + 0x10000 ) : + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + inDisabledFieldset = addCombinator( + function( elem ) { + return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + ( arr = slice.call( preferredDoc.childNodes ) ), + preferredDoc.childNodes + ); + + // Support: Android<4.0 + // Detect silently failing push.apply + // eslint-disable-next-line no-unused-expressions + arr[ preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? + + // Leverage slice if possible + function( target, els ) { + pushNative.apply( target, slice.call( els ) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + + // Can't trust NodeList.length + while ( ( target[ j++ ] = els[ i++ ] ) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? 
context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + setDocument( context ); + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { + + // ID selector + if ( ( m = match[ 1 ] ) ) { + + // Document context + if ( nodeType === 9 ) { + if ( ( elem = context.getElementById( m ) ) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && ( elem = newContext.getElementById( m ) ) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[ 2 ] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !nonnativeSelectorCache[ selector + " " ] && + ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && + + // Support: IE 8 only + // Exclude object elements + ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { + + newSelector = selector; + newContext = context; + + // qSA considers elements outside a scoping root when evaluating child or + // descendant combinators, which is not what we want. + // In such cases, we work around the behavior by prefixing every selector in the + // list with an ID selector referencing the scope context. + // The technique has to be used as well when a leading combinator is used + // as such selectors are not recognized by querySelectorAll. + // Thanks to Andrew Dupont for this technique. + if ( nodeType === 1 && + ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + + // We can use :scope instead of the ID hack if the browser + // supports it & if we're not changing the context. + if ( newContext !== context || !support.scope ) { + + // Capture the context ID, setting it first if necessary + if ( ( nid = context.getAttribute( "id" ) ) ) { + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", ( nid = expando ) ); + } + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + + toSelector( groups[ i ] ); + } + newSelector = groups.join( "," ); + } + + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + nonnativeSelectorCache( selector, true ); + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return ( cache[ key + " " ] = value ); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement( "fieldset" ); + + try { + return !!fn( el ); + } catch ( e ) { + return false; + } finally { + + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); + } + + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split( "|" ), + i = arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[ i ] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( ( cur = cur.nextSibling ) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 
1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return ( name === "input" || name === "button" ) && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + inDisabledFieldset( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction( function( argument ) { + argument = +argument; + return markFunction( function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ ( j = matchIndexes[ i ] ) ] ) { + seed[ j ] = !( matches[ j ] = seed[ j ] ); + } + } + } ); + } ); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + var namespace = elem.namespaceURI, + docElem = ( elem.ownerDocument || elem ).documentElement; + + // Support: IE <=8 + // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes + // https://bugs.jquery.com/ticket/4833 + return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9 - 11+, Edge 12 - 18+ + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( preferredDoc != document && + ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, + // Safari 4 - 5 only, Opera <=11.6 - 12.x only + // IE/Edge & older browsers don't support the :scope pseudo-class. + // Support: Safari 6.0 only + // Safari 6.0 supports :scope but it's an alias of :root there. 
+ support.scope = assert( function( el ) { + docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); + return typeof el.querySelectorAll !== "undefined" && + !el.querySelectorAll( ":scope fieldset div" ).length; + } ); + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert( function( el ) { + el.className = "i"; + return !el.getAttribute( "className" ); + } ); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert( function( el ) { + el.appendChild( document.createComment( "" ) ); + return !el.getElementsByTagName( "*" ).length; + } ); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert( function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + } ); + + // ID filter and find + if ( support.getById ) { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute( "id" ) === attrId; + }; + }; + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? [ elem ] : []; + } + }; + } else { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode( "id" ); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( ( elem = elems[ i++ ] ) ) { + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find[ "TAG" ] = support.getElementsByTagName ? 
+ function( tag, context ) { + if ( typeof context.getElementsByTagName !== "undefined" ) { + return context.getElementsByTagName( tag ); + + // DocumentFragment nodes don't have gEBTN + } else if ( support.qsa ) { + return context.querySelectorAll( tag ); + } + } : + + function( tag, context ) { + var elem, + tmp = [], + i = 0, + + // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too + results = context.getElementsByTagName( tag ); + + // Filter out possible comments + if ( tag === "*" ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem.nodeType === 1 ) { + tmp.push( elem ); + } + } + + return tmp; + } + return results; + }; + + // Class + Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { + if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { + return context.getElementsByClassName( className ); + } + }; + + /* QSA/matchesSelector + ---------------------------------------------------------------------- */ + + // QSA and matchesSelector support + + // matchesSelector(:active) reports false when true (IE9/Opera 11.5) + rbuggyMatches = []; + + // qSa(:focus) reports false when true (Chrome 21) + // We allow this because of a bug in IE8/9 that throws an error + // whenever `document.activeElement` is accessed on an iframe + // So, we allow :focus to pass through QSA all the time to avoid the IE error + // See https://bugs.jquery.com/ticket/13378 + rbuggyQSA = []; + + if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { + + // Build QSA regex + // Regex strategy adopted from Diego Perini + assert( function( el ) { + + var input; + + // Select is set to empty string on purpose + // This is to test IE's treatment of not explicitly + // setting a boolean content attribute, + // since its presence should be enough + // https://bugs.jquery.com/ticket/12359 + docElem.appendChild( el ).innerHTML = "<a id='" + expando + "'></a>" + + "<select id='" + expando + "-\r\\' msallowcapture=''>" + + "<option selected=''></option></select>"; + + // Support: IE8, Opera 11-12.16 + // Nothing should be selected when empty strings follow ^= or $= or *= + // The test attribute must be unknown in Opera but "safe" for WinRT + // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section + if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { + rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); + } + + // Support: IE8 + // Boolean attributes and "value" are not treated correctly + if ( !el.querySelectorAll( "[selected]" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); + } + + // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ + if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { + rbuggyQSA.push( "~=" ); + } + + // Support: IE 11+, Edge 15 - 18+ + // IE 11/Edge don't find elements on a `[name='']` query in some cases. + // Adding a temporary attribute to the document before the selection works + // around the issue. + // Interestingly, IE 10 & older don't seem to have the issue.
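+ // Illustrative note, not Sizzle source: each pattern pushed onto rbuggyQSA by
+ // these probes is OR-ed into one RegExp once the asserts finish (see
+ // rbuggyQSA.join( "|" ) below); selectors matching it are routed away from
+ // native querySelectorAll and through Sizzle's own matcher:
+ //     if ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) { /* qSA fast path */ }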
+ input = document.createElement( "input" ); + input.setAttribute( "name", "" ); + el.appendChild( input ); + if ( !el.querySelectorAll( "[name='']" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + + whitespace + "*(?:''|\"\")" ); + } + + // Webkit/Opera - :checked should return selected option elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + // IE8 throws error here and will not see later tests + if ( !el.querySelectorAll( ":checked" ).length ) { + rbuggyQSA.push( ":checked" ); + } + + // Support: Safari 8+, iOS 8+ + // https://bugs.webkit.org/show_bug.cgi?id=136851 + // In-page `selector#id sibling-combinator selector` fails + if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { + rbuggyQSA.push( ".#.+[+~]" ); + } + + // Support: Firefox <=3.6 - 5 only + // Old Firefox doesn't throw on a badly-escaped identifier. + el.querySelectorAll( "\\\f" ); + rbuggyQSA.push( "[\\r\\n\\f]" ); + } ); + + assert( function( el ) { + el.innerHTML = "<a href='' disabled='disabled'></a>" + + "<select disabled='disabled'><option/></select>"; + + // Support: Windows 8 Native Apps + // The type and name attributes are restricted during .innerHTML assignment + var input = document.createElement( "input" ); + input.setAttribute( "type", "hidden" ); + el.appendChild( input ).setAttribute( "name", "D" ); + + // Support: IE8 + // Enforce case-sensitivity of name attribute + if ( el.querySelectorAll( "[name=d]" ).length ) { + rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); + } + + // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) + // IE8 throws error here and will not see later tests + if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: IE9-11+ + // IE's :disabled selector does not pick up the children of disabled fieldsets + docElem.appendChild( el ).disabled = true; + if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: Opera 10 - 11 only + // Opera 10-11 does not throw on post-comma invalid pseudos + el.querySelectorAll( "*,:x" ); + rbuggyQSA.push( ",.*:" ); + } ); + } + + if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || + docElem.webkitMatchesSelector || + docElem.mozMatchesSelector || + docElem.oMatchesSelector || + docElem.msMatchesSelector ) ) ) ) { + + assert( function( el ) { + + // Check to see if it's possible to do matchesSelector + // on a disconnected node (IE 9) + support.disconnectedMatch = matches.call( el, "*" ); + + // This should fail with an exception + // Gecko does not error, returns false instead + matches.call( el, "[s!='']:x" ); + rbuggyMatches.push( "!=", pseudos ); + } ); + } + + rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); + rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); + + /* Contains + ---------------------------------------------------------------------- */ + hasCompare = rnative.test( docElem.compareDocumentPosition ); + + // Element contains another + // Purposefully self-exclusive + // As in, an element does not contain itself + contains = hasCompare || rnative.test( docElem.contains ) ? + function( a, b ) { + var adown = a.nodeType === 9 ? a.documentElement : a, + bup = b && b.parentNode; + return a === bup || !!( bup && bup.nodeType === 1 && ( + adown.contains ?
+ adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + ) ); + } : + function( a, b ) { + if ( b ) { + while ( ( b = b.parentNode ) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? + function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { + + // Choose the first element that is related to our preferred document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( a == document || a.ownerDocument == preferredDoc && + contains( preferredDoc, a ) ) { + return -1; + } + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( b == document || b.ownerDocument == preferredDoc && + contains( preferredDoc, b ) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + return a == document ? -1 : + b == document ? 1 : + /* eslint-enable eqeqeq */ + aup ? -1 : + bup ? 1 : + sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( ( cur = cur.parentNode ) ) { + ap.unshift( cur ); + } + cur = b; + while ( ( cur = cur.parentNode ) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[ i ] === bp[ i ] ) { + i++; + } + + return i ? + + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[ i ], bp[ i ] ) : + + // Otherwise nodes in our document sort first + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ /* eslint-disable eqeqeq */ + ap[ i ] == preferredDoc ? -1 : + bp[ i ] == preferredDoc ? 1 : + /* eslint-enable eqeqeq */ + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + setDocument( elem ); + + if ( support.matchesSelector && documentIsHTML && + !nonnativeSelectorCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch ( e ) { + nonnativeSelectorCache( expr, true ); + } + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( context.ownerDocument || context ) != document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( elem.ownerDocument || elem ) != document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + ( val = elem.getAttributeNode( name ) ) && val.specified ? 
+ val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return ( sel + "" ).replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + + // If no nodeType, this is expected to be an array + while ( ( node = elem[ i++ ] ) ) { + + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + + // Use textContent for elements + // innerText usage removed for consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[ 1 ] = match[ 1 ].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[ 3 ] = ( match[ 3 ] || match[ 4 ] || + match[ 5 ] || "" ).replace( runescape, funescape ); + + if ( match[ 2 ] === "~=" ) { + match[ 3 ] = " " + match[ 3 ] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[ 1 ] = match[ 1 ].toLowerCase(); + + if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { + + // nth-* requires argument + if ( !match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[ 4 ] = +( match[ 4 ] ? 
+ match[ 5 ] + ( match[ 6 ] || 1 ) : + 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); + match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); + + // other types prohibit arguments + } else if ( match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[ 6 ] && match[ 2 ]; + + if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[ 3 ] ) { + match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + + // Get excess from tokenize (recursively) + ( excess = tokenize( unquoted, true ) ) && + + // advance to the next closing parenthesis + ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { + + // excess is a negative index + match[ 0 ] = match[ 0 ].slice( 0, excess ); + match[ 2 ] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? + function() { + return true; + } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + ( pattern = new RegExp( "(^|" + whitespace + + ")" + className + "(" + whitespace + "|$)" ) ) && classCache( + className, function( elem ) { + return pattern.test( + typeof elem.className === "string" && elem.className || + typeof elem.getAttribute !== "undefined" && + elem.getAttribute( "class" ) || + "" + ); + } ); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + /* eslint-disable max-len */ + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + /* eslint-enable max-len */ + + }; + }, + + "CHILD": function( type, what, _argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, _context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? "nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( ( node = node[ dir ] ) ) { + if ( ofType ? 
+ node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( ( node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + + // Use previously-cached element index if available + if ( useCache ) { + + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + + // Use the same loop as above to seek `elem` from the start + while ( ( node = ++nodeIndex && node && node[ dir ] || + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || + ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction( function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[ i ] ); + seed[ idx ] = !( matches[ idx ] = matched[ i ] ); + } + } ) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + + // Potentially complex pseudos + "not": markFunction( function( selector ) { + + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction( function( seed, matches, _context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( ( elem = unmatched[ i ] ) ) { + seed[ i ] = !( matches[ i ] = elem ); + } + } + } ) : + function( elem, _context, xml ) { + input[ 0 ] = elem; + matcher( input, null, xml, results ); + + // Don't keep the element (issue #299) + input[ 0 ] = null; + return !results.pop(); + }; + } ), + + "has": markFunction( function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + } ), + + "contains": markFunction( function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; + }; + } ), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." + // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + + // lang value must be a valid identifier + if ( !ridentifier.test( lang || "" ) ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( ( elemLang = documentIsHTML ? 
+ elem.lang : + elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); + return false; + }; + } ), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && + ( !document.hasFocus || document.hasFocus() ) && + !!( elem.type || elem.href || ~elem.tabIndex ); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return ( nodeName === "input" && !!elem.checked ) || + ( nodeName === "option" && !!elem.selected ); + }, + + "selected": function( elem ) { + + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + // eslint-disable-next-line no-unused-expressions + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but not by others (comment: 8; processing instruction: 7; etc.) + // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos[ "empty" ]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( ( attr = elem.getAttribute( "type" ) ) == null || + attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo( function() { + return [ 0 ]; + } ), + + "last": createPositionalPseudo( function( _matchIndexes, length ) { + return [ length - 1 ]; + } ), + + "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + } ), + + "even": createPositionalPseudo( function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "odd": createPositionalPseudo( function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? + argument + length : + argument > length ? 
+ length : + argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ) + } +}; + +Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || ( match = rcomma.exec( soFar ) ) ) { + if ( match ) { + + // Don't consume trailing commas as valid + soFar = soFar.slice( match[ 0 ].length ) || soFar; + } + groups.push( ( tokens = [] ) ); + } + + matched = false; + + // Combinators + if ( ( match = rcombinators.exec( soFar ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + + // Cast descendant combinators to space + type: match[ 0 ].replace( rtrim, " " ) + } ); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || + ( match = preFilters[ type ]( match ) ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + type: type, + matches: match + } ); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? + Sizzle.error( selector ) : + + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[ i ].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? 
+ + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || ( elem[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || + ( outerCache[ elem.uniqueID ] = {} ); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( ( oldCache = uniqueCache[ key ] ) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return ( newCache[ 2 ] = oldCache[ 2 ] ); + } else { + + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? + function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[ i ]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[ 0 ]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[ i ], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( ( elem = unmatched[ i ] ) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction( function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( + selector || "*", + context.nodeType ? [ context ] : context, + [] + ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( ( elem = temp[ i ] ) ) { + matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) ) { + + // Restore matcherIn since elem is not yet a final match + temp.push( ( matcherIn[ i ] = elem ) ); + } + } + postFinder( null, ( matcherOut = [] ), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) && + ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { + + seed[ temp ] = !( results[ temp ] = elem ); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + } ); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[ 0 ].type ], + implicitRelative = leadingRelative || Expr.relative[ " " ], + i = leadingRelative ? 1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + ( checkContext = context ).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { + matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; + } else { + matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[ j ].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens + .slice( 0, i - 1 ) + .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), + + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), + len = elems.length; + + if ( outermost ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + outermostContext = context == document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( !context && elem.ownerDocument != document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( ( matcher = elementMatchers[ j++ ] ) ) { + if ( matcher( elem, context || document, xml ) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + + // They will have gone through all possible matchers + if ( ( elem = !matcher && elem ) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. 
+ if ( bySet && i !== matchedCount ) { + j = 0; + while ( ( matcher = setMatchers[ j++ ] ) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !( unmatched[ i ] || setMatched[ i ] ) ) { + setMatched[ i ] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? + markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[ i ] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( + selector, + matcherFromGroupMatchers( elementMatchers, setMatchers ) + ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( ( selector = compiled.selector || selector ) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[ 0 ] = match[ 0 ].slice( 0 ); + if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { + + context = ( Expr.find[ "ID" ]( token.matches[ 0 ] + .replace( runescape, funescape ), context ) || [] )[ 0 ]; + if ( !context ) { + return results; + + // Precompiled matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length;
+		while ( i-- ) {
+			token = tokens[ i ];
+
+			// Abort if we hit a combinator
+			if ( Expr.relative[ ( type = token.type ) ] ) {
+				break;
+			}
+			if ( ( find = Expr.find[ type ] ) ) {
+
+				// Search, expanding context for leading sibling combinators
+				if ( ( seed = find(
+					token.matches[ 0 ].replace( runescape, funescape ),
+					rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) ||
+						context
+				) ) ) {
+
+					// If seed is empty or no tokens remain, we can return early
+					tokens.splice( i, 1 );
+					selector = seed.length && toSelector( tokens );
+					if ( !selector ) {
+						push.apply( results, seed );
+						return results;
+					}
+
+					break;
+				}
+			}
+		}
+	}
+
+	// Compile and execute a filtering function if one is not provided
+	// Provide `match` to avoid retokenization if we modified the selector above
+	( compiled || compile( selector, match ) )(
+		seed,
+		context,
+		!documentIsHTML,
+		results,
+		!context || rsibling.test( selector ) && testContext( context.parentNode ) || context
+	);
+	return results;
+};
+
+// One-time assignments
+
+// Sort stability
+support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando;
+
+// Support: Chrome 14-35+
+// Always assume duplicates if they aren't passed to the comparison function
+support.detectDuplicates = !!hasDuplicate;
+
+// Initialize against the default document
+setDocument();
+
+// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27)
+// Detached nodes confoundingly follow *each other*
+support.sortDetached = assert( function( el ) {
+
+	// Should return 1, but returns 4 (following)
+	return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1;
+} );
+
+// Support: IE<8
+// Prevent attribute/property "interpolation"
+// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx
+if ( !assert( function( el ) {
+	el.innerHTML = "<a href='#'></a>";
+	return el.firstChild.getAttribute( "href" ) === "#";
+} ) ) {
+	addHandle( "type|href|height|width", function( elem, name, isXML ) {
+		if ( !isXML ) {
+			return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 );
+		}
+	} );
+}
+
+// Support: IE<9
+// Use defaultValue in place of getAttribute("value")
+if ( !support.attributes || !assert( function( el ) {
+	el.innerHTML = "<input/>";
+	el.firstChild.setAttribute( "value", "" );
+	return el.firstChild.getAttribute( "value" ) === "";
+} ) ) {
+	addHandle( "value", function( elem, _name, isXML ) {
+		if ( !isXML && elem.nodeName.toLowerCase() === "input" ) {
+			return elem.defaultValue;
+		}
+	} );
+}
+
+// Support: IE<9
+// Use getAttributeNode to fetch booleans when getAttribute lies
+if ( !assert( function( el ) {
+	return el.getAttribute( "disabled" ) == null;
+} ) ) {
+	addHandle( booleans, function( elem, name, isXML ) {
+		var val;
+		if ( !isXML ) {
+			return elem[ name ] === true ? name.toLowerCase() :
+				( val = elem.getAttributeNode( name ) ) && val.specified ?
+ val.value : + null; + } + } ); +} + +return Sizzle; + +} )( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + +}; +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Filtered directly for both simple and complex selectors + return jQuery.filter( qualifier, elements, not ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? 
+				jQuery( selector ) :
+				selector || [],
+			false
+		).length;
+	}
+} );
+
+
+// Initialize a jQuery object
+
+
+// A central reference to the root jQuery(document)
+var rootjQuery,
+
+	// A simple way to check for HTML strings
+	// Prioritize #id over <tag> to avoid XSS via location.hash (#9521)
+	// Strict HTML recognition (#11290: must start with <)
+	// Shortcut simple #id case for speed
+	rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/,
+
+	init = jQuery.fn.init = function( selector, context, root ) {
+		var match, elem;
+
+		// HANDLE: $(""), $(null), $(undefined), $(false)
+		if ( !selector ) {
+			return this;
+		}
+
+		// Method init() accepts an alternate rootjQuery
+		// so migrate can support jQuery.sub (gh-2101)
+		root = root || rootjQuery;
+
+		// Handle HTML strings
+		if ( typeof selector === "string" ) {
+			if ( selector[ 0 ] === "<" &&
+				selector[ selector.length - 1 ] === ">" &&
+				selector.length >= 3 ) {
+
+				// Assume that strings that start and end with <> are HTML and skip the regex check
+				match = [ null, selector, null ];
+
+			} else {
+				match = rquickExpr.exec( selector );
+			}
+
+			// Match html or make sure no context is specified for #id
+			if ( match && ( match[ 1 ] || !context ) ) {
+
+				// HANDLE: $(html) -> $(array)
+				if ( match[ 1 ] ) {
+					context = context instanceof jQuery ? context[ 0 ] : context;
+
+					// Option to run scripts is true for back-compat
+					// Intentionally let the error be thrown if parseHTML is not present
+					jQuery.merge( this, jQuery.parseHTML(
+						match[ 1 ],
+						context && context.nodeType ? context.ownerDocument || context : document,
+						true
+					) );
+
+					// HANDLE: $(html, props)
+					if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) {
+						for ( match in context ) {
+
+							// Properties of context are called as methods if possible
+							if ( isFunction( this[ match ] ) ) {
+								this[ match ]( context[ match ] );
+
+							// ...and otherwise set as attributes
+							} else {
+								this.attr( match, context[ match ] );
+							}
+						}
+					}
+
+					return this;
+
+				// HANDLE: $(#id)
+				} else {
+					elem = document.getElementById( match[ 2 ] );
+
+					if ( elem ) {
+
+						// Inject the element directly into the jQuery object
+						this[ 0 ] = elem;
+						this.length = 1;
+					}
+					return this;
+				}
+
+			// HANDLE: $(expr, $(...))
+			} else if ( !context || context.jquery ) {
+				return ( context || root ).find( selector );
+
+			// HANDLE: $(expr, context)
+			// (which is just equivalent to: $(context).find(expr)
+			} else {
+				return this.constructor( context ).find( selector );
+			}
+
+		// HANDLE: $(DOMElement)
+		} else if ( selector.nodeType ) {
+			this[ 0 ] = selector;
+			this.length = 1;
+			return this;
+
+		// HANDLE: $(function)
+		// Shortcut for document ready
+		} else if ( isFunction( selector ) ) {
+			return root.ready !== undefined ?
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? 
parent : null;
+	},
+	parents: function( elem ) {
+		return dir( elem, "parentNode" );
+	},
+	parentsUntil: function( elem, _i, until ) {
+		return dir( elem, "parentNode", until );
+	},
+	next: function( elem ) {
+		return sibling( elem, "nextSibling" );
+	},
+	prev: function( elem ) {
+		return sibling( elem, "previousSibling" );
+	},
+	nextAll: function( elem ) {
+		return dir( elem, "nextSibling" );
+	},
+	prevAll: function( elem ) {
+		return dir( elem, "previousSibling" );
+	},
+	nextUntil: function( elem, _i, until ) {
+		return dir( elem, "nextSibling", until );
+	},
+	prevUntil: function( elem, _i, until ) {
+		return dir( elem, "previousSibling", until );
+	},
+	siblings: function( elem ) {
+		return siblings( ( elem.parentNode || {} ).firstChild, elem );
+	},
+	children: function( elem ) {
+		return siblings( elem.firstChild );
+	},
+	contents: function( elem ) {
+		if ( elem.contentDocument != null &&
+
+			// Support: IE 11+
+			// <object> elements with no `data` attribute has an object
+			// `contentDocument` with a `null` prototype.
+			getProto( elem.contentDocument ) ) {
+
+			return elem.contentDocument;
+		}
+
+		// Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only
+		// Treat the template element as a regular one in browsers that
+		// don't support it.
+		if ( nodeName( elem, "template" ) ) {
+			elem = elem.content || elem;
+		}
+
+		return jQuery.merge( [], elem.childNodes );
+	}
+}, function( name, fn ) {
+	jQuery.fn[ name ] = function( until, selector ) {
+		var matched = jQuery.map( this, fn, until );
+
+		if ( name.slice( -5 ) !== "Until" ) {
+			selector = until;
+		}
+
+		if ( selector && typeof selector === "string" ) {
+			matched = jQuery.filter( selector, matched );
+		}
+
+		if ( this.length > 1 ) {
+
+			// Remove duplicates
+			if ( !guaranteedUnique[ name ] ) {
+				jQuery.uniqueSort( matched );
+			}
+
+			// Reverse order for parents* and prev-derivatives
+			if ( rparentsprev.test( name ) ) {
+				matched.reverse();
+			}
+		}
+
+		return this.pushStack( matched );
+	};
+} );
+var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g );
+
+
+
+// Convert String-formatted options into Object-formatted ones
+function createOptions( options ) {
+	var object = {};
+	jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) {
+		object[ flag ] = true;
+	} );
+	return object;
+}
+
+/*
+ * Create a callback list using the following parameters:
+ *
+ *	options: an optional list of space-separated options that will change how
+ *			the callback list behaves or a more traditional option object
+ *
+ * By default a callback list will act like an event callback list and can be
+ * "fired" multiple times.
+ *
+ * Possible options:
+ *
+ *	once:			will ensure the callback list can only be fired once (like a Deferred)
+ *
+ *	memory:			will keep track of previous values and will call any callback added
+ *					after the list has been fired right away with the latest "memorized"
+ *					values (like a Deferred)
+ *
+ *	unique:			will ensure a callback can only be added once (no duplicate in the list)
+ *
+ *	stopOnFalse:	interrupt callings when a callback returns false
+ *
+ */
+jQuery.Callbacks = function( options ) {
+
+	// Convert options from String-formatted to Object-formatted if needed
+	// (we check in cache first)
+	options = typeof options === "string" ?
+ createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && toType( arg ) !== "string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? + jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? 
args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... .then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( _i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? 
[ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = undefined; + args = [ returned ]; + } + + // Process the value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... 
) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onRejected ) ? + onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // rejected_handlers.disable + // fulfilled_handlers.disable + tuples[ 3 - i ][ 3 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock, + + // progress_handlers.lock + tuples[ 0 ][ 3 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! + return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the master Deferred + master = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + master.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( master.state() === "pending" || + isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return master.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); + } + + return master.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
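+// For example, a TypeError thrown inside a then() handler has a name matching
+// rerrorNames below, so jQuery.Deferred.exceptionHook reports it through
+// console.warn instead of leaving only a silently rejected Deferred behind.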
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( toType( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing function values + } else { + bulk = fn; + fn = function( elem, _key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? 
+ value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; + + +// Matches dashed string for camelizing +var rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g; + +// Used by camelCase as callback to replace() +function fcamelCase( _all, letter ) { + return letter.toUpperCase(); +} + +// Convert dashed to camelCase; used by the css and data modules +// Support: IE <=9 - 11, Edge 12 - 15 +// Microsoft forgot to hump their vendor prefix (#9572) +function camelCase( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); +} +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. + if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? 
value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( camelCase ); + } else { + key = camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
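+	// The underscore variants below mirror data()/removeData() but operate on
+	// dataPriv (internal state such as the fx queue) rather than dataUser, so
+	// internal bookkeeping never shows up in user-visible .data() results.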
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
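+			// access() calls this setter wrapper once, in bulk mode with a raw
+			// value; the each() below then applies that value to every element.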
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? 
+ this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var documentElement = document.documentElement; + + + + var isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ); + }, + composed = { composed: true }; + + // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only + // Check attachment across shadow DOM boundaries when possible (gh-3504) + // Support: iOS 10.0-10.2 only + // Early iOS 10 versions support `attachShadow` but not `getRootNode`, + // leading to errors. We need to check for `getRootNode`. + if ( documentElement.getRootNode ) { + isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ) || + elem.getRootNode( composed ) === elem.ownerDocument; + }; + } +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. + isAttached( elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, scale, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? 
"" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = elem.nodeType && + ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Support: Firefox <=54 + // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) + initial = initial / 2; + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + while ( maxIterations-- ) { + + // Evaluate and update our best guess (doubling guesses that zero out). + // Finish if the scale equals or crosses 1 (making the old*new product non-positive). + jQuery.style( elem, prop, initialInUnit + unit ); + if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { + maxIterations = 0; + } + initialInUnit = initialInUnit / scale; + + } + + initialInUnit = initialInUnit * 2; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? 
this.show() : this.hide();
+	}
+
+	return this.each( function() {
+		if ( isHiddenWithinTree( this ) ) {
+			jQuery( this ).show();
+		} else {
+			jQuery( this ).hide();
+		}
+	} );
+}
+} );
+var rcheckableType = ( /^(?:checkbox|radio)$/i );
+
+var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i );
+
+var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i );
+
+
+
+( function() {
+	var fragment = document.createDocumentFragment(),
+		div = fragment.appendChild( document.createElement( "div" ) ),
+		input = document.createElement( "input" );
+
+	// Support: Android 4.0 - 4.3 only
+	// Check state lost if the name is set (#11217)
+	// Support: Windows Web Apps (WWA)
+	// `name` and `type` must use .setAttribute for WWA (#14901)
+	input.setAttribute( "type", "radio" );
+	input.setAttribute( "checked", "checked" );
+	input.setAttribute( "name", "t" );
+
+	div.appendChild( input );
+
+	// Support: Android <=4.1 only
+	// Older WebKit doesn't clone checked state correctly in fragments
+	support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked;
+
+	// Support: IE <=11 only
+	// Make sure textarea (and checkbox) defaultValue is properly cloned
+	div.innerHTML = "<textarea>x</textarea>";
+	support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue;
+
+	// Support: IE <=9 only
+	// IE <=9 replaces <option> elements with their contents when inserted outside of
+	// the select element.
+	div.innerHTML = "<option></option>";
+	support.option = !!div.lastChild;
+} )();
+
+
+// We have to close these tags to support XHTML (#13200)
+var wrapMap = {
+
+	// XHTML parsers do not magically insert elements in the
+	// same way that tag soup parsers do. So we cannot shorten
+	// this by omitting <tbody> or other required elements.
+	thead: [ 1, "<table>", "</table>" ],
+	col: [ 2, "<table><colgroup>", "</colgroup></table>" ],
+	tr: [ 2, "<table><tbody>", "</tbody></table>" ],
+	td: [ 3, "<table><tbody><tr>", "</tr></tbody></table>" ],
+
+	_default: [ 0, "", "" ]
+};
+
+wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead;
+wrapMap.th = wrapMap.td;
+
+// Support: IE <=9 only
+if ( !support.option ) {
+	wrapMap.optgroup = wrapMap.option = [ 1, "<select multiple='multiple'>", "</select>" ];
+}
+
+
+function getAll( context, tag ) {
+
+	// Support: IE <=9 - 11 only
+	// Use typeof to avoid zero-argument method invocation on host objects (#15151)
+	var ret;
+
+	if ( typeof context.getElementsByTagName !== "undefined" ) {
+		ret = context.getElementsByTagName( tag || "*" );
+
+	} else if ( typeof context.querySelectorAll !== "undefined" ) {
+		ret = context.querySelectorAll( tag || "*" );
+
+	} else {
+		ret = [];
+	}
+
+	if ( tag === undefined || tag && nodeName( context, tag ) ) {
+		return jQuery.merge( [ context ], ret );
+	}
+
+	return ret;
+}
+
+
+// Mark scripts as having already been evaluated
+function setGlobalEval( elems, refElements ) {
+	var i = 0,
+		l = elems.length;
+
+	for ( ; i < l; i++ ) {
+		dataPriv.set(
+			elems[ i ],
+			"globalEval",
+			!refElements || dataPriv.get( refElements[ i ], "globalEval" )
+		);
+	}
+}
+
+
+var rhtml = /<|&#?\w+;/;
+
+function buildFragment( elems, context, scripts, selection, ignored ) {
+	var elem, tmp, tag, wrap, attached, j,
+		fragment = context.createDocumentFragment(),
+		nodes = [],
+		i = 0,
+		l = elems.length;
+
+	for ( ; i < l; i++ ) {
+		elem = elems[ i ];
+
+		if ( elem || elem === 0 ) {
+
+			// Add nodes directly
+			if ( toType( elem ) === "object" ) {
+
+				// Support: Android <=4.0 only, PhantomJS 1 only
+				// push.apply(_, arraylike) throws on ancient WebKit
+				jQuery.merge( nodes, elem.nodeType ? [ elem ] : elem );
+
+			// Convert non-html into a text node
+			} else if ( !rhtml.test( elem ) ) {
+				nodes.push( context.createTextNode( elem ) );
+
+			// Convert html into DOM nodes
+			} else {
+				tmp = tmp || fragment.appendChild( context.createElement( "div" ) );
+
+				// Deserialize a standard representation
+				tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase();
+				wrap = wrapMap[ tag ] || wrapMap._default;
+				tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ];
+
+				// Descend through wrappers to the right content
+				j = wrap[ 0 ];
+				while ( j-- ) {
+					tmp = tmp.lastChild;
+				}
+
+				// Support: Android <=4.0 only, PhantomJS 1 only
+				// push.apply(_, arraylike) throws on ancient WebKit
+				jQuery.merge( nodes, tmp.childNodes );
+
+				// Remember the top-level container
+				tmp = fragment.firstChild;
+
+				// Ensure the created nodes are orphaned (#12392)
+				tmp.textContent = "";
+			}
+		}
+	}
+
+	// Remove wrapper from fragment
+	fragment.textContent = "";
+
+	i = 0;
+	while ( ( elem = nodes[ i++ ] ) ) {
+
+		// Skip elements already in the context collection (trac-4087)
+		if ( selection && jQuery.inArray( elem, selection ) > -1 ) {
+			if ( ignored ) {
+				ignored.push( elem );
+			}
+			continue;
+		}
+
+		attached = isAttached( elem );
+
+		// Append to fragment
+		tmp = getAll( fragment.appendChild( elem ), "script" );
+
+		// Preserve script evaluation history
+		if ( attached ) {
+			setGlobalEval( tmp );
+		}
+
+		// Capture executables
+		if ( scripts ) {
+			j = 0;
+			while ( ( elem = tmp[ j++ ] ) ) {
+				if ( rscriptType.test( elem.type || "" ) ) {
+					scripts.push( elem );
+				}
+			}
+		}
+	}
+
+	return fragment;
+}
+
+
+var
+	rkeyEvent = /^key/,
+	rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/,
+	rtypenamespace = /^([^.]*)(?:\.(.+)|)/;
+
+function returnTrue() {
+	return true;
+}
+
+function returnFalse() {
+	return false;
+}
+
+// Support: IE <=9 - 11+
+// focus() and blur() are
asynchronous, except when they are no-op. +// So expect focus to be synchronous when the element is already active, +// and blur to be synchronous when the element is not already active. +// (focus and blur are always synchronous in other supported browsers, +// this just defines when we can count on it). +function expectSync( elem, type ) { + return ( elem === safeActiveElement() ) === ( type === "focus" ); +} + +// Support: IE <=9 only +// Accessing document.activeElement can throw unexpectedly +// https://bugs.jquery.com/ticket/13393 +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. + * Props to Dean Edwards' addEvent library for many of the ideas. + */ +jQuery.event = { + + global: {}, + + add: function( elem, types, handler, data, selector ) { + + var handleObjIn, eventHandle, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.get( elem ); + + // Only attach events to objects that accept data + if ( !acceptData( elem ) ) { + return; + } + + // Caller can pass in an object of custom data in lieu of the handler + if ( handler.handler ) { + handleObjIn = handler; + handler = handleObjIn.handler; + selector = handleObjIn.selector; + } + + // Ensure that invalid selectors throw exceptions at attach time + // Evaluate against documentElement in case elem is a non-element node (e.g., document) + if ( selector ) { + jQuery.find.matchesSelector( documentElement, selector ); + } + + // Make sure that the handler has a unique ID, used to find/remove it later + if ( !handler.guid ) { + handler.guid = jQuery.guid++; + } + + // Init the element's event structure and main handler, if this is the first + if ( !( events = elemData.events ) ) { + events = elemData.events = Object.create( null ); + } + if ( !( eventHandle = elemData.handle ) ) { + eventHandle = elemData.handle = function( e ) { + + // Discard the second event of a jQuery.event.trigger() and + // when an event is called after a page has unloaded + return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? 
+ jQuery.event.dispatch.apply( elem, arguments ) : undefined; + }; + } + + // Handle multiple events separated by a space + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // There *must* be a type, no attaching namespace-only handlers + if ( !type ) { + continue; + } + + // If event changes its type, use the special event handlers for the changed type + special = jQuery.event.special[ type ] || {}; + + // If selector defined, determine special event api type, otherwise given type + type = ( selector ? special.delegateType : special.bindType ) || type; + + // Update special based on newly reset type + special = jQuery.event.special[ type ] || {}; + + // handleObj is passed to all event handlers + handleObj = jQuery.extend( { + type: type, + origType: origType, + data: data, + handler: handler, + guid: handler.guid, + selector: selector, + needsContext: selector && jQuery.expr.match.needsContext.test( selector ), + namespace: namespaces.join( "." ) + }, handleObjIn ); + + // Init the event handler queue if we're the first + if ( !( handlers = events[ type ] ) ) { + handlers = events[ type ] = []; + handlers.delegateCount = 0; + + // Only use addEventListener if the special events handler returns false + if ( !special.setup || + special.setup.call( elem, data, namespaces, eventHandle ) === false ) { + + if ( elem.addEventListener ) { + elem.addEventListener( type, eventHandle ); + } + } + } + + if ( special.add ) { + special.add.call( elem, handleObj ); + + if ( !handleObj.handler.guid ) { + handleObj.handler.guid = handler.guid; + } + } + + // Add to the element's handler list, delegates in front + if ( selector ) { + handlers.splice( handlers.delegateCount++, 0, handleObj ); + } else { + handlers.push( handleObj ); + } + + // Keep track of which events have ever been used, for event optimization + jQuery.event.global[ type ] = true; + } + + }, + + // Detach an event or set of events from an element + remove: function( elem, types, handler, selector, mappedTypes ) { + + var j, origCount, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.hasData( elem ) && dataPriv.get( elem ); + + if ( !elemData || !( events = elemData.events ) ) { + return; + } + + // Once for each type.namespace in types; type may be omitted + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // Unbind all events (on this namespace, if provided) for the element + if ( !type ) { + for ( type in events ) { + jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); + } + continue; + } + + special = jQuery.event.special[ type ] || {}; + type = ( selector ? 
special.delegateType : special.bindType ) || type; + handlers = events[ type ] || []; + tmp = tmp[ 2 ] && + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); + + // Remove matching events + origCount = j = handlers.length; + while ( j-- ) { + handleObj = handlers[ j ]; + + if ( ( mappedTypes || origType === handleObj.origType ) && + ( !handler || handler.guid === handleObj.guid ) && + ( !tmp || tmp.test( handleObj.namespace ) ) && + ( !selector || selector === handleObj.selector || + selector === "**" && handleObj.selector ) ) { + handlers.splice( j, 1 ); + + if ( handleObj.selector ) { + handlers.delegateCount--; + } + if ( special.remove ) { + special.remove.call( elem, handleObj ); + } + } + } + + // Remove generic event handler if we removed something and no more handlers exist + // (avoids potential for endless recursion during removal of special event handlers) + if ( origCount && !handlers.length ) { + if ( !special.teardown || + special.teardown.call( elem, namespaces, elemData.handle ) === false ) { + + jQuery.removeEvent( elem, type, elemData.handle ); + } + + delete events[ type ]; + } + } + + // Remove data and the expando if it's no longer used + if ( jQuery.isEmptyObject( events ) ) { + dataPriv.remove( elem, "handle events" ); + } + }, + + dispatch: function( nativeEvent ) { + + var i, j, ret, matched, handleObj, handlerQueue, + args = new Array( arguments.length ), + + // Make a writable jQuery.Event from the native event object + event = jQuery.event.fix( nativeEvent ), + + handlers = ( + dataPriv.get( this, "events" ) || Object.create( null ) + )[ event.type ] || [], + special = jQuery.event.special[ event.type ] || {}; + + // Use the fix-ed jQuery.Event rather than the (read-only) native event + args[ 0 ] = event; + + for ( i = 1; i < arguments.length; i++ ) { + args[ i ] = arguments[ i ]; + } + + event.delegateTarget = this; + + // Call the preDispatch hook for the mapped type, and let it bail if desired + if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { + return; + } + + // Determine handlers + handlerQueue = jQuery.event.handlers.call( this, event, handlers ); + + // Run delegates first; they may want to stop propagation beneath us + i = 0; + while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { + event.currentTarget = matched.elem; + + j = 0; + while ( ( handleObj = matched.handlers[ j++ ] ) && + !event.isImmediatePropagationStopped() ) { + + // If the event is namespaced, then each handler is only invoked if it is + // specially universal or its namespaces are a superset of the event's. 
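+				// For illustration: an event triggered as "click.myPlugin" runs
+				// handlers bound as "click.myPlugin" or "click.myPlugin.extra",
+				// but not handlers bound under an unrelated namespace such as
+				// "click.other".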
+ if ( !event.rnamespace || handleObj.namespace === false || + event.rnamespace.test( handleObj.namespace ) ) { + + event.handleObj = handleObj; + event.data = handleObj.data; + + ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || + handleObj.handler ).apply( matched.elem, args ); + + if ( ret !== undefined ) { + if ( ( event.result = ret ) === false ) { + event.preventDefault(); + event.stopPropagation(); + } + } + } + } + } + + // Call the postDispatch hook for the mapped type + if ( special.postDispatch ) { + special.postDispatch.call( this, event ); + } + + return event.result; + }, + + handlers: function( event, handlers ) { + var i, handleObj, sel, matchedHandlers, matchedSelectors, + handlerQueue = [], + delegateCount = handlers.delegateCount, + cur = event.target; + + // Find delegate handlers + if ( delegateCount && + + // Support: IE <=9 + // Black-hole SVG instance trees (trac-13180) + cur.nodeType && + + // Support: Firefox <=42 + // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) + // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click + // Support: IE 11 only + // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) + !( event.type === "click" && event.button >= 1 ) ) { + + for ( ; cur !== this; cur = cur.parentNode || this ) { + + // Don't check non-elements (#13208) + // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) + if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { + matchedHandlers = []; + matchedSelectors = {}; + for ( i = 0; i < delegateCount; i++ ) { + handleObj = handlers[ i ]; + + // Don't conflict with Object.prototype properties (#13203) + sel = handleObj.selector + " "; + + if ( matchedSelectors[ sel ] === undefined ) { + matchedSelectors[ sel ] = handleObj.needsContext ? + jQuery( sel, this ).index( cur ) > -1 : + jQuery.find( sel, this, null, [ cur ] ).length; + } + if ( matchedSelectors[ sel ] ) { + matchedHandlers.push( handleObj ); + } + } + if ( matchedHandlers.length ) { + handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); + } + } + } + } + + // Add the remaining (directly-bound) handlers + cur = this; + if ( delegateCount < handlers.length ) { + handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); + } + + return handlerQueue; + }, + + addProp: function( name, hook ) { + Object.defineProperty( jQuery.Event.prototype, name, { + enumerable: true, + configurable: true, + + get: isFunction( hook ) ? + function() { + if ( this.originalEvent ) { + return hook( this.originalEvent ); + } + } : + function() { + if ( this.originalEvent ) { + return this.originalEvent[ name ]; + } + }, + + set: function( value ) { + Object.defineProperty( this, name, { + enumerable: true, + configurable: true, + writable: true, + value: value + } ); + } + } ); + }, + + fix: function( originalEvent ) { + return originalEvent[ jQuery.expando ] ? + originalEvent : + new jQuery.Event( originalEvent ); + }, + + special: { + load: { + + // Prevent triggered image.load events from bubbling to window.load + noBubble: true + }, + click: { + + // Utilize native event to ensure correct state for checkable inputs + setup: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. 
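+			// For illustration: binding the first click handler to a checkbox,
+			// e.g. jQuery( "#agree" ).on( "click", fn ) for an input#agree,
+			// runs this setup so the native click keeps the checked state
+			// consistent before handlers fire.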
+ var el = this || data; + + // Claim the first handler + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + // dataPriv.set( el, "click", ... ) + leverageNative( el, "click", returnTrue ); + } + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. + var el = this || data; + + // Force setup before triggering a click + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + leverageNative( el, "click" ); + } + + // Return non-false to allow normal event-path propagation + return true; + }, + + // For cross-browser consistency, suppress native .click() on links + // Also prevent it if we're currently inside a leveraged native-event stack + _default: function( event ) { + var target = event.target; + return rcheckableType.test( target.type ) && + target.click && nodeName( target, "input" ) && + dataPriv.get( target, "click" ) || + nodeName( target, "a" ); + } + }, + + beforeunload: { + postDispatch: function( event ) { + + // Support: Firefox 20+ + // Firefox doesn't alert if the returnValue field is not set. + if ( event.result !== undefined && event.originalEvent ) { + event.originalEvent.returnValue = event.result; + } + } + } + } +}; + +// Ensure the presence of an event listener that handles manually-triggered +// synthetic events by interrupting progress until reinvoked in response to +// *native* events that it fires directly, ensuring that state changes have +// already occurred before other listeners are invoked. +function leverageNative( el, type, expectSync ) { + + // Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add + if ( !expectSync ) { + if ( dataPriv.get( el, type ) === undefined ) { + jQuery.event.add( el, type, returnTrue ); + } + return; + } + + // Register the controller as a special universal handler for all event namespaces + dataPriv.set( el, type, false ); + jQuery.event.add( el, type, { + namespace: false, + handler: function( event ) { + var notAsync, result, + saved = dataPriv.get( this, type ); + + if ( ( event.isTrigger & 1 ) && this[ type ] ) { + + // Interrupt processing of the outer synthetic .trigger()ed event + // Saved data should be false in such cases, but might be a leftover capture object + // from an async native handler (gh-4350) + if ( !saved.length ) { + + // Store arguments for use when handling the inner native event + // There will always be at least one argument (an event object), so this array + // will not be confused with a leftover capture object. 
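+					// For illustration: jQuery( "#agree" ).trigger( "click" )
+					// lands here first; the arguments are saved, the native
+					// el.click() fires below, and the captured result decides
+					// whether the outer synthetic event is cancelled.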
+ saved = slice.call( arguments ); + dataPriv.set( this, type, saved ); + + // Trigger the native event and capture its result + // Support: IE <=9 - 11+ + // focus() and blur() are asynchronous + notAsync = expectSync( this, type ); + this[ type ](); + result = dataPriv.get( this, type ); + if ( saved !== result || notAsync ) { + dataPriv.set( this, type, false ); + } else { + result = {}; + } + if ( saved !== result ) { + + // Cancel the outer synthetic event + event.stopImmediatePropagation(); + event.preventDefault(); + return result.value; + } + + // If this is an inner synthetic event for an event with a bubbling surrogate + // (focus or blur), assume that the surrogate already propagated from triggering the + // native event and prevent that from happening again here. + // This technically gets the ordering wrong w.r.t. to `.trigger()` (in which the + // bubbling surrogate propagates *after* the non-bubbling base), but that seems + // less bad than duplication. + } else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) { + event.stopPropagation(); + } + + // If this is a native event triggered above, everything is now in order + // Fire an inner synthetic event with the original arguments + } else if ( saved.length ) { + + // ...and capture the result + dataPriv.set( this, type, { + value: jQuery.event.trigger( + + // Support: IE <=9 - 11+ + // Extend with the prototype to reset the above stopImmediatePropagation() + jQuery.extend( saved[ 0 ], jQuery.Event.prototype ), + saved.slice( 1 ), + this + ) + } ); + + // Abort handling of the native event + event.stopImmediatePropagation(); + } + } + } ); +} + +jQuery.removeEvent = function( elem, type, handle ) { + + // This "if" is needed for plain objects + if ( elem.removeEventListener ) { + elem.removeEventListener( type, handle ); + } +}; + +jQuery.Event = function( src, props ) { + + // Allow instantiation without the 'new' keyword + if ( !( this instanceof jQuery.Event ) ) { + return new jQuery.Event( src, props ); + } + + // Event object + if ( src && src.type ) { + this.originalEvent = src; + this.type = src.type; + + // Events bubbling up the document may have been marked as prevented + // by a handler lower down the tree; reflect the correct value. + this.isDefaultPrevented = src.defaultPrevented || + src.defaultPrevented === undefined && + + // Support: Android <=2.3 only + src.returnValue === false ? + returnTrue : + returnFalse; + + // Create target properties + // Support: Safari <=6 - 7 only + // Target should not be a text node (#504, #13143) + this.target = ( src.target && src.target.nodeType === 3 ) ? 
+ src.target.parentNode : + src.target; + + this.currentTarget = src.currentTarget; + this.relatedTarget = src.relatedTarget; + + // Event type + } else { + this.type = src; + } + + // Put explicitly provided properties onto the event object + if ( props ) { + jQuery.extend( this, props ); + } + + // Create a timestamp if incoming event doesn't have one + this.timeStamp = src && src.timeStamp || Date.now(); + + // Mark it as fixed + this[ jQuery.expando ] = true; +}; + +// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding +// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html +jQuery.Event.prototype = { + constructor: jQuery.Event, + isDefaultPrevented: returnFalse, + isPropagationStopped: returnFalse, + isImmediatePropagationStopped: returnFalse, + isSimulated: false, + + preventDefault: function() { + var e = this.originalEvent; + + this.isDefaultPrevented = returnTrue; + + if ( e && !this.isSimulated ) { + e.preventDefault(); + } + }, + stopPropagation: function() { + var e = this.originalEvent; + + this.isPropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopPropagation(); + } + }, + stopImmediatePropagation: function() { + var e = this.originalEvent; + + this.isImmediatePropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopImmediatePropagation(); + } + + this.stopPropagation(); + } +}; + +// Includes all common event props including KeyEvent and MouseEvent specific props +jQuery.each( { + altKey: true, + bubbles: true, + cancelable: true, + changedTouches: true, + ctrlKey: true, + detail: true, + eventPhase: true, + metaKey: true, + pageX: true, + pageY: true, + shiftKey: true, + view: true, + "char": true, + code: true, + charCode: true, + key: true, + keyCode: true, + button: true, + buttons: true, + clientX: true, + clientY: true, + offsetX: true, + offsetY: true, + pointerId: true, + pointerType: true, + screenX: true, + screenY: true, + targetTouches: true, + toElement: true, + touches: true, + + which: function( event ) { + var button = event.button; + + // Add which for key events + if ( event.which == null && rkeyEvent.test( event.type ) ) { + return event.charCode != null ? event.charCode : event.keyCode; + } + + // Add which for click: 1 === left; 2 === middle; 3 === right + if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) { + if ( button & 1 ) { + return 1; + } + + if ( button & 2 ) { + return 3; + } + + if ( button & 4 ) { + return 2; + } + + return 0; + } + + return event.which; + } +}, jQuery.event.addProp ); + +jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) { + jQuery.event.special[ type ] = { + + // Utilize native event if possible so blur/focus sequence is correct + setup: function() { + + // Claim the first handler + // dataPriv.set( this, "focus", ... ) + // dataPriv.set( this, "blur", ... ) + leverageNative( this, type, expectSync ); + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function() { + + // Force setup before trigger + leverageNative( this, type ); + + // Return non-false to allow normal event-path propagation + return true; + }, + + delegateType: delegateType + }; +} ); + +// Create mouseenter/leave events using mouseover/out and event-time checks +// so that event delegation works in jQuery. 
+// Do the same for pointerenter/pointerleave and pointerover/pointerout
+//
+// Support: Safari 7 only
+// Safari sends mouseenter too often; see:
+// https://bugs.chromium.org/p/chromium/issues/detail?id=470258
+// for the description of the bug (it existed in older Chrome versions as well).
+jQuery.each( {
+	mouseenter: "mouseover",
+	mouseleave: "mouseout",
+	pointerenter: "pointerover",
+	pointerleave: "pointerout"
+}, function( orig, fix ) {
+	jQuery.event.special[ orig ] = {
+		delegateType: fix,
+		bindType: fix,
+
+		handle: function( event ) {
+			var ret,
+				target = this,
+				related = event.relatedTarget,
+				handleObj = event.handleObj;
+
+			// For mouseenter/leave call the handler if related is outside the target.
+			// NB: No relatedTarget if the mouse left/entered the browser window
+			if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) {
+				event.type = handleObj.origType;
+				ret = handleObj.handler.apply( this, arguments );
+				event.type = fix;
+			}
+			return ret;
+		}
+	};
+} );
+
+jQuery.fn.extend( {
+
+	on: function( types, selector, data, fn ) {
+		return on( this, types, selector, data, fn );
+	},
+	one: function( types, selector, data, fn ) {
+		return on( this, types, selector, data, fn, 1 );
+	},
+	off: function( types, selector, fn ) {
+		var handleObj, type;
+		if ( types && types.preventDefault && types.handleObj ) {
+
+			// ( event )  dispatched jQuery.Event
+			handleObj = types.handleObj;
+			jQuery( types.delegateTarget ).off(
+				handleObj.namespace ?
+					handleObj.origType + "." + handleObj.namespace :
+					handleObj.origType,
+				handleObj.selector,
+				handleObj.handler
+			);
+			return this;
+		}
+		if ( typeof types === "object" ) {
+
+			// ( types-object [, selector] )
+			for ( type in types ) {
+				this.off( type, selector, types[ type ] );
+			}
+			return this;
+		}
+		if ( selector === false || typeof selector === "function" ) {
+
+			// ( types [, fn] )
+			fn = selector;
+			selector = undefined;
+		}
+		if ( fn === false ) {
+			fn = returnFalse;
+		}
+		return this.each( function() {
+			jQuery.event.remove( this, types, fn, selector );
+		} );
+	}
+} );
+
+
+var
+
+	// Support: IE <=10 - 11, Edge 12 - 13 only
+	// In IE/Edge using regex groups here causes severe slowdowns.
+	// See https://connect.microsoft.com/IE/feedback/details/1736512/
+	rnoInnerhtml = /<script|<style|<link/i,
+
+	// checked="checked" or checked
+	rchecked = /checked\s*(?:[^=]|=\s*.checked.)/i,
+
+	rcleanScript = /^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g;
+
+// Prefer a tbody over its parent table for containing new rows
+function manipulationTarget( elem, content ) {
+	if ( nodeName( elem, "table" ) &&
+		nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) {
+
+		return jQuery( elem ).children( "tbody" )[ 0 ] || elem;
+	}
+
+	return elem;
+}
+
+// Replace/restore the type attribute of script elements for safe DOM manipulation
+function disableScript( elem ) {
+	elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type;
+	return elem;
+}
+function restoreScript( elem ) {
+	if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) {
+		elem.type = elem.type.slice( 5 );
+	} else {
+		elem.removeAttribute( "type" );
+	}
+
+	return elem;
+}
+
+function cloneCopyEvent( src, dest ) {
+	var i, l, type, pdataOld, udataOld, udataCur, events;
+
+	if ( dest.nodeType !== 1 ) {
+		return;
+	}
+
+	// 1. Copy private data: events, handlers, etc.
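+	// For illustration: jQuery( "#btn" ).clone( true ) passes each source/clone
+	// pair through here, so a click handler bound on #btn fires on the copy too.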
+ if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.get( src ); + events = pdataOld.events; + + if ( events ) { + dataPriv.remove( dest, "handle events" ); + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2. Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. + if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = flat( args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + valueIsFunction = isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( valueIsFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( valueIsFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). 
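+			// For illustration, with three elements matching ".slot":
+			//
+			//   jQuery( ".slot" ).append( "<p>hi</p>" );
+			//
+			// clones the new fragment for the first two targets and moves the
+			// original fragment into the last one.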
+ for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl && !node.noModule ) { + jQuery._evalUrl( node.src, { + nonce: node.nonce || node.getAttribute( "nonce" ) + }, doc ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && isAttached( node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html; + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = isAttached( elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove 
+ elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? + jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? 
dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + +var swap = function( elem, options, callback ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.call( elem ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + +var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. 
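+	// For illustration: the first style read that needs one of these checks,
+	// e.g. jQuery.css( elem, "top" ) consulting support.pixelPosition(), runs
+	// computeStyleTests() once; later checks reuse the cached booleans.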
+ function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + container.style.cssText = "position:absolute;left:-11111px;width:60px;" + + "margin-top:1px;padding:0;border:0"; + div.style.cssText = + "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + + "margin:auto;border:1px;padding:1px;" + + "width:60%;top:1%"; + documentElement.appendChild( container ).appendChild( div ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; + + // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 + // Some styles come back with percentage values, even though they shouldn't + div.style.right = "60%"; + pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; + + // Support: IE 9 - 11 only + // Detect misreporting of content dimensions for box-sizing:border-box elements + boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; + + // Support: IE 9 only + // Detect overflow:scroll screwiness (gh-3699) + // Support: Chrome <=64 + // Don't get tricked when zoom affects offsetWidth (gh-4029) + div.style.position = "absolute"; + scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + function roundPixelMeasures( measure ) { + return Math.round( parseFloat( measure ) ); + } + + var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, + reliableTrDimensionsVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + // Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + jQuery.extend( support, { + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelBoxStyles: function() { + computeStyleTests(); + return pixelBoxStylesVal; + }, + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + }, + scrollboxSize: function() { + computeStyleTests(); + return scrollboxSizeVal; + }, + + // Support: IE 9 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Behavior in IE 9 is more subtle than in newer versions & it passes + // some versions of this test; make sure not to make it pass there! 
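+		// For illustration: the table/tr probe below is built lazily on the
+		// first call, and the resulting boolean is cached in
+		// reliableTrDimensionsVal for all later calls.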
+ reliableTrDimensions: function() { + var table, tr, trChild, trStyle; + if ( reliableTrDimensionsVal == null ) { + table = document.createElement( "table" ); + tr = document.createElement( "tr" ); + trChild = document.createElement( "div" ); + + table.style.cssText = "position:absolute;left:-11111px"; + tr.style.height = "1px"; + trChild.style.height = "9px"; + + documentElement + .appendChild( table ) + .appendChild( tr ) + .appendChild( trChild ); + + trStyle = window.getComputedStyle( tr ); + reliableTrDimensionsVal = parseInt( trStyle.height ) > 3; + + documentElement.removeChild( table ); + } + return reliableTrDimensionsVal; + } + } ); +} )(); + + +function curCSS( elem, name, computed ) { + var width, minWidth, maxWidth, ret, + + // Support: Firefox 51+ + // Retrieving style before computed somehow + // fixes an issue with getting wrong values + // on detached elements + style = elem.style; + + computed = computed || getStyles( elem ); + + // getPropertyValue is needed for: + // .css('filter') (IE 9 only, #12537) + // .css('--customProperty) (#3144) + if ( computed ) { + ret = computed.getPropertyValue( name ) || computed[ name ]; + + if ( ret === "" && !isAttached( elem ) ) { + ret = jQuery.style( elem, name ); + } + + // A tribute to the "awesome hack by Dean Edwards" + // Android Browser returns percentage for some values, + // but width seems to be reliably pixels. + // This is against the CSSOM draft spec: + // https://drafts.csswg.org/cssom/#resolved-values + if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { + + // Remember the original values + width = style.width; + minWidth = style.minWidth; + maxWidth = style.maxWidth; + + // Put in the new values to get a computed value out + style.minWidth = style.maxWidth = style.width = ret; + ret = computed.width; + + // Revert the changed values + style.width = width; + style.minWidth = minWidth; + style.maxWidth = maxWidth; + } + } + + return ret !== undefined ? + + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. 
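+			// (After this assignment, subsequent reads call hookFn directly;
+			// conditionFn is never evaluated again.)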
+ return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style, + vendorProps = {}; + +// Return a vendor-prefixed property or undefined +function vendorPropName( name ) { + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a potentially-mapped jQuery.cssProps or vendor prefixed property +function finalPropName( name ) { + var final = jQuery.cssProps[ name ] || vendorProps[ name ]; + + if ( final ) { + return final; + } + if ( name in emptyStyle ) { + return name; + } + return vendorProps[ name ] = vendorPropName( name ) || name; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }; + +function setPositiveNumber( _elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? + + // Guard against undefined "subtract", e.g., when used as in cssHooks + Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : + value; +} + +function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { + var i = dimension === "width" ? 1 : 0, + extra = 0, + delta = 0; + + // Adjustment may not be necessary + if ( box === ( isBorderBox ? 
"border" : "content" ) ) { + return 0; + } + + for ( ; i < 4; i += 2 ) { + + // Both box models exclude margin + if ( box === "margin" ) { + delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); + } + + // If we get here with a content-box, we're seeking "padding" or "border" or "margin" + if ( !isBorderBox ) { + + // Add padding + delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + + // For "border" or "margin", add border + if ( box !== "padding" ) { + delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + + // But still keep track of it otherwise + } else { + extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + + // If we get here with a border-box (content + padding + border), we're seeking "content" or + // "padding" or "margin" + } else { + + // For "content", subtract padding + if ( box === "content" ) { + delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + } + + // For "content" or "padding", subtract border + if ( box !== "margin" ) { + delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } + } + + // Account for positive content-box scroll gutter when requested by providing computedVal + if ( !isBorderBox && computedVal >= 0 ) { + + // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border + // Assuming integer scroll gutter, subtract the rest and round down + delta += Math.max( 0, Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + computedVal - + delta - + extra - + 0.5 + + // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter + // Use an explicit zero to avoid NaN (gh-3964) + ) ) || 0; + } + + return delta; +} + +function getWidthOrHeight( elem, dimension, extra ) { + + // Start with computed style + var styles = getStyles( elem ), + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). + // Fake content-box until we know it's needed to know the true value. + boxSizingNeeded = !support.boxSizingReliable() || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + valueIsBorderBox = isBorderBox, + + val = curCSS( elem, dimension, styles ), + offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); + + // Support: Firefox <=54 + // Return a confounding non-pixel value or feign ignorance, as appropriate. + if ( rnumnonpx.test( val ) ) { + if ( !extra ) { + return val; + } + val = "auto"; + } + + + // Support: IE 9 - 11 only + // Use offsetWidth/offsetHeight for when box sizing is unreliable. + // In those cases, the computed value can be trusted to be border-box. + if ( ( !support.boxSizingReliable() && isBorderBox || + + // Support: IE 10 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
+ !support.reliableTrDimensions() && nodeName( elem, "tr" ) || + + // Fall back to offsetWidth/offsetHeight when value is "auto" + // This happens for inline elements with no explicit setting (gh-3571) + val === "auto" || + + // Support: Android <=4.1 - 4.3 only + // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) + !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && + + // Make sure the element is visible & connected + elem.getClientRects().length ) { + + isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; + + // Where available, offsetWidth/offsetHeight approximate border box dimensions. + // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the + // retrieved value as a content box dimension. + valueIsBorderBox = offsetProp in elem; + if ( valueIsBorderBox ) { + val = elem[ offsetProp ]; + } + } + + // Normalize "" and auto + val = parseFloat( val ) || 0; + + // Adjust for the element's box model + return ( val + + boxModelAdjustment( + elem, + dimension, + extra || ( isBorderBox ? "border" : "content" ), + valueIsBorderBox, + styles, + + // Provide the current computed size to request scroll gutter calculation (gh-3589) + val + ) + ) + "px"; +} + +jQuery.extend( { + + // Add in style property hooks for overriding the default + // behavior of getting and setting a style property + cssHooks: { + opacity: { + get: function( elem, computed ) { + if ( computed ) { + + // We should always get a number back from opacity + var ret = curCSS( elem, "opacity" ); + return ret === "" ? "1" : ret; + } + } + } + }, + + // Don't automatically add "px" to these possibly-unitless properties + cssNumber: { + "animationIterationCount": true, + "columnCount": true, + "fillOpacity": true, + "flexGrow": true, + "flexShrink": true, + "fontWeight": true, + "gridArea": true, + "gridColumn": true, + "gridColumnEnd": true, + "gridColumnStart": true, + "gridRow": true, + "gridRowEnd": true, + "gridRowStart": true, + "lineHeight": true, + "opacity": true, + "order": true, + "orphans": true, + "widows": true, + "zIndex": true, + "zoom": true + }, + + // Add in properties whose names you wish to fix before + // setting or getting the value + cssProps: {}, + + // Get and set the style property on a DOM Node + style: function( elem, name, value, extra ) { + + // Don't set styles on text and comment nodes + if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { + return; + } + + // Make sure that we're working with the right name + var ret, type, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ), + style = elem.style; + + // Make sure that we're working with the right name. We don't + // want to query the value if it is a CSS custom property + // since they are user-defined. 
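+		// For example, "userSelect" may resolve to "WebkitUserSelect" on engines
+		// that only expose the prefixed property, while a custom property such
+		// as "--main-color" is passed through untouched.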
+ if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Gets hook for the prefixed version, then unprefixed version + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // Check if we're setting a value + if ( value !== undefined ) { + type = typeof value; + + // Convert "+=" or "-=" to relative numbers (#7345) + if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { + value = adjustCSS( elem, name, ret ); + + // Fixes bug #9237 + type = "number"; + } + + // Make sure that null and NaN values aren't set (#7116) + if ( value == null || value !== value ) { + return; + } + + // If a number was passed in, add the unit (except for certain CSS properties) + // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append + // "px" to a few hardcoded values. + if ( type === "number" && !isCustomProp ) { + value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); + } + + // background-* props affect original clone's values + if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { + style[ name ] = "inherit"; + } + + // If a hook was provided, use that value, otherwise just set the specified value + if ( !hooks || !( "set" in hooks ) || + ( value = hooks.set( elem, value, extra ) ) !== undefined ) { + + if ( isCustomProp ) { + style.setProperty( name, value ); + } else { + style[ name ] = value; + } + } + + } else { + + // If a hook was provided get the non-computed value from there + if ( hooks && "get" in hooks && + ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { + + return ret; + } + + // Otherwise just get the value from the style object + return style[ name ]; + } + }, + + css: function( elem, name, extra, styles ) { + var val, num, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ); + + // Make sure that we're working with the right name. We don't + // want to modify the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Try prefixed name followed by the unprefixed name + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // If a hook was provided get the computed value from there + if ( hooks && "get" in hooks ) { + val = hooks.get( elem, true, extra ); + } + + // Otherwise, if a way to get the computed value exists, use that + if ( val === undefined ) { + val = curCSS( elem, name, styles ); + } + + // Convert "normal" to computed value + if ( val === "normal" && name in cssNormalTransform ) { + val = cssNormalTransform[ name ]; + } + + // Make numeric if forced or a qualifier was provided and val looks numeric + if ( extra === "" || extra ) { + num = parseFloat( val ); + return extra === true || isFinite( num ) ? num || 0 : val; + } + + return val; + } +} ); + +jQuery.each( [ "height", "width" ], function( _i, dimension ) { + jQuery.cssHooks[ dimension ] = { + get: function( elem, computed, extra ) { + if ( computed ) { + + // Certain elements can have dimension info if we invisibly show them + // but it must have a current display style that would benefit + return rdisplayswap.test( jQuery.css( elem, "display" ) ) && + + // Support: Safari 8+ + // Table columns in Safari have non-zero offsetWidth & zero + // getBoundingClientRect().width unless display is changed. + // Support: IE <=11 only + // Running getBoundingClientRect on a disconnected node + // in IE throws an error. 
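+				// When the element reports no layout boxes or a zero-width rect, it
+				// is measured inside swap() below, which temporarily applies the
+				// cssShow styles ( absolute, hidden, block ) so a real dimension
+				// can be read.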
+ ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? + swap( elem, cssShow, function() { + return getWidthOrHeight( elem, dimension, extra ); + } ) : + getWidthOrHeight( elem, dimension, extra ); + } + }, + + set: function( elem, value, extra ) { + var matches, + styles = getStyles( elem ), + + // Only read styles.position if the test has a chance to fail + // to avoid forcing a reflow. + scrollboxSizeBuggy = !support.scrollboxSize() && + styles.position === "absolute", + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) + boxSizingNeeded = scrollboxSizeBuggy || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + subtract = extra ? + boxModelAdjustment( + elem, + dimension, + extra, + isBorderBox, + styles + ) : + 0; + + // Account for unreliable border-box dimensions by comparing offset* to computed and + // faking a content-box to get border and padding (gh-3699) + if ( isBorderBox && scrollboxSizeBuggy ) { + subtract -= Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + parseFloat( styles[ dimension ] ) - + boxModelAdjustment( elem, dimension, "border", false, styles ) - + 0.5 + ); + } + + // Convert to pixels if value adjustment is needed + if ( subtract && ( matches = rcssNum.exec( value ) ) && + ( matches[ 3 ] || "px" ) !== "px" ) { + + elem.style[ dimension ] = value; + value = jQuery.css( elem, dimension ); + } + + return setPositiveNumber( elem, value, subtract ); + } + }; +} ); + +jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, + function( elem, computed ) { + if ( computed ) { + return ( parseFloat( curCSS( elem, "marginLeft" ) ) || + elem.getBoundingClientRect().left - + swap( elem, { marginLeft: 0 }, function() { + return elem.getBoundingClientRect().left; + } ) + ) + "px"; + } + } +); + +// These hooks are used by animate to expand properties +jQuery.each( { + margin: "", + padding: "", + border: "Width" +}, function( prefix, suffix ) { + jQuery.cssHooks[ prefix + suffix ] = { + expand: function( value ) { + var i = 0, + expanded = {}, + + // Assumes a single number if not a string + parts = typeof value === "string" ? value.split( " " ) : [ value ]; + + for ( ; i < 4; i++ ) { + expanded[ prefix + cssExpand[ i ] + suffix ] = + parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; + } + + return expanded; + } + }; + + if ( prefix !== "margin" ) { + jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; + } +} ); + +jQuery.fn.extend( { + css: function( name, value ) { + return access( this, function( elem, name, value ) { + var styles, len, + map = {}, + i = 0; + + if ( Array.isArray( name ) ) { + styles = getStyles( elem ); + len = name.length; + + for ( ; i < len; i++ ) { + map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); + } + + return map; + } + + return value !== undefined ? + jQuery.style( elem, name, value ) : + jQuery.css( elem, name ); + }, name, value, arguments.length > 1 ); + } +} ); + + +function Tween( elem, options, prop, end, easing ) { + return new Tween.prototype.init( elem, options, prop, end, easing ); +} +jQuery.Tween = Tween; + +Tween.prototype = { + constructor: Tween, + init: function( elem, options, prop, end, easing, unit ) { + this.elem = elem; + this.prop = prop; + this.easing = easing || jQuery.easing._default; + this.options = options; + this.start = this.now = this.cur(); + this.end = end; + this.unit = unit || ( jQuery.cssNumber[ prop ] ? 
"" : "px" ); + }, + cur: function() { + var hooks = Tween.propHooks[ this.prop ]; + + return hooks && hooks.get ? + hooks.get( this ) : + Tween.propHooks._default.get( this ); + }, + run: function( percent ) { + var eased, + hooks = Tween.propHooks[ this.prop ]; + + if ( this.options.duration ) { + this.pos = eased = jQuery.easing[ this.easing ]( + percent, this.options.duration * percent, 0, 1, this.options.duration + ); + } else { + this.pos = eased = percent; + } + this.now = ( this.end - this.start ) * eased + this.start; + + if ( this.options.step ) { + this.options.step.call( this.elem, this.now, this ); + } + + if ( hooks && hooks.set ) { + hooks.set( this ); + } else { + Tween.propHooks._default.set( this ); + } + return this; + } +}; + +Tween.prototype.init.prototype = Tween.prototype; + +Tween.propHooks = { + _default: { + get: function( tween ) { + var result; + + // Use a property on the element directly when it is not a DOM element, + // or when there is no matching style property that exists. + if ( tween.elem.nodeType !== 1 || + tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { + return tween.elem[ tween.prop ]; + } + + // Passing an empty string as a 3rd parameter to .css will automatically + // attempt a parseFloat and fallback to a string if the parse fails. + // Simple values such as "10px" are parsed to Float; + // complex values such as "rotate(1rad)" are returned as-is. + result = jQuery.css( tween.elem, tween.prop, "" ); + + // Empty strings, null, undefined and "auto" are converted to 0. + return !result || result === "auto" ? 0 : result; + }, + set: function( tween ) { + + // Use step hook for back compat. + // Use cssHook if its there. + // Use .style if available and use plain properties where available. + if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && ( + jQuery.cssHooks[ tween.prop ] || + tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = Date.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
1 : 0; + for ( ; i < 4; i += 2 - includeWidth ) { + which = cssExpand[ i ]; + attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; + } + + if ( includeWidth ) { + attrs.opacity = attrs.width = type; + } + + return attrs; +} + +function createTween( value, prop, animation ) { + var tween, + collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), + index = 0, + length = collection.length; + for ( ; index < length; index++ ) { + if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { + + // We're done with this property + return tween; + } + } +} + +function defaultPrefilter( elem, props, opts ) { + var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, + isBox = "width" in props || "height" in props, + anim = this, + orig = {}, + style = elem.style, + hidden = elem.nodeType && isHiddenWithinTree( elem ), + dataShow = dataPriv.get( elem, "fxshow" ); + + // Queue-skipping animations hijack the fx hooks + if ( !opts.queue ) { + hooks = jQuery._queueHooks( elem, "fx" ); + if ( hooks.unqueued == null ) { + hooks.unqueued = 0; + oldfire = hooks.empty.fire; + hooks.empty.fire = function() { + if ( !hooks.unqueued ) { + oldfire(); + } + }; + } + hooks.unqueued++; + + anim.always( function() { + + // Ensure the complete handler is called before this completes + anim.always( function() { + hooks.unqueued--; + if ( !jQuery.queue( elem, "fx" ).length ) { + hooks.empty.fire(); + } + } ); + } ); + } + + // Detect show/hide animations + for ( prop in props ) { + value = props[ prop ]; + if ( rfxtypes.test( value ) ) { + delete props[ prop ]; + toggle = toggle || value === "toggle"; + if ( value === ( hidden ? "hide" : "show" ) ) { + + // Pretend to be hidden if this is a "show" and + // there is still data from a stopped show/hide + if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { + hidden = true; + + // Ignore all other no-op show/hide data + } else { + continue; + } + } + orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); + } + } + + // Bail out if this is a no-op like .hide().hide() + propTween = !jQuery.isEmptyObject( props ); + if ( !propTween && jQuery.isEmptyObject( orig ) ) { + return; + } + + // Restrict "overflow" and "display" styles during box animations + if ( isBox && elem.nodeType === 1 ) { + + // Support: IE <=9 - 11, Edge 12 - 15 + // Record all 3 overflow attributes because IE does not infer the shorthand + // from identically-valued overflowX and overflowY and Edge just mirrors + // the overflowX value there. 
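+		// All three inline values are saved here and restored by the
+		// anim.always() handler further down once the animation completes.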
+ opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; + + // Identify a display type, preferring old show/hide data over the CSS cascade + restoreDisplay = dataShow && dataShow.display; + if ( restoreDisplay == null ) { + restoreDisplay = dataPriv.get( elem, "display" ); + } + display = jQuery.css( elem, "display" ); + if ( display === "none" ) { + if ( restoreDisplay ) { + display = restoreDisplay; + } else { + + // Get nonempty value(s) by temporarily forcing visibility + showHide( [ elem ], true ); + restoreDisplay = elem.style.display || restoreDisplay; + display = jQuery.css( elem, "display" ); + showHide( [ elem ] ); + } + } + + // Animate inline elements as inline-block + if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { + if ( jQuery.css( elem, "float" ) === "none" ) { + + // Restore the original display value at the end of pure show/hide animations + if ( !propTween ) { + anim.done( function() { + style.display = restoreDisplay; + } ); + if ( restoreDisplay == null ) { + display = style.display; + restoreDisplay = display === "none" ? "" : display; + } + } + style.display = "inline-block"; + } + } + } + + if ( opts.overflow ) { + style.overflow = "hidden"; + anim.always( function() { + style.overflow = opts.overflow[ 0 ]; + style.overflowX = opts.overflow[ 1 ]; + style.overflowY = opts.overflow[ 2 ]; + } ); + } + + // Implement show/hide animations + propTween = false; + for ( prop in orig ) { + + // General show/hide setup for this element animation + if ( !propTween ) { + if ( dataShow ) { + if ( "hidden" in dataShow ) { + hidden = dataShow.hidden; + } + } else { + dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); + } + + // Store hidden/visible for toggle so `.stop().toggle()` "reverses" + if ( toggle ) { + dataShow.hidden = !hidden; + } + + // Show elements before animating them + if ( hidden ) { + showHide( [ elem ], true ); + } + + /* eslint-disable no-loop-func */ + + anim.done( function() { + + /* eslint-enable no-loop-func */ + + // The final step of a "hide" animation is actually hiding the element + if ( !hidden ) { + showHide( [ elem ] ); + } + dataPriv.remove( elem, "fxshow" ); + for ( prop in orig ) { + jQuery.style( elem, prop, orig[ prop ] ); + } + } ); + } + + // Per-property setup + propTween = createTween( hidden ? dataShow[ prop ] : 0, prop, anim ); + if ( !( prop in dataShow ) ) { + dataShow[ prop ] = propTween.start; + if ( hidden ) { + propTween.end = propTween.start; + propTween.start = 0; + } + } + } +} + +function propFilter( props, specialEasing ) { + var index, name, easing, value, hooks; + + // camelCase, specialEasing and expand cssHook pass + for ( index in props ) { + name = camelCase( index ); + easing = specialEasing[ name ]; + value = props[ index ]; + if ( Array.isArray( value ) ) { + easing = value[ 1 ]; + value = props[ index ] = value[ 0 ]; + } + + if ( index !== name ) { + props[ name ] = value; + delete props[ index ]; + } + + hooks = jQuery.cssHooks[ name ]; + if ( hooks && "expand" in hooks ) { + value = hooks.expand( value ); + delete props[ name ]; + + // Not quite $.extend, this won't overwrite existing keys. 
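+			// For example, { padding: "1px 2px" } expands to { paddingTop: "1px",
+			// paddingRight: "2px", paddingBottom: "1px", paddingLeft: "2px" },
+			// with each expanded property inheriting the easing of the shorthand.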
+ // Reusing 'index' because we have the correct "name" + for ( index in value ) { + if ( !( index in props ) ) { + props[ index ] = value[ index ]; + specialEasing[ index ] = easing; + } + } + } else { + specialEasing[ name ] = easing; + } + } +} + +function Animation( elem, properties, options ) { + var result, + stopped, + index = 0, + length = Animation.prefilters.length, + deferred = jQuery.Deferred().always( function() { + + // Don't match elem in the :animated selector + delete tick.elem; + } ), + tick = function() { + if ( stopped ) { + return false; + } + var currentTime = fxNow || createFxNow(), + remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), + + // Support: Android 2.3 only + // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) + temp = remaining / animation.duration || 0, + percent = 1 - temp, + index = 0, + length = animation.tweens.length; + + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( percent ); + } + + deferred.notifyWith( elem, [ animation, percent, remaining ] ); + + // If there's more to do, yield + if ( percent < 1 && length ) { + return remaining; + } + + // If this was an empty animation, synthesize a final progress notification + if ( !length ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + } + + // Resolve the animation and report its conclusion + deferred.resolveWith( elem, [ animation ] ); + return false; + }, + animation = deferred.promise( { + elem: elem, + props: jQuery.extend( {}, properties ), + opts: jQuery.extend( true, { + specialEasing: {}, + easing: jQuery.easing._default + }, options ), + originalProperties: properties, + originalOptions: options, + startTime: fxNow || createFxNow(), + duration: options.duration, + tweens: [], + createTween: function( prop, end ) { + var tween = jQuery.Tween( elem, animation.opts, prop, end, + animation.opts.specialEasing[ prop ] || animation.opts.easing ); + animation.tweens.push( tween ); + return tween; + }, + stop: function( gotoEnd ) { + var index = 0, + + // If we are going to the end, we want to run all the tweens + // otherwise we skip this part + length = gotoEnd ? 
animation.tweens.length : 0; + if ( stopped ) { + return this; + } + stopped = true; + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( 1 ); + } + + // Resolve when we played the last frame; otherwise, reject + if ( gotoEnd ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + deferred.resolveWith( elem, [ animation, gotoEnd ] ); + } else { + deferred.rejectWith( elem, [ animation, gotoEnd ] ); + } + return this; + } + } ), + props = animation.props; + + propFilter( props, animation.opts.specialEasing ); + + for ( ; index < length; index++ ) { + result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); + if ( result ) { + if ( isFunction( result.stop ) ) { + jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = + result.stop.bind( result ); + } + return result; + } + } + + jQuery.map( props, createTween, animation ); + + if ( isFunction( animation.opts.start ) ) { + animation.opts.start.call( elem, animation ); + } + + // Attach callbacks from options + animation + .progress( animation.opts.progress ) + .done( animation.opts.done, animation.opts.complete ) + .fail( animation.opts.fail ) + .always( animation.opts.always ); + + jQuery.fx.timer( + jQuery.extend( tick, { + elem: elem, + anim: animation, + queue: animation.opts.queue + } ) + ); + + return animation; +} + +jQuery.Animation = jQuery.extend( Animation, { + + tweeners: { + "*": [ function( prop, value ) { + var tween = this.createTween( prop, value ); + adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); + return tween; + } ] + }, + + tweener: function( props, callback ) { + if ( isFunction( props ) ) { + callback = props; + props = [ "*" ]; + } else { + props = props.match( rnothtmlwhite ); + } + + var prop, + index = 0, + length = props.length; + + for ( ; index < length; index++ ) { + prop = props[ index ]; + Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; + Animation.tweeners[ prop ].unshift( callback ); + } + }, + + prefilters: [ defaultPrefilter ], + + prefilter: function( callback, prepend ) { + if ( prepend ) { + Animation.prefilters.unshift( callback ); + } else { + Animation.prefilters.push( callback ); + } + } +} ); + +jQuery.speed = function( speed, easing, fn ) { + var opt = speed && typeof speed === "object" ? 
jQuery.extend( {}, speed ) : { + complete: fn || !fn && easing || + isFunction( speed ) && speed, + duration: speed, + easing: fn && easing || easing && !isFunction( easing ) && easing + }; + + // Go to the end state if fx are off + if ( jQuery.fx.off ) { + opt.duration = 0; + + } else { + if ( typeof opt.duration !== "number" ) { + if ( opt.duration in jQuery.fx.speeds ) { + opt.duration = jQuery.fx.speeds[ opt.duration ]; + + } else { + opt.duration = jQuery.fx.speeds._default; + } + } + } + + // Normalize opt.queue - true/undefined/null -> "fx" + if ( opt.queue == null || opt.queue === true ) { + opt.queue = "fx"; + } + + // Queueing + opt.old = opt.complete; + + opt.complete = function() { + if ( isFunction( opt.old ) ) { + opt.old.call( this ); + } + + if ( opt.queue ) { + jQuery.dequeue( this, opt.queue ); + } + }; + + return opt; +}; + +jQuery.fn.extend( { + fadeTo: function( speed, to, easing, callback ) { + + // Show any hidden elements after setting opacity to 0 + return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() + + // Animate to the value specified + .end().animate( { opacity: to }, speed, easing, callback ); + }, + animate: function( prop, speed, easing, callback ) { + var empty = jQuery.isEmptyObject( prop ), + optall = jQuery.speed( speed, easing, callback ), + doAnimation = function() { + + // Operate on a copy of prop so per-property easing won't be lost + var anim = Animation( this, jQuery.extend( {}, prop ), optall ); + + // Empty animations, or finishing resolves immediately + if ( empty || dataPriv.get( this, "finish" ) ) { + anim.stop( true ); + } + }; + doAnimation.finish = doAnimation; + + return empty || optall.queue === false ? + this.each( doAnimation ) : + this.queue( optall.queue, doAnimation ); + }, + stop: function( type, clearQueue, gotoEnd ) { + var stopQueue = function( hooks ) { + var stop = hooks.stop; + delete hooks.stop; + stop( gotoEnd ); + }; + + if ( typeof type !== "string" ) { + gotoEnd = clearQueue; + clearQueue = type; + type = undefined; + } + if ( clearQueue ) { + this.queue( type || "fx", [] ); + } + + return this.each( function() { + var dequeue = true, + index = type != null && type + "queueHooks", + timers = jQuery.timers, + data = dataPriv.get( this ); + + if ( index ) { + if ( data[ index ] && data[ index ].stop ) { + stopQueue( data[ index ] ); + } + } else { + for ( index in data ) { + if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { + stopQueue( data[ index ] ); + } + } + } + + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && + ( type == null || timers[ index ].queue === type ) ) { + + timers[ index ].anim.stop( gotoEnd ); + dequeue = false; + timers.splice( index, 1 ); + } + } + + // Start the next in the queue if the last step wasn't forced. + // Timers currently will call their complete callbacks, which + // will dequeue but only if they were gotoEnd. + if ( dequeue || !gotoEnd ) { + jQuery.dequeue( this, type ); + } + } ); + }, + finish: function( type ) { + if ( type !== false ) { + type = type || "fx"; + } + return this.each( function() { + var index, + data = dataPriv.get( this ), + queue = data[ type + "queue" ], + hooks = data[ type + "queueHooks" ], + timers = jQuery.timers, + length = queue ? 
queue.length : 0; + + // Enable finishing flag on private data + data.finish = true; + + // Empty the queue first + jQuery.queue( this, type, [] ); + + if ( hooks && hooks.stop ) { + hooks.stop.call( this, true ); + } + + // Look for any active animations, and finish them + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && timers[ index ].queue === type ) { + timers[ index ].anim.stop( true ); + timers.splice( index, 1 ); + } + } + + // Look for any animations in the old queue and finish them + for ( index = 0; index < length; index++ ) { + if ( queue[ index ] && queue[ index ].finish ) { + queue[ index ].finish.call( this ); + } + } + + // Turn off finishing flag + delete data.finish; + } ); + } +} ); + +jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { + var cssFn = jQuery.fn[ name ]; + jQuery.fn[ name ] = function( speed, easing, callback ) { + return speed == null || typeof speed === "boolean" ? + cssFn.apply( this, arguments ) : + this.animate( genFx( name, true ), speed, easing, callback ); + }; +} ); + +// Generate shortcuts for custom animations +jQuery.each( { + slideDown: genFx( "show" ), + slideUp: genFx( "hide" ), + slideToggle: genFx( "toggle" ), + fadeIn: { opacity: "show" }, + fadeOut: { opacity: "hide" }, + fadeToggle: { opacity: "toggle" } +}, function( name, props ) { + jQuery.fn[ name ] = function( speed, easing, callback ) { + return this.animate( props, speed, easing, callback ); + }; +} ); + +jQuery.timers = []; +jQuery.fx.tick = function() { + var timer, + i = 0, + timers = jQuery.timers; + + fxNow = Date.now(); + + for ( ; i < timers.length; i++ ) { + timer = timers[ i ]; + + // Run the timer and safely remove it when done (allowing for external removal) + if ( !timer() && timers[ i ] === timer ) { + timers.splice( i--, 1 ); + } + } + + if ( !timers.length ) { + jQuery.fx.stop(); + } + fxNow = undefined; +}; + +jQuery.fx.timer = function( timer ) { + jQuery.timers.push( timer ); + jQuery.fx.start(); +}; + +jQuery.fx.interval = 13; +jQuery.fx.start = function() { + if ( inProgress ) { + return; + } + + inProgress = true; + schedule(); +}; + +jQuery.fx.stop = function() { + inProgress = null; +}; + +jQuery.fx.speeds = { + slow: 600, + fast: 200, + + // Default speed + _default: 400 +}; + + +// Based off of the plugin by Clint Helfers, with permission. +// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ +jQuery.fn.delay = function( time, type ) { + time = jQuery.fx ? 
jQuery.fx.speeds[ time ] || time : time; + type = type || "fx"; + + return this.queue( type, function( next, hooks ) { + var timeout = window.setTimeout( next, time ); + hooks.stop = function() { + window.clearTimeout( timeout ); + }; + } ); +}; + + +( function() { + var input = document.createElement( "input" ), + select = document.createElement( "select" ), + opt = select.appendChild( document.createElement( "option" ) ); + + input.type = "checkbox"; + + // Support: Android <=4.3 only + // Default value for a checkbox should be "on" + support.checkOn = input.value !== ""; + + // Support: IE <=11 only + // Must access selectedIndex to make default options select + support.optSelected = opt.selected; + + // Support: IE <=11 only + // An input loses its value after becoming a radio + input = document.createElement( "input" ); + input.value = "t"; + input.type = "radio"; + support.radioValue = input.value === "t"; +} )(); + + +var boolHook, + attrHandle = jQuery.expr.attrHandle; + +jQuery.fn.extend( { + attr: function( name, value ) { + return access( this, jQuery.attr, name, value, arguments.length > 1 ); + }, + + removeAttr: function( name ) { + return this.each( function() { + jQuery.removeAttr( this, name ); + } ); + } +} ); + +jQuery.extend( { + attr: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set attributes on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + // Fallback to prop when attributes are not supported + if ( typeof elem.getAttribute === "undefined" ) { + return jQuery.prop( elem, name, value ); + } + + // Attribute hooks are determined by the lowercase version + // Grab necessary hook if one is defined + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + hooks = jQuery.attrHooks[ name.toLowerCase() ] || + ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); + } + + if ( value !== undefined ) { + if ( value === null ) { + jQuery.removeAttr( elem, name ); + return; + } + + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + elem.setAttribute( name, value + "" ); + return value; + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + ret = jQuery.find.attr( elem, name ); + + // Non-existent attributes return null, we normalize to undefined + return ret == null ? 
undefined : ret; + }, + + attrHooks: { + type: { + set: function( elem, value ) { + if ( !support.radioValue && value === "radio" && + nodeName( elem, "input" ) ) { + var val = elem.value; + elem.setAttribute( "type", value ); + if ( val ) { + elem.value = val; + } + return value; + } + } + } + }, + + removeAttr: function( elem, value ) { + var name, + i = 0, + + // Attribute names can contain non-HTML whitespace characters + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + attrNames = value && value.match( rnothtmlwhite ); + + if ( attrNames && elem.nodeType === 1 ) { + while ( ( name = attrNames[ i++ ] ) ) { + elem.removeAttribute( name ); + } + } + } +} ); + +// Hooks for boolean attributes +boolHook = { + set: function( elem, value, name ) { + if ( value === false ) { + + // Remove boolean attributes when set to false + jQuery.removeAttr( elem, name ); + } else { + elem.setAttribute( name, name ); + } + return name; + } +}; + +jQuery.each( jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { + var getter = attrHandle[ name ] || jQuery.find.attr; + + attrHandle[ name ] = function( elem, name, isXML ) { + var ret, handle, + lowercaseName = name.toLowerCase(); + + if ( !isXML ) { + + // Avoid an infinite loop by temporarily removing this function from the getter + handle = attrHandle[ lowercaseName ]; + attrHandle[ lowercaseName ] = ret; + ret = getter( elem, name, isXML ) != null ? + lowercaseName : + null; + attrHandle[ lowercaseName ] = handle; + } + return ret; + }; +} ); + + + + +var rfocusable = /^(?:input|select|textarea|button)$/i, + rclickable = /^(?:a|area)$/i; + +jQuery.fn.extend( { + prop: function( name, value ) { + return access( this, jQuery.prop, name, value, arguments.length > 1 ); + }, + + removeProp: function( name ) { + return this.each( function() { + delete this[ jQuery.propFix[ name ] || name ]; + } ); + } +} ); + +jQuery.extend( { + prop: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set properties on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + + // Fix name and attach hooks + name = jQuery.propFix[ name ] || name; + hooks = jQuery.propHooks[ name ]; + } + + if ( value !== undefined ) { + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + return ( elem[ name ] = value ); + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + return elem[ name ]; + }, + + propHooks: { + tabIndex: { + get: function( elem ) { + + // Support: IE <=9 - 11 only + // elem.tabIndex doesn't always return the + // correct value when it hasn't been explicitly set + // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ + // Use proper attribute retrieval(#12072) + var tabindex = jQuery.find.attr( elem, "tabindex" ); + + if ( tabindex ) { + return parseInt( tabindex, 10 ); + } + + if ( + rfocusable.test( elem.nodeName ) || + rclickable.test( elem.nodeName ) && + elem.href + ) { + return 0; + } + + return -1; + } + } + }, + + propFix: { + "for": "htmlFor", + "class": "className" + } +} ); + +// Support: IE <=11 only +// Accessing the selectedIndex property +// forces the browser to respect setting selected +// on the option +// The getter ensures a default option is selected +// when in an 
optgroup +// eslint rule "no-unused-expressions" is disabled for this code +// since it considers such accessions noop +if ( !support.optSelected ) { + jQuery.propHooks.selected = { + get: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent && parent.parentNode ) { + parent.parentNode.selectedIndex; + } + return null; + }, + set: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent ) { + parent.selectedIndex; + + if ( parent.parentNode ) { + parent.parentNode.selectedIndex; + } + } + } + }; +} + +jQuery.each( [ + "tabIndex", + "readOnly", + "maxLength", + "cellSpacing", + "cellPadding", + "rowSpan", + "colSpan", + "useMap", + "frameBorder", + "contentEditable" +], function() { + jQuery.propFix[ this.toLowerCase() ] = this; +} ); + + + + + // Strip and collapse whitespace according to HTML spec + // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace + function stripAndCollapse( value ) { + var tokens = value.match( rnothtmlwhite ) || []; + return tokens.join( " " ); + } + + +function getClass( elem ) { + return elem.getAttribute && elem.getAttribute( "class" ) || ""; +} + +function classesToArray( value ) { + if ( Array.isArray( value ) ) { + return value; + } + if ( typeof value === "string" ) { + return value.match( rnothtmlwhite ) || []; + } + return []; +} + +jQuery.fn.extend( { + addClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + if ( cur.indexOf( " " + clazz + " " ) < 0 ) { + cur += clazz + " "; + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + removeClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( !arguments.length ) { + return this.attr( "class", "" ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + + // This expression is here for better compressibility (see addClass) + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + + // Remove *all* instances + while ( cur.indexOf( " " + clazz + " " ) > -1 ) { + cur = cur.replace( " " + clazz + " ", " " ); + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + toggleClass: function( value, stateVal ) { + var type = typeof value, + isValidValue = type === "string" || Array.isArray( value ); + + if ( typeof stateVal === "boolean" && isValidValue ) { + return stateVal ? 
this.addClass( value ) : this.removeClass( value ); + } + + if ( isFunction( value ) ) { + return this.each( function( i ) { + jQuery( this ).toggleClass( + value.call( this, i, getClass( this ), stateVal ), + stateVal + ); + } ); + } + + return this.each( function() { + var className, i, self, classNames; + + if ( isValidValue ) { + + // Toggle individual class names + i = 0; + self = jQuery( this ); + classNames = classesToArray( value ); + + while ( ( className = classNames[ i++ ] ) ) { + + // Check each className given, space separated list + if ( self.hasClass( className ) ) { + self.removeClass( className ); + } else { + self.addClass( className ); + } + } + + // Toggle whole class name + } else if ( value === undefined || type === "boolean" ) { + className = getClass( this ); + if ( className ) { + + // Store className if set + dataPriv.set( this, "__className__", className ); + } + + // If the element has a class name or if we're passed `false`, + // then remove the whole classname (if there was one, the above saved it). + // Otherwise bring back whatever was previously saved (if anything), + // falling back to the empty string if nothing was stored. + if ( this.setAttribute ) { + this.setAttribute( "class", + className || value === false ? + "" : + dataPriv.get( this, "__className__" ) || "" + ); + } + } + } ); + }, + + hasClass: function( selector ) { + var className, elem, + i = 0; + + className = " " + selector + " "; + while ( ( elem = this[ i++ ] ) ) { + if ( elem.nodeType === 1 && + ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { + return true; + } + } + + return false; + } +} ); + + + + +var rreturn = /\r/g; + +jQuery.fn.extend( { + val: function( value ) { + var hooks, ret, valueIsFunction, + elem = this[ 0 ]; + + if ( !arguments.length ) { + if ( elem ) { + hooks = jQuery.valHooks[ elem.type ] || + jQuery.valHooks[ elem.nodeName.toLowerCase() ]; + + if ( hooks && + "get" in hooks && + ( ret = hooks.get( elem, "value" ) ) !== undefined + ) { + return ret; + } + + ret = elem.value; + + // Handle most common string cases + if ( typeof ret === "string" ) { + return ret.replace( rreturn, "" ); + } + + // Handle cases where value is null/undef or number + return ret == null ? "" : ret; + } + + return; + } + + valueIsFunction = isFunction( value ); + + return this.each( function( i ) { + var val; + + if ( this.nodeType !== 1 ) { + return; + } + + if ( valueIsFunction ) { + val = value.call( this, i, jQuery( this ).val() ); + } else { + val = value; + } + + // Treat null/undefined as ""; convert numbers to string + if ( val == null ) { + val = ""; + + } else if ( typeof val === "number" ) { + val += ""; + + } else if ( Array.isArray( val ) ) { + val = jQuery.map( val, function( value ) { + return value == null ? "" : value + ""; + } ); + } + + hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; + + // If set returns undefined, fall back to normal setting + if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { + this.value = val; + } + } ); + } +} ); + +jQuery.extend( { + valHooks: { + option: { + get: function( elem ) { + + var val = jQuery.find.attr( elem, "value" ); + return val != null ? 
+ val : + + // Support: IE <=10 - 11 only + // option.text throws exceptions (#14686, #14858) + // Strip and collapse whitespace + // https://html.spec.whatwg.org/#strip-and-collapse-whitespace + stripAndCollapse( jQuery.text( elem ) ); + } + }, + select: { + get: function( elem ) { + var value, option, i, + options = elem.options, + index = elem.selectedIndex, + one = elem.type === "select-one", + values = one ? null : [], + max = one ? index + 1 : options.length; + + if ( index < 0 ) { + i = max; + + } else { + i = one ? index : 0; + } + + // Loop through all the selected options + for ( ; i < max; i++ ) { + option = options[ i ]; + + // Support: IE <=9 only + // IE8-9 doesn't update selected after form reset (#2551) + if ( ( option.selected || i === index ) && + + // Don't return options that are disabled or in a disabled optgroup + !option.disabled && + ( !option.parentNode.disabled || + !nodeName( option.parentNode, "optgroup" ) ) ) { + + // Get the specific value for the option + value = jQuery( option ).val(); + + // We don't need an array for one selects + if ( one ) { + return value; + } + + // Multi-Selects return an array + values.push( value ); + } + } + + return values; + }, + + set: function( elem, value ) { + var optionSet, option, + options = elem.options, + values = jQuery.makeArray( value ), + i = options.length; + + while ( i-- ) { + option = options[ i ]; + + /* eslint-disable no-cond-assign */ + + if ( option.selected = + jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 + ) { + optionSet = true; + } + + /* eslint-enable no-cond-assign */ + } + + // Force browsers to behave consistently when non-matching value is set + if ( !optionSet ) { + elem.selectedIndex = -1; + } + return values; + } + } + } +} ); + +// Radios and checkboxes getter/setter +jQuery.each( [ "radio", "checkbox" ], function() { + jQuery.valHooks[ this ] = { + set: function( elem, value ) { + if ( Array.isArray( value ) ) { + return ( elem.checked = jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); + } + } + }; + if ( !support.checkOn ) { + jQuery.valHooks[ this ].get = function( elem ) { + return elem.getAttribute( "value" ) === null ? "on" : elem.value; + }; + } +} ); + + + + +// Return jQuery for attributes-only inclusion + + +support.focusin = "onfocusin" in window; + + +var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, + stopPropagationCallback = function( e ) { + e.stopPropagation(); + }; + +jQuery.extend( jQuery.event, { + + trigger: function( event, data, elem, onlyHandlers ) { + + var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, + eventPath = [ elem || document ], + type = hasOwn.call( event, "type" ) ? event.type : event, + namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; + + cur = lastElement = tmp = elem = elem || document; + + // Don't do events on text and comment nodes + if ( elem.nodeType === 3 || elem.nodeType === 8 ) { + return; + } + + // focus/blur morphs to focusin/out; ensure we're not firing them right now + if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { + return; + } + + if ( type.indexOf( "." ) > -1 ) { + + // Namespaced trigger; create a regexp to match event type in handle() + namespaces = type.split( "." ); + type = namespaces.shift(); + namespaces.sort(); + } + ontype = type.indexOf( ":" ) < 0 && "on" + type; + + // Caller can pass in a jQuery.Event object, Object, or just an event type string + event = event[ jQuery.expando ] ? 
+ event : + new jQuery.Event( type, typeof event === "object" && event ); + + // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) + event.isTrigger = onlyHandlers ? 2 : 3; + event.namespace = namespaces.join( "." ); + event.rnamespace = event.namespace ? + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : + null; + + // Clean up the event in case it is being reused + event.result = undefined; + if ( !event.target ) { + event.target = elem; + } + + // Clone any incoming data and prepend the event, creating the handler arg list + data = data == null ? + [ event ] : + jQuery.makeArray( data, [ event ] ); + + // Allow special events to draw outside the lines + special = jQuery.event.special[ type ] || {}; + if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { + return; + } + + // Determine event propagation path in advance, per W3C events spec (#9951) + // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) + if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { + + bubbleType = special.delegateType || type; + if ( !rfocusMorph.test( bubbleType + type ) ) { + cur = cur.parentNode; + } + for ( ; cur; cur = cur.parentNode ) { + eventPath.push( cur ); + tmp = cur; + } + + // Only add window if we got to document (e.g., not plain obj or detached DOM) + if ( tmp === ( elem.ownerDocument || document ) ) { + eventPath.push( tmp.defaultView || tmp.parentWindow || window ); + } + } + + // Fire handlers on the event path + i = 0; + while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { + lastElement = cur; + event.type = i > 1 ? + bubbleType : + special.bindType || type; + + // jQuery handler + handle = ( + dataPriv.get( cur, "events" ) || Object.create( null ) + )[ event.type ] && + dataPriv.get( cur, "handle" ); + if ( handle ) { + handle.apply( cur, data ); + } + + // Native handler + handle = ontype && cur[ ontype ]; + if ( handle && handle.apply && acceptData( cur ) ) { + event.result = handle.apply( cur, data ); + if ( event.result === false ) { + event.preventDefault(); + } + } + } + event.type = type; + + // If nobody prevented the default action, do it now + if ( !onlyHandlers && !event.isDefaultPrevented() ) { + + if ( ( !special._default || + special._default.apply( eventPath.pop(), data ) === false ) && + acceptData( elem ) ) { + + // Call a native DOM method on the target with the same name as the event. 
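+				// For example, .trigger( "submit" ) on a form invokes form.submit()
+				// and .trigger( "click" ) on a button invokes its native click(),
+				// provided no handler called event.preventDefault().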
+ // Don't do default actions on window, that's where global variables be (#6170) + if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { + + // Don't re-trigger an onFOO event when we call its FOO() method + tmp = elem[ ontype ]; + + if ( tmp ) { + elem[ ontype ] = null; + } + + // Prevent re-triggering of the same event, since we already bubbled it above + jQuery.event.triggered = type; + + if ( event.isPropagationStopped() ) { + lastElement.addEventListener( type, stopPropagationCallback ); + } + + elem[ type ](); + + if ( event.isPropagationStopped() ) { + lastElement.removeEventListener( type, stopPropagationCallback ); + } + + jQuery.event.triggered = undefined; + + if ( tmp ) { + elem[ ontype ] = tmp; + } + } + } + } + + return event.result; + }, + + // Piggyback on a donor event to simulate a different one + // Used only for `focus(in | out)` events + simulate: function( type, elem, event ) { + var e = jQuery.extend( + new jQuery.Event(), + event, + { + type: type, + isSimulated: true + } + ); + + jQuery.event.trigger( e, null, elem ); + } + +} ); + +jQuery.fn.extend( { + + trigger: function( type, data ) { + return this.each( function() { + jQuery.event.trigger( type, data, this ); + } ); + }, + triggerHandler: function( type, data ) { + var elem = this[ 0 ]; + if ( elem ) { + return jQuery.event.trigger( type, data, elem, true ); + } + } +} ); + + +// Support: Firefox <=44 +// Firefox doesn't have focus(in | out) events +// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 +// +// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 +// focus(in | out) events fire after focus & blur events, +// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order +// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 +if ( !support.focusin ) { + jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { + + // Attach a single capturing handler on the document while someone wants focusin/focusout + var handler = function( event ) { + jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); + }; + + jQuery.event.special[ fix ] = { + setup: function() { + + // Handle: regular nodes (via `this.ownerDocument`), window + // (via `this.document`) & document (via `this`). + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ); + + if ( !attaches ) { + doc.addEventListener( orig, handler, true ); + } + dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); + }, + teardown: function() { + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ) - 1; + + if ( !attaches ) { + doc.removeEventListener( orig, handler, true ); + dataPriv.remove( doc, fix ); + + } else { + dataPriv.access( doc, fix, attaches ); + } + } + }; + } ); +} +var location = window.location; + +var nonce = { guid: Date.now() }; + +var rquery = ( /\?/ ); + + + +// Cross-browser xml parsing +jQuery.parseXML = function( data ) { + var xml; + if ( !data || typeof data !== "string" ) { + return null; + } + + // Support: IE 9 - 11 only + // IE throws on parseFromString with invalid input. 
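+	// Other browsers signal failure by returning a document that contains
+	// a <parsererror> node, which is checked for after the try/catch.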
+ try { + xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); + } catch ( e ) { + xml = undefined; + } + + if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { + jQuery.error( "Invalid XML: " + data ); + } + return xml; +}; + + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && toType( obj ) === "object" ) { + + // Serialize object item. + for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. + add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = isFunction( valueOrFunction ) ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + if ( a == null ) { + return ""; + } + + // If an array was passed in, assume that it is an array of form elements. + if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? 
jQuery.makeArray( elements ) : this; + } ) + .filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ) + .map( function( _i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + + +var + r20 = /%20/g, + rhash = /#.*$/, + rantiCache = /([?&])_=[^&]*/, + rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, + + // #7653, #8125, #8152: local protocol detection + rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, + rnoContent = /^(?:GET|HEAD)$/, + rprotocol = /^\/\//, + + /* Prefilters + * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) + * 2) These are called: + * - BEFORE asking for a transport + * - AFTER param serialization (s.data is a string if s.processData is true) + * 3) key is the dataType + * 4) the catchall symbol "*" can be used + * 5) execution will start with transport dataType and THEN continue down to "*" if needed + */ + prefilters = {}, + + /* Transports bindings + * 1) key is the dataType + * 2) the catchall symbol "*" can be used + * 3) selection will start with transport dataType and THEN go to "*" if needed + */ + transports = {}, + + // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression + allTypes = "*/".concat( "*" ), + + // Anchor tag for parsing the document origin + originAnchor = document.createElement( "a" ); + originAnchor.href = location.href; + +// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport +function addToPrefiltersOrTransports( structure ) { + + // dataTypeExpression is optional and defaults to "*" + return function( dataTypeExpression, func ) { + + if ( typeof dataTypeExpression !== "string" ) { + func = dataTypeExpression; + dataTypeExpression = "*"; + } + + var dataType, + i = 0, + dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; + + if ( isFunction( func ) ) { + + // For each dataType in the dataTypeExpression + while ( ( dataType = dataTypes[ i++ ] ) ) { + + // Prepend if requested + if ( dataType[ 0 ] === "+" ) { + dataType = dataType.slice( 1 ) || "*"; + ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); + + // Otherwise append + } else { + ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); + } + } + } + }; +} + +// Base inspection function for prefilters and transports +function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { + + var inspected = {}, + seekingTransport = ( structure === transports ); + + function inspect( dataType ) { + var selected; + inspected[ dataType ] = true; + jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { + var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); + if ( typeof dataTypeOrTransport === "string" && + !seekingTransport && !inspected[ dataTypeOrTransport ] ) { + + options.dataTypes.unshift( dataTypeOrTransport ); + inspect( dataTypeOrTransport ); + return false; + } else if ( seekingTransport ) { + return !( selected = dataTypeOrTransport ); + } 
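+			// A string return from a prefilter redirects inspection to that
+			// dataType; a transport factory instead returns the transport itself,
+			// which ends the search.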
+ } ); + return selected; + } + + return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); +} + +// A special extend for ajax options +// that takes "flat" options (not to be deep extended) +// Fixes #9887 +function ajaxExtend( target, src ) { + var key, deep, + flatOptions = jQuery.ajaxSettings.flatOptions || {}; + + for ( key in src ) { + if ( src[ key ] !== undefined ) { + ( flatOptions[ key ] ? target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If 
prev can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] || + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? 
+ + // Building a settings object + ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : + + // Extending ajaxSettings + ajaxExtend( jQuery.ajaxSettings, target ); + }, + + ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), + ajaxTransport: addToPrefiltersOrTransports( transports ), + + // Main method + ajax: function( url, options ) { + + // If url is an object, simulate pre-1.5 signature + if ( typeof url === "object" ) { + options = url; + url = undefined; + } + + // Force options to be an object + options = options || {}; + + var transport, + + // URL without anti-cache param + cacheURL, + + // Response headers + responseHeadersString, + responseHeaders, + + // timeout handle + timeoutTimer, + + // Url cleanup var + urlAnchor, + + // Request state (becomes false upon send and true upon completion) + completed, + + // To know if global events are to be dispatched + fireGlobals, + + // Loop variable + i, + + // uncached part of the url + uncached, + + // Create the final options object + s = jQuery.ajaxSetup( {}, options ), + + // Callbacks context + callbackContext = s.context || s, + + // Context for global events is callbackContext if it is a DOM node or jQuery collection + globalEventContext = s.context && + ( callbackContext.nodeType || callbackContext.jquery ) ? + jQuery( callbackContext ) : + jQuery.event, + + // Deferreds + deferred = jQuery.Deferred(), + completeDeferred = jQuery.Callbacks( "once memory" ), + + // Status-dependent callbacks + statusCode = s.statusCode || {}, + + // Headers (they are sent all at once) + requestHeaders = {}, + requestHeadersNames = {}, + + // Default abort message + strAbort = "canceled", + + // Fake xhr + jqXHR = { + readyState: 0, + + // Builds headers hashtable if needed + getResponseHeader: function( key ) { + var match; + if ( completed ) { + if ( !responseHeaders ) { + responseHeaders = {}; + while ( ( match = rheaders.exec( responseHeadersString ) ) ) { + responseHeaders[ match[ 1 ].toLowerCase() + " " ] = + ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) + .concat( match[ 2 ] ); + } + } + match = responseHeaders[ key.toLowerCase() + " " ]; + } + return match == null ? null : match.join( ", " ); + }, + + // Raw string + getAllResponseHeaders: function() { + return completed ? 
responseHeadersString : null; + }, + + // Caches the header + setRequestHeader: function( name, value ) { + if ( completed == null ) { + name = requestHeadersNames[ name.toLowerCase() ] = + requestHeadersNames[ name.toLowerCase() ] || name; + requestHeaders[ name ] = value; + } + return this; + }, + + // Overrides response content-type header + overrideMimeType: function( type ) { + if ( completed == null ) { + s.mimeType = type; + } + return this; + }, + + // Status-dependent callbacks + statusCode: function( map ) { + var code; + if ( map ) { + if ( completed ) { + + // Execute the appropriate callbacks + jqXHR.always( map[ jqXHR.status ] ); + } else { + + // Lazy-add the new callbacks in a way that preserves old ones + for ( code in map ) { + statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; + } + } + } + return this; + }, + + // Cancel the request + abort: function( statusText ) { + var finalText = statusText || strAbort; + if ( transport ) { + transport.abort( finalText ); + } + done( 0, finalText ); + return this; + } + }; + + // Attach deferreds + deferred.promise( jqXHR ); + + // Add protocol if not provided (prefilters might expect it) + // Handle falsy url in the settings object (#10093: consistency with old signature) + // We also use the url parameter if available + s.url = ( ( url || s.url || location.href ) + "" ) + .replace( rprotocol, location.protocol + "//" ); + + // Alias method option to type as per ticket #12004 + s.type = options.method || options.type || s.method || s.type; + + // Extract dataTypes list + s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; + + // A cross-domain request is in order when the origin doesn't match the current origin. + if ( s.crossDomain == null ) { + urlAnchor = document.createElement( "a" ); + + // Support: IE <=8 - 11, Edge 12 - 15 + // IE throws exception on accessing the href property if url is malformed, + // e.g. 
http://example.com:80x/ + try { + urlAnchor.href = s.url; + + // Support: IE <=8 - 11 only + // Anchor's host property isn't correctly set when s.url is relative + urlAnchor.href = urlAnchor.href; + s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== + urlAnchor.protocol + "//" + urlAnchor.host; + } catch ( e ) { + + // If there is an error parsing the URL, assume it is crossDomain, + // it can be rejected by the transport if it is invalid + s.crossDomain = true; + } + } + + // Convert data if not already a string + if ( s.data && s.processData && typeof s.data !== "string" ) { + s.data = jQuery.param( s.data, s.traditional ); + } + + // Apply prefilters + inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); + + // If request was aborted inside a prefilter, stop there + if ( completed ) { + return jqXHR; + } + + // We can fire global events as of now if asked to + // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) + fireGlobals = jQuery.event && s.global; + + // Watch for a new set of requests + if ( fireGlobals && jQuery.active++ === 0 ) { + jQuery.event.trigger( "ajaxStart" ); + } + + // Uppercase the type + s.type = s.type.toUpperCase(); + + // Determine if request has content + s.hasContent = !rnoContent.test( s.type ); + + // Save the URL in case we're toying with the If-Modified-Since + // and/or If-None-Match header later on + // Remove hash to simplify url manipulation + cacheURL = s.url.replace( rhash, "" ); + + // More options handling for requests with no content + if ( !s.hasContent ) { + + // Remember the hash so we can put it back + uncached = s.url.slice( cacheURL.length ); + + // If data is available and should be processed, append data to url + if ( s.data && ( s.processData || typeof s.data === "string" ) ) { + cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; + + // #9682: remove data so that it's not used in an eventual retry + delete s.data; + } + + // Add or update anti-cache param if needed + if ( s.cache === false ) { + cacheURL = cacheURL.replace( rantiCache, "$1" ); + uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + + uncached; + } + + // Put hash and anti-cache on the URL that will be requested (gh-1732) + s.url = cacheURL + uncached; + + // Change '%20' to '+' if this is encoded form body content (gh-2658) + } else if ( s.data && s.processData && + ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { + s.data = s.data.replace( r20, "+" ); + } + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. + if ( s.ifModified ) { + if ( jQuery.lastModified[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); + } + if ( jQuery.etag[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); + } + } + + // Set the correct header, if data is being sent + if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { + jqXHR.setRequestHeader( "Content-Type", s.contentType ); + } + + // Set the Accepts header for the server, depending on the dataType + jqXHR.setRequestHeader( + "Accept", + s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? + s.accepts[ s.dataTypes[ 0 ] ] + + ( s.dataTypes[ 0 ] !== "*" ? 
", " + allTypes + "; q=0.01" : "" ) : + s.accepts[ "*" ] + ); + + // Check for headers option + for ( i in s.headers ) { + jqXHR.setRequestHeader( i, s.headers[ i ] ); + } + + // Allow custom headers/mimetypes and early abort + if ( s.beforeSend && + ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { + + // Abort if not done already and return + return jqXHR.abort(); + } + + // Aborting is no longer a cancellation + strAbort = "abort"; + + // Install callbacks on deferreds + completeDeferred.add( s.complete ); + jqXHR.done( s.success ); + jqXHR.fail( s.error ); + + // Get transport + transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); + + // If no transport, we auto-abort + if ( !transport ) { + done( -1, "No Transport" ); + } else { + jqXHR.readyState = 1; + + // Send global event + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); + } + + // If request was aborted inside ajaxSend, stop there + if ( completed ) { + return jqXHR; + } + + // Timeout + if ( s.async && s.timeout > 0 ) { + timeoutTimer = window.setTimeout( function() { + jqXHR.abort( "timeout" ); + }, s.timeout ); + } + + try { + completed = false; + transport.send( requestHeaders, done ); + } catch ( e ) { + + // Rethrow post-completion exceptions + if ( completed ) { + throw e; + } + + // Propagate others as results + done( -1, e ); + } + } + + // Callback for when everything is done + function done( status, nativeStatusText, responses, headers ) { + var isSuccess, success, error, response, modified, + statusText = nativeStatusText; + + // Ignore repeat invocations + if ( completed ) { + return; + } + + completed = true; + + // Clear timeout if it exists + if ( timeoutTimer ) { + window.clearTimeout( timeoutTimer ); + } + + // Dereference transport for early garbage collection + // (no matter how long the jqXHR object will be used) + transport = undefined; + + // Cache response headers + responseHeadersString = headers || ""; + + // Set readyState + jqXHR.readyState = status > 0 ? 4 : 0; + + // Determine if successful + isSuccess = status >= 200 && status < 300 || status === 304; + + // Get response data + if ( responses ) { + response = ajaxHandleResponses( s, jqXHR, responses ); + } + + // Use a noop converter for missing script + if ( !isSuccess && jQuery.inArray( "script", s.dataTypes ) > -1 ) { + s.converters[ "text script" ] = function() {}; + } + + // Convert no matter what (that way responseXXX fields are always set) + response = ajaxConvert( s, response, jqXHR, isSuccess ); + + // If successful, handle type chaining + if ( isSuccess ) { + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + modified = jqXHR.getResponseHeader( "Last-Modified" ); + if ( modified ) { + jQuery.lastModified[ cacheURL ] = modified; + } + modified = jqXHR.getResponseHeader( "etag" ); + if ( modified ) { + jQuery.etag[ cacheURL ] = modified; + } + } + + // if no content + if ( status === 204 || s.type === "HEAD" ) { + statusText = "nocontent"; + + // if not modified + } else if ( status === 304 ) { + statusText = "notmodified"; + + // If we have data, let's convert it + } else { + statusText = response.state; + success = response.data; + error = response.error; + isSuccess = !error; + } + } else { + + // Extract error from statusText and normalize for non-aborts + error = statusText; + if ( status || !statusText ) { + statusText = "error"; + if ( status < 0 ) { + status = 0; + } + } + } + + // Set data for the fake xhr object + jqXHR.status = status; + jqXHR.statusText = ( nativeStatusText || statusText ) + ""; + + // Success/Error + if ( isSuccess ) { + deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); + } else { + deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); + } + + // Status-dependent callbacks + jqXHR.statusCode( statusCode ); + statusCode = undefined; + + if ( fireGlobals ) { + globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", + [ jqXHR, s, isSuccess ? success : error ] ); + } + + // Complete + completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); + + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); + + // Handle the global AJAX counter + if ( !( --jQuery.active ) ) { + jQuery.event.trigger( "ajaxStop" ); + } + } + } + + return jqXHR; + }, + + getJSON: function( url, data, callback ) { + return jQuery.get( url, data, callback, "json" ); + }, + + getScript: function( url, callback ) { + return jQuery.get( url, undefined, callback, "script" ); + } +} ); + +jQuery.each( [ "get", "post" ], function( _i, method ) { + jQuery[ method ] = function( url, data, callback, type ) { + + // Shift arguments if data argument was omitted + if ( isFunction( data ) ) { + type = type || callback; + callback = data; + data = undefined; + } + + // The url can be an options object (which then must have .url) + return jQuery.ajax( jQuery.extend( { + url: url, + type: method, + dataType: type, + data: data, + success: callback + }, jQuery.isPlainObject( url ) && url ) ); + }; +} ); + +jQuery.ajaxPrefilter( function( s ) { + var i; + for ( i in s.headers ) { + if ( i.toLowerCase() === "content-type" ) { + s.contentType = s.headers[ i ] || ""; + } + } +} ); + + +jQuery._evalUrl = function( url, options, doc ) { + return jQuery.ajax( { + url: url, + + // Make this explicit, since user can override this through ajaxSetup (#11264) + type: "GET", + dataType: "script", + cache: true, + async: false, + global: false, + + // Only evaluate the response if it is successful (gh-4126) + // dataFilter is not invoked for failure responses, so using it instead + // of the default converter is kludgy but it works. 
+ converters: { + "text script": function() {} + }, + dataFilter: function( response ) { + jQuery.globalEval( response, options, doc ); + } + } ); +}; + + +jQuery.fn.extend( { + wrapAll: function( html ) { + var wrap; + + if ( this[ 0 ] ) { + if ( isFunction( html ) ) { + html = html.call( this[ 0 ] ); + } + + // The elements to wrap the target around + wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); + + if ( this[ 0 ].parentNode ) { + wrap.insertBefore( this[ 0 ] ); + } + + wrap.map( function() { + var elem = this; + + while ( elem.firstElementChild ) { + elem = elem.firstElementChild; + } + + return elem; + } ).append( this ); + } + + return this; + }, + + wrapInner: function( html ) { + if ( isFunction( html ) ) { + return this.each( function( i ) { + jQuery( this ).wrapInner( html.call( this, i ) ); + } ); + } + + return this.each( function() { + var self = jQuery( this ), + contents = self.contents(); + + if ( contents.length ) { + contents.wrapAll( html ); + + } else { + self.append( html ); + } + } ); + }, + + wrap: function( html ) { + var htmlIsFunction = isFunction( html ); + + return this.each( function( i ) { + jQuery( this ).wrapAll( htmlIsFunction ? html.call( this, i ) : html ); + } ); + }, + + unwrap: function( selector ) { + this.parent( selector ).not( "body" ).each( function() { + jQuery( this ).replaceWith( this.childNodes ); + } ); + return this; + } +} ); + + +jQuery.expr.pseudos.hidden = function( elem ) { + return !jQuery.expr.pseudos.visible( elem ); +}; +jQuery.expr.pseudos.visible = function( elem ) { + return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); +}; + + + + +jQuery.ajaxSettings.xhr = function() { + try { + return new window.XMLHttpRequest(); + } catch ( e ) {} +}; + +var xhrSuccessStatus = { + + // File protocol always yields status code 0, assume 200 + 0: 200, + + // Support: IE <=9 only + // #1450: sometimes IE returns 1223 when it should be 204 + 1223: 204 + }, + xhrSupported = jQuery.ajaxSettings.xhr(); + +support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); +support.ajax = xhrSupported = !!xhrSupported; + +jQuery.ajaxTransport( function( options ) { + var callback, errorCallback; + + // Cross domain only allowed if supported through XMLHttpRequest + if ( support.cors || xhrSupported && !options.crossDomain ) { + return { + send: function( headers, complete ) { + var i, + xhr = options.xhr(); + + xhr.open( + options.type, + options.url, + options.async, + options.username, + options.password + ); + + // Apply custom fields if provided + if ( options.xhrFields ) { + for ( i in options.xhrFields ) { + xhr[ i ] = options.xhrFields[ i ]; + } + } + + // Override mime type if needed + if ( options.mimeType && xhr.overrideMimeType ) { + xhr.overrideMimeType( options.mimeType ); + } + + // X-Requested-With header + // For cross-domain requests, seeing as conditions for a preflight are + // akin to a jigsaw puzzle, we simply never set it to be sure. + // (it can always be set on a per-request basis or even using ajaxSetup) + // For same-domain requests, won't change header if already provided. 
+ if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { + headers[ "X-Requested-With" ] = "XMLHttpRequest"; + } + + // Set headers + for ( i in headers ) { + xhr.setRequestHeader( i, headers[ i ] ); + } + + // Callback + callback = function( type ) { + return function() { + if ( callback ) { + callback = errorCallback = xhr.onload = + xhr.onerror = xhr.onabort = xhr.ontimeout = + xhr.onreadystatechange = null; + + if ( type === "abort" ) { + xhr.abort(); + } else if ( type === "error" ) { + + // Support: IE <=9 only + // On a manual native abort, IE9 throws + // errors on any property access that is not readyState + if ( typeof xhr.status !== "number" ) { + complete( 0, "error" ); + } else { + complete( + + // File: protocol always yields status 0; see #8605, #14207 + xhr.status, + xhr.statusText + ); + } + } else { + complete( + xhrSuccessStatus[ xhr.status ] || xhr.status, + xhr.statusText, + + // Support: IE <=9 only + // IE9 has no XHR2 but throws on binary (trac-11426) + // For XHR2 non-text, let the caller handle it (gh-2498) + ( xhr.responseType || "text" ) !== "text" || + typeof xhr.responseText !== "string" ? + { binary: xhr.response } : + { text: xhr.responseText }, + xhr.getAllResponseHeaders() + ); + } + } + }; + }; + + // Listen to events + xhr.onload = callback(); + errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); + + // Support: IE 9 only + // Use onreadystatechange to replace onabort + // to handle uncaught aborts + if ( xhr.onabort !== undefined ) { + xhr.onabort = errorCallback; + } else { + xhr.onreadystatechange = function() { + + // Check readyState before timeout as it changes + if ( xhr.readyState === 4 ) { + + // Allow onerror to be called first, + // but that will not handle a native abort + // Also, save errorCallback to a variable + // as xhr.onerror cannot be accessed + window.setTimeout( function() { + if ( callback ) { + errorCallback(); + } + } ); + } + }; + } + + // Create the abort callback + callback = callback( "abort" ); + + try { + + // Do send the request (this may raise an exception) + xhr.send( options.hasContent && options.data || null ); + } catch ( e ) { + + // #14683: Only rethrow if this hasn't been notified as an error yet + if ( callback ) { + throw e; + } + } + }, + + abort: function() { + if ( callback ) { + callback(); + } + } + }; + } +} ); + + + + +// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) +jQuery.ajaxPrefilter( function( s ) { + if ( s.crossDomain ) { + s.contents.script = false; + } +} ); + +// Install script dataType +jQuery.ajaxSetup( { + accepts: { + script: "text/javascript, application/javascript, " + + "application/ecmascript, application/x-ecmascript" + }, + contents: { + script: /\b(?:java|ecma)script\b/ + }, + converters: { + "text script": function( text ) { + jQuery.globalEval( text ); + return text; + } + } +} ); + +// Handle cache's special case and crossDomain +jQuery.ajaxPrefilter( "script", function( s ) { + if ( s.cache === undefined ) { + s.cache = false; + } + if ( s.crossDomain ) { + s.type = "GET"; + } +} ); + +// Bind script tag hack transport +jQuery.ajaxTransport( "script", function( s ) { + + // This transport only deals with cross domain or forced-by-attrs requests + if ( s.crossDomain || s.scriptAttrs ) { + var script, callback; + return { + send: function( _, complete ) { + script = jQuery( " + + + + + + + + + + + + + + + + +
+
+
+ + +
+ +
+

Column mapping transforms

+

Each header below represents a column mapping transform type. Transforms are used in the context of column_mappings.

+

Some transforms refer to “a” or “b”. This means the transform applies only to the column from one of the two datasets to be linked (we’re trying to link people in dataset “a” with people in dataset “b”).

+

More than one transform can be applied to a column. Transforms apply in the order they’re listed, so the output of one transform may be the input of another.

+

Each transform applies to the column specified by the column_name attribute in the config under the [[column_mappings]] section. The transforms attribute
+indicates the type of each transform, which must be one of the types listed below. Along with type, a transform can take additional attributes.
+These vary by type and are documented for each transform below. Often an additional attribute is simply named value or values.

+
[[column_mappings]]
+alias = "namefrst_split"
+column_name = "namefrst_clean"
+transforms = [ { type = "split" } ]
+
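Because transforms apply in order, a column can be cleaned and then split in a single mapping. A minimal sketch, assuming a raw namefrst column (the alias is hypothetical; both transform types are documented below):
+
+# hypothetical chained example
+[[column_mappings]]
+alias = "namefrst_clean_split"
+column_name = "namefrst"
+transforms = [
+  { type = "lowercase_strip" },
+  { type = "split" }
+]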
+
+
+

add_to_a

+

Add a value to a column from dataset “a”.

+
transforms = [ { type = "add_to_a", value = 11 } ]
+
+
+
+
+

concat_to_a

+

Concatenate the string value to the end of a column in dataset “a”.

+
transforms = [ { type = "concat_to_a", value = " "} ]
+
+
+
+
+

concat_to_b

+

Concatenate the string value to the end of a column in dataset “b”.

+
transforms = [ { type = "concat_to_b", value = " "} ]
+
+
+
+
+

lowercase_strip

+

Used in name cleaning.

+

Convert alphabetical characters to lower-case and strip white space characters from the start and end of the strings in the column.

+
transforms = [ { type = "lowercase_strip"} ]
+
+
+
+
+

rationalize_name_words

+

Used in name cleaning.

+

Replace ‘?’, ‘*’, and ‘-’ with spaces. Since people’s names in raw census data can contain these
+characters, replacing them can lead to better matching.

+
transforms = [ { type = "rationalize_name_words"} ]
+
+
+
+
+

remove_qmark_hyphen

+

Used in name cleaning.

+

Remove ‘?-’ from words, replacing it with nothing.

+
transforms = [ { type = "remove_qmark_hyphen"} ]
+
+
+
+
+

remove_punctuation

+

Remove most punctuation and replace with nothing.

+

Removes:

+
? - \ / " ' : , . [ ] { }
+
+
+
transforms = [ { type = "remove_punctuation"} ]
+
+
+
+
+

replace_apostrophe

+

Used in name cleaning.

+

Replace each apostrophe “’” with a space.

+
transforms = [ { type = "replace_apostrophe"} ]
+
+
+
+
+

remove_alternate_names

+

Used in name cleaning.

+

Remove any names following the string ‘or’.

+
transforms = [ { type = "remove_alternate_names"} ]
+
+
+
+
+

remove_suffixes

+

Used in name cleaning.

+

Given a list of suffixes, remove them from the names in the column.

+
transforms=[{ type = "remove_suffixes",  values = ["jr", "sr", "ii", "iii"] }]
+
+
+
+
+

remove_stop_words

+

Used in name cleaning.

+

Remove stop words, such as street-name words, from the values in the column.

+
transforms=[
+{type = "remove_stop_words", values = ['alley','ally','aly','anex','annex','av','ave','aven','avenu','avenue','avn','avnue','avanue','avaneu','bg','blvd','boul','boulevard','brg','bridge','burg','camp','circle','cor', 'corner', 'corners','cors', 'court', 'courts', 'cp', 'cres', 'crescent', 'ct', 'cts', 'dr','driv', 'drive', 'est', 'estate', 'express', 'expressway', 'ext', 'extension', 'ferry', 'fort', 'frt', 'fry', 'ft', 'heights', 'ht', 'hts', 'is', 'island', 'key', 'ky', 'ldg', 'lodge', 'mill', 'mills', 'ml', 'mls', 'mount', 'mountain', 'mountin', 'mt', 'mtn', 'park', 'parkway','pike', 'pikes','pkwy', 'pl', 'place', 'point', 'points', 'pr', 'prairie', 'prk', 'pt', 'pts', 'rad', 'radial', 'rd', 'rds', 'rest', 'riv', 'river', 'road', 'roads', 'rst', 'spgs', 'springs', 'sq', 'square', 'st', 'sta', 'station', 'str', 'street', 'streets', 'strt', 'sts', 'ter', 'terrace', 'track', 'tracks', 'trail', 'trails', 'trnpk', 'turnpike', 'un', 'union', 'valley', 'vally', 'via', 'viaduct', 'vill', 'villag', 'village', 'villiage', 'well', 'wl', 'wl', 'and','of','.',',','-','/','&','south','north','east','west','s','n','e','w','block']}]
+  
+
+
+
+
+

remove_prefixes

+

Used in name cleaning.

+

Remove prefixes like “Ms.”, “Mr.”, or “Mrs.” from names.

+

In some census data, “ah” appears as such a prefix in Chinese names.

+
transforms=[{ type = "remove_prefixes", values = ["ah"]}]
+
+
+
+
+

condense_strip_whitespace

+

Used in name cleaning.

+

Collapse white space that is more than one character long, or that contains non-space whitespace characters (such as tabs), into a single space.

+

+transforms=[{ type = "condense_strip_whitespace"}]
+
+
+
+
+

remove_one_letter_names

+

Used in name cleaning.

+

If a name is a single character, remove it and leave the white space behind.

+
transforms=[{ type = "remove_one_letter_names"}]
+
+
+
+
+

split

+

Split the column value on space characters (“ ”).

+
[[column_mappings]]
+alias = "namefrst_split"
+column_name = "namefrst_clean"
+transforms = [ { type = "split" } ]
+
+
+
+
+

array_index

+

If the column contains an array, select the element at the given position.

+

This can be used as the input to another transform. In the example below, the first transform selects the second (index 1) item from the “namefrst_split” column that contains a set of names split on white space. Then, the substring 0,1 is selected which gives the first initial of the person’s probable middle name.

+
alias = "namefrst_mid_init"
+column_name = "namefrst_split"
+transforms = [
+ { type = "array_index", value = 1},
+ { type = "substring", values = [0, 1]}
+]
+
+
+
+
+

mapping

+

Map single or multiple values to a single output value, otherwise known as a “recoding.”

+
[[column_mappings]]
+column_name = "birthyr"
+alias = "clean_birthyr"
+transforms = [
+  { type = "mapping",
+    values = [
+      { "from" = [9999, 1999], "to" = "" },
+      { "from" = -9998, "to" = 9999 }
+    ] }
+]
+
+
+
+
+

substring

+

Replace a column with a substring of the data in the column.

+
transforms = [
+ { type = "substring", values = [0, 1]}]
+
+
+
+
+

divide_by_int

+

Divide data in a column by an integer value. It may leave a non-integer result.

+

For instance, this transform takes the birthplace variable and converts it from the detailed version to the general version. The two least significant digits are detailed birthplace information; to make the more general version, we simply drop them by dividing by 100 and rounding to the lowest whole number (floor function).

+
[[column_mappings]]
+column_name = "bpl"
+alias = "bpl_root"
+transforms = [
+  { type = "divide_by_int", value = 100 },
+  { type = "get_floor" }
+]
+
+
+
+
+

when_value

+

Apply conditional logic when replacing values in a column. Works like an SQL CASE expression in a SELECT clause.

+

When the value of the column equals “value”, replace it with “if_value”; otherwise replace it with “else_value”.

+

This example replaces all “race” IPUMS codes with 0 (white) or 1 (non-white). An IPUMS code of 100 is the “white” race category.

+
column_name = "race"
+transforms = [
+  { type = "when_value", value = 100, if_value = 0, else_value = 1}
+]
+
+
+
+
+

get_floor

+

Round down to the nearest whole number.

+

This example produces the general version of the IPUMS “relate” variable. The variable is coded such that detailed categories fall within the hundreds (for instance, 300 is child of household head, 301 is simply ‘child’, 302 is adopted child, and 303 is step-child). The general categories are usually all that’s needed (1 == household head, 2 == spouse, 3 == child, 4 == child-in-law, 5 == parent, 6 == parent-in-law, 7 == sibling, 12 == not related to head).

+
[[column_mappings]]
+alias = "relate_div_100"
+column_name = "relate"
+transforms = [
+  { type = "divide_by_int", value = 100 },
+  { type = "get_floor" }
+]
+
+
+
+
+ + +
+ +
+
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/comparison_types.html b/docs/comparison_types.html new file mode 100644 index 0000000..0d9cf77 --- /dev/null +++ b/docs/comparison_types.html @@ -0,0 +1,1160 @@ + + + + + + + + + Comparison types, transform add-ons, aggregate features, and household aggregate features — hlink 2.0.0 documentation + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+ +
+

Comparison types, transform add-ons, aggregate features, and household aggregate features

+

This page has information on the different comparison types available for the [[comparison_features]] +section, along with some attributes available to all of the comparison types and some aggregate features +that are not configurable.

+
+

Comparison types

+

Each header below represents a comparison type. Comparison types are used in the context of the [[comparison_features]] section.

+
[[comparison_features]]
+alias = "relatematch"
+column_name = "relate_div_100"
+comparison_type = "equals"
+categorical = true
+
+
+
+

maximum_jaro_winkler

+

Finds the greatest Jaro-Winkler value among the cartesian product of multiple columns. For example, given an input of column_names = ['namefrst', 'namelast'], it would return the maximum Jaro-Winkler name comparison value among the following four comparisons:

+
[('namefrst_a', 'namefrst_b'),
+ ('namefrst_a', 'namelast_b'),
+ ('namelast_a', 'namefrst_b'),
+ ('namelast_a', 'namelast_b')]
+
+
+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. The list of columns used as input for the set of comparisons generated by taking the cartesian product.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "maximum_jw"
+column_names = ["namelast", "namefrst"]
+comparison_type = "maximum_jaro_winkler"
+
+
+
+
+

jaro_winkler

+

Returns the Jaro-Winkler comparison score for a given column.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. The column to compare using the Jaro-Winkler score.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "namefrst_jw"
+column_name = "namefrst"
+comparison_type = "jaro_winkler
+
+
+
+
+

jaro_winkler_street

+

Uses an additional geographic column value to filter for major location changes before comparing street names. If boundary column A is not equal to boundary column B, a Jaro-Winkler score of zero is returned. If boundary column A and B are equal, the Jaro-Winkler comparison score of the street columns is returned.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. The input street column.

    • +
    • boundary – Type: string. Required. An input column to match on before comparing street name values.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "jw_street"
+column_name = "street"
+boundary = "enum_dist"
+comparison_type = "jaro_winkler_street"
+
+
+
+
+

max_jaro_winkler

+

Returns the greatest Jaro-Winkler value from the comparisons of a list of names.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input column containing a list of names to compare (such as related household members, or neighborhood surnames).

    • +
    +
  • +
+
[[comparison_features]]
+alias = "related_individual_max_jw"
+column_name= "namefrst_related"
+comparison_type = "max_jaro_winkler"
+
+
+
+
+

equals

+

Asserts that the values of the compared column are the same in both datasets, using SQL: a.{column_name} IS NOT DISTINCT FROM b.{column_name}

+
[[comparison_features]]
+alias = "relatematch"
+column_name = "relate_div_100"
+comparison_type = "equals"
+categorical = true
+
+
+
+
+

f1_match

+

Evaluates whether the first initial of first name A matches the first initial of first name B, or either the first or second middle initial of B. If so, returns 1; otherwise, returns 2.

+

1 = the first initial of A’s first name matches the first initial of any of potential match B’s first names

+

2 = mismatch

+

Uses the following SQL query:

+
"CASE WHEN (
+    (a.{fi} IS NOT DISTINCT FROM b.{fi}) OR 
+    (a.{fi} IS NOT DISTINCT FROM b.{mi0}) OR
+    (a.{fi} IS NOT DISTINCT FROM b.{mi1})
+) THEN 1 ELSE 2 END"
+
+
+
[[comparison_features]]
+alias = "f1_match"
+first_init_col = "namefrst_init"
+mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"]
+comparison_type = "f1_match"
+categorical = true
+
+
+
+
+

f2_match

+

Evaluates whether the first middle initial of A is empty or null. If so, returns 0.
+Otherwise, if either the first or second middle initial of A is not null and matches the first name initial of B, or either middle initial of B, returns 1.
+Otherwise, returns 2.

+

1 = the first initial of A’s second first name matches the first initial of any of potential match B’s first names

+

2 = mismatch

+

0 = no second first name A

+

Uses the following SQL:

+
CASE WHEN ((a.{mi0} == '') OR (a.{mi0} IS NULL)) THEN 0 WHEN (
+    (a.{mi0} IS NOT DISTINCT FROM b.{fi}) OR
+    ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{fi})) OR
+    (a.{mi0} IS NOT DISTINCT FROM b.{mi0}) OR
+    (a.{mi0} IS NOT DISTINCT FROM b.{mi1}) OR
+    ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi0})) OR
+    ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi1}))
+) THEN 1 ELSE 2 END
+
+
+
    +
  • Attributes:

    +
      +
    • first_init_col – Type: string. Required. First name initial input column.

    • +
    • mid_init_cols – Type: list of strings. Required. List of first and second middle initial input columns.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "f2_match"
+first_init_col = "namefrst_init"
+mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"]
+comparison_type = "f2_match"
+categorical = true
+
+
+
+
+

not_equals

+

Asserts that values are distinct between compared individuals using SQL: a.{column_name} IS DISTINCT FROM b.{column_name}. Used mainly in caution flag features (f_caution, m_caution, sp_caution).

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input column to compare.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "m_caution"
+column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "mbpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "mother_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "stepmom"
+comparison_type = "parent_step_change"
+[comparison_features.comp_d]
+column_name = "momloc"
+comparison_type = "present_both_years"
+
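For reference, a minimal standalone use of not_equals, outside of a composite caution feature, might look like this sketch (the alias is hypothetical):
+
+# hypothetical standalone example
+[[comparison_features]]
+alias = "mbpl_not_equal"
+column_name = "mbpl"
+comparison_type = "not_equals"
+categorical = true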
+
+
+
+

equals_as_int

+

Checks for equality using the equals sign and returns the boolean result as an integer. Uses SQL: CAST(a.{col} = b.{col} as INT)

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input column to compare.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "namelast_equal_as_int"
+column_name = "namelast_clean"
+comparison_type  = "equals_as_int"
+
+
+
+
+

all_equals

+

Asserts whether the values in all given columns match. Uses a SQL expression generated by joining a.{col} = b.{col} clauses with AND for each given column.

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. List of the columns to evaluate if all are equal across records being compared.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "exact"
+column_names = ["namefrst_unstd", "namelast_clean"]
+comparison_type = "all_equals"
+
+
+
+
+

or

+

Combines up to four comparison features into one feature by joining the generated clause for each sub-comparison with a SQL OR.

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. A list of all input columns used by sub-comparisons.

    • +
    • comp_a, comp_b – Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section.

    • +
    • comp_c, comp_d – Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr"]
+comparison_type = "or"
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+lower_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+
+
+
+
+

and

+

Combines up to four comparison features into one feature by joining the generated clause for each sub-comparison with a SQL AND.

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. A list of all input columns used by sub-comparisons.

    • +
    • comp_a, comp_b – Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section.

    • +
    • comp_c, comp_d – Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section.

    • +
    +
  • +
+

In this example, the and comparison appears in [comparison_features.comp_b].

+
[[comparison_features]]
+alias = "street_jw"
+comparison_type = "times"
+column_names = ["street","county", "statefip"]
+[comparison_features.comp_a]
+column_name = "street"
+comparison_type = "jaro_winkler"
+lower_threshold = 0.9
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+
+
+
+
+

times

+

Takes the output of two sub-comparisons and multiplies them together after casting as floats.

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. A list of all input columns used by sub-comparisons.

    • +
    • comp_a, comp_b – Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. comp_a and comp_b can also have sub-comparisons, as in the given example.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "street_jw"
+comparison_type = "times"
+column_names = ["street","county", "statefip"]
+[comparison_features.comp_a]
+column_name = "street"
+comparison_type = "jaro_winkler"
+lower_threshold = 0.9
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+
+
+
+
+

caution_comp_3

+

Generates an SQL expression in the form (({expr_a} OR {expr_b}) AND {expr_c}).

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. A list of all input columns used by sub-comparisons.

    • +
    • comp_a, comp_b, comp_c – Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. comp_a, comp_b, and comp_c can also have sub-comparisons.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_3"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+
+
+
+
+

caution_comp_4

+

Generates an SQL expression in the form (({expr_a} OR {expr_b} OR {expr_c}) AND {expr_d}).

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. A list of all input columns used by sub-comparisons.

    • +
    • comp_a, comp_b, comp_c, comp_d – Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. comp_a, comp_b, comp_c, and comp_d can also have sub-comparisons.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "m_caution"
+column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "mbpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "mother_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "stepmom"
+comparison_type = "parent_step_change"
+[comparison_features.comp_d]
+column_name = "momloc"
+comparison_type = "present_both_years"
+
+
+
+
+

any_equals

+

Used to compare middle initials and first names under specific circumstances.
If middle initial A is not empty/null and is the same as either middle initial B or first name B, +OR if first name A is not empty/null and is the same as middle initial B.

+
    +
  • Attributes:

    +
      +
    • column_names – Type: list of strings. Required. The first input column should be the middle initial column, and the second input column should be the first name column.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "mid_init_match"
+column_names = ["namefrst_mid_init", "namefrst_unstd"]
+comparison_type = "any_equals"
+
+
+
+
+

either_are_1

+

Checks if the column value for either A or B is equal to 1.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input column to compare to 1.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "either_1"
+column_name = "nativity"
+comparison_type = "either_are_1"
+categorical = true
+
+
+
+
+

either_are_0

+

Checks if the column value for either A or B is equal to 0.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input column to compare to 0.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "either_0"
+column_name = "nativity"
+comparison_type = "either_are_0"
+categorical = true
+
+
+
+
+

second_gen_imm

+

Checks if individual A is a second-generation immigrant by looking for a nativity value of 2, 3, or 4 (one or both parents foreign-born).

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. Required. Input should be the name of the nativity column.

    • +
    +
  • +
+
[[comparison_features]]
+alias =  "sgen"
+column_name = "nativity"
+comparison_type = "second_gen_imm"
+categorical = true
+
+
+
+
+

rel_jaro_winkler

+

Uses a Scala function to determine the number of people in the input column with a name similarity score (Jaro-Winkler) greater than or equal to the given jw_threshold, an age difference less than or equal to the given age_threshold, and matching sex for the sample A individual and the sample B potential match. Takes a column generated with the feature selection transform related_individual_rows as input (list of person data objects to compare). Can be used for related or unrelated individuals, depending on the input column specified.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column with data in the form of a list of person data objects.

    • +
    • name_col – Type: string. The name of the column containing the first name for comparison.

    • +
    • birthyr_col – Type: string. The name of the column containing the birth year.

    • +
    • jw_threshold – Type: float. The minimum acceptable Jaro-Winkler score to consider a match.

    • +
    • age_threshold – Type: int. The maximum acceptable age difference to consider a match.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "rel"
+column_name = "namefrst_related_rows"
+name_col = "namefrst_unstd"
+birthyr_col = "replaced_birthyr"
+comparison_type = "rel_jaro_winkler"
+jw_threshold = 0.9
+age_threshold = 5
+
+
+
+
+

extra_children

+

Using a Scala function, checks whether there are children present in sample B who are not present in sample A, but whom, based on relate codes, age, sex, and name, we would have expected to be present in A. Returns a count of suspected “extra” children. Takes a column generated with the feature selection transform related_individual_rows as input (a list of person data objects to compare).

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column with data in the form of a list of person data objects.

    • +
    • relate_col – Type: string. The name of the column with the relate code.

    • +
    • histid_col – Type: string. The name of the id column.

    • +
    • name_col – Type: string. The name of the column containing the first name for comparison.

    • +
    • birthyr_col – Type: string. The name of the column containing the birth year.

    • +
    • year_b – Type: int. The year that sample B was taken.

    • +
    • jw_threshold – Type: float. The minimum acceptable Jaro-Winkler score to consider a match.

    • +
    • age_threshold – Type: int. The maximum acceptable age difference to consider a match.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "extra_children"
+column_name = "namefrst_related_rows"
+relate_col = "relate"
+histid_col = "histid"
+name_col = "namefrst_unstd"
+birthyr_col = "replaced_birthyr"
+year_b = 1910
+comparison_type = "extra_children"
+jw_threshold = 0.8
+age_threshold = 2
+
+
+
+
+

jaro_winkler_rate

+

Uses a Scala function to calculate the rate of individuals who have a Jaro-Winkler score greater than or equal to the given threshold. The rate is returned as a float (a proportion; for example, 0.05 means 5%).

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column with data in the form of a list of person data objects. The input column seen below (“namelast_neighbors”) was generated using a “neighbor_aggregate” feature selection.

    • +
    • jw_threshold – Type: float. The minimum Jaro-Winkler threshold to consider an acceptable match.

    • +
    +
  • +
+

In the following example, a lower_threshold feature add-on is used to convert the returned rate to a boolean asserting whether it meets the given minimum threshold. (>= 5% of neighbors have a Jaro-Winkler score >= 0.95)

+
[[comparison_features]]
+alias = "nbors"
+comparison_type = "times"
+column_names = ["namelast_neighbors", "county", "statefip"]
+[comparison_features.comp_a]
+column_name = "namelast_neighbors"
+comparison_type = "jaro_winkler_rate"
+jw_threshold = 0.95
+lower_threshold = 0.05
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+
+
+
+
+

sum

+

Adds the column values for A and B together (takes the sum).

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column to be added.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "namelast_popularity_sum"
+column_name = "namelast_popularity"
+comparison_type = "sum"
+
+
+
+
+

length_b

+

Returns the length of the column value in record B using the SQL size() function.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The name of the input column to take the length of in dataset B.

    • +
    +
  • +
+
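A minimal configuration sketch for length_b (the alias is hypothetical; namefrst_related is a list-valued column of the kind produced by a feature selection, as used elsewhere on this page):
+
+# hypothetical example
+[[comparison_features]]
+alias = "related_count_b"
+column_name = "namefrst_related"
+comparison_type = "length_b"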
+
+

abs_diff

+

Takes the absolute value of the difference between the values of the given column in datasets A and B.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column to evaluate.

    • +
    • not_equals – Type: int. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression would return the value -1 instead of an absolute difference. For example, if you are evaluating the difference in marriage duration values, and “99” is a placeholder value for “unknown” in the data, you can exclude those values from consideration using this attribute.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "byrdiff"
+column_name = "replaced_birthyr"
+comparison_type = "abs_diff"
+
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "abs_diff"
+btwn_threshold = [9, 14]
+categorical = true
+
+
+
+
+

b_minus_a

+

Returns the value of column B minus the value of column A.

+
    +
  • Attributes:

    +
      +
    • column_name – Type: string. The input column to evaluate.

    • +
    • not_equals – Type: int. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression would return the value -1 instead of the difference. For example, if you are evaluating the difference in marriage duration values, and “99” is a placeholder value for “unknown” in the data, you can exclude those values from consideration using this attribute.

    • +
    +
  • +
+
[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "b_minus_a"
+btwn_threshold = [5,14]
+categorical = true
+
+
+
+
+

geo_distance

+

Uses a lookup table to find the geographic distance between locations. The SQL expression is generated by hlink/linking/core/dist_table.py. There are several ways to configure this feature. You can look up distances in the given file using one or two keys (specified with the key_count attribute). You can also optionally specify a secondary look-up table that serves as a backup in case the primary look-up does not contain a value for the given locations. This is particularly useful for county distance: you can set the primary join to be across counties, and set up a secondary join on state, which has far fewer combinations and thus less risk of nulls, to fill in when the specified counties aren’t in the look-up.

+
    +
  • Attributes:

    +
      +
    • key_count – Type: int. The number of keys used to join on the primary (or only) look-up table. Acceptable values are 1 or 2. For example, for county and state, key_count = 2; for state alone, key_count = 1, even though the data still has a column for each dataset (such as county_a and county_b).

    • +
    • distances_file – Type: string of path. Path to the distances look-up file.

    • +
    • table_name – Type: string. What to name the table that will be generated from the distances file. If you do multiple look-ups and the table_name is the same across all feature specifications, the file will only be read in once.

    • +
    • Attributes for key_count = 1:

      +
        +
      • column_name – Type: string. The column in the input data that you want to use as a key to look up the geographic distance.

      • +
      • loc_a – Type: string. First column to join on in the look-up table (where to find the value coming from the column_name column A).

      • +
      • loc_b – Type: string. Second column to join on in the look-up table (where to find the value coming from the column_name column B).

      • +
      • distance_col – Type: string. Name of the column containing the geographic distance in the look-up table.

      • +
      +
    • +
    • Attributes for key_count = 2:

      +
        +
      • column_names – Type: list of strings. The two columns you want to use as keys to look up the geographic distance.

      • +
      • source_column_a – Type: string. First column to join on in the source data.

      • +
      • source_column_b – Type: string. Second column to join on in the source data.

      • +
      • loc_a_0 – Type: string. First column to join on in the look-up table.

      • +
      • loc_a_1 – Type: string. First column to join on in the look-up table.

      • +
      • loc_b_0 – Type: string. Second column to join on in the look-up table.

      • +
      • loc_b_1 – Type: string. Second column to join on in the look-up table.

      • +
      • distance_col – Type: string. Name of the column containing the geographic distance in the look-up table.

      • +
      +
    • +
    • Attributes if using a secondary join:

      +
        +
      • secondary_key_count – Type: int. The number of keys used to join on the secondary (backup) look-up table. Acceptable values are 1 or 2.

      • +
      • secondary_table_name – Type: string. What to name the table that will be generated from the secondary_distances_file. If you want to do multiple look-ups, if the table_name is the same across all feature specifications, it will only be read in once.

      • +
      • secondary_distances_file – Type: string of path. Path to the secondary distances look-up file.

      • +
      • secondary_source_column – Type: string. The column in the input data that you want to use as a key in the secondary geographic distance look-up.

      • +
      • secondary_loc_a – Type: string. First column to join on in the secondary look-up table.

      • +
      • secondary_loc_b – Type: string. Second column to join on in the secondary look-up table.

      • +
      • secondary_distance_col – Type: string. Name of the column containing the geographic distance in the secondary look-up table.

      • +
      +
    • +
    +
  • +

[[comparison_features]]
alias = "state_distance"
comparison_type = "geo_distance"
key_count = 1
table_name = "state_distance_lookup"
distances_file = "/path/to/county_state_distance.csv"
column_name = "bpl"
loc_a = "statecode1"
loc_b = "statecode2"
distance_col = "dist"

[[comparison_features]]
alias = "county_distance"
comparison_type = "geo_distance"
column_names = ["county", "statefip"]
key_count = 2
table_name = "county_distance_lookup"
distances_file = "/path/to/county_1900_1910_distances_km.csv"

# columns to join on in the data
source_column_a = "county"
source_column_b = "statefip"

# column names from the csv lookup file
loc_a_0 = "from_icpsrctyi"
loc_a_1 = "to_icpsrctyi"
loc_b_0 = "from_statefip"
loc_b_1 = "to_statefip"
distance_col = "distance_km"

# SECONDARY JOIN
secondary_key_count = 1
secondary_table_name = "state_distance_lookup"
secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
secondary_source_column = "statefip"
secondary_loc_a = "from_statefip"
secondary_loc_b = "to_statefip"
secondary_distance_col = "distance_km"

fetch_a


Gets the value of column A.

  • Attributes:

    • column_name – Type: string. Required. The column to get the value from.

[[comparison_features]]
alias = "race"
column_name = "race"
comparison_type = "fetch_a"
categorical = true

fetch_b


Gets the value of column B.

  • Attributes:

    • column_name – Type: string. The column to get the value from.

[[comparison_features]]
alias = "race"
column_name = "race"
comparison_type = "fetch_b"
categorical = true

present_both_years


Checks whether both column A and column B are present.

  • Attributes:

    • column_name – Type: string. The column to check.

In the example below, present_both_years supplies the comp_d component of a caution_comp_4 feature:

[[comparison_features]]
alias = "sp_caution"
column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
comparison_type = "caution_comp_4"
categorical = true
[comparison_features.comp_a]
column_name = "spouse_bpl"
comparison_type = "not_equals"
[comparison_features.comp_b]
column_name = "spouse_birthyr"
comparison_type = "abs_diff"
gt_threshold = 5
[comparison_features.comp_c]
column_name = "durmarr"
comparison_type = "new_marr"
upper_threshold = 7
[comparison_features.comp_d]
column_name = "sploc"
comparison_type = "present_both_years"

neither_are_null


Checks that neither column A nor column B is null.

  • Attributes:

    • column_name – Type: string. The column to check.
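
The original page gives no example for this comparison type, so here is a minimal sketch (street is a hypothetical mapped column; any column present in both datasets works):

[[comparison_features]]
# street is a hypothetical column used for illustration
alias = "street_present"
column_name = "street"
comparison_type = "neither_are_null"
categorical = true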

present_and_not_equal


Checks that column A and column B are both present but are not equal.

  • Attributes:

    • column_name – Type: string. The column to check.
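
No example is given for this type either; a minimal sketch (mbpl, mother’s birthplace, is illustrative only):

[[comparison_features]]
# mbpl is a hypothetical column used for illustration
alias = "mbpl_changed"
column_name = "mbpl"
comparison_type = "present_and_not_equal"
categorical = true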

Feature add-ons


These attributes can be added to most comparison feature types above to extend the type of output returned beyond the standard comparison feature.


alias

  • Attributes:

    • alias – Type: string. The name for the output column. Should be used on the top-level comparison of every comparison feature.

[[comparison_features]]
alias = "jw_f"
column_name = "father_namefrst"
comparison_type = "jaro_winkler"

power


Raises the comparison feature output to a given power.

  • Attributes:

    • power – Type: int. The power to raise the comparison output to. For example, power = 2 will square the output.

[[comparison_features]]
alias = "county_distance_squared"
comparison_type = "geo_distance"
column_names = ["county", "statefip"]

# PRIMARY JOIN
# key_count: the number of keys used for the join per source file.
# Ex: for state and county, key_count = 2. For just state, key_count = 1
# even though there is county_a and county_b.
key_count = 2
table_name = "county_distance_lookup"
distances_file = "/path/to/county_1900_1910_distances_km.csv"

# columns to join on in the data
source_column_a = "county"
source_column_b = "statefip"

# column names from the csv lookup file
loc_a_0 = "from_icpsrctyi"
loc_a_1 = "to_icpsrctyi"
loc_b_0 = "from_statefip"
loc_b_1 = "to_statefip"
distance_col = "distance_km"

# SECONDARY JOIN
secondary_key_count = 1
secondary_table_name = "state_distance_lookup"
secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
secondary_source_column = "statefip"
secondary_loc_a = "from_statefip"
secondary_loc_b = "to_statefip"
secondary_distance_col = "distance_km"

power = 2

threshold

  • Attributes:

    • threshold – Type: numeric. True when the comparison feature output is not null and is greater than or equal to (>=) the given threshold value.

[[comparison_features]]
alias = "imm"
column_name = "nativity"
comparison_type = "fetch_a"
threshold = 5
categorical = true

lower_threshold

  • Attributes:

    • lower_threshold – Type: numeric. True when the comparison feature output is not null and is greater than or equal to (>=) the given threshold value.

[[comparison_features]]
alias = "street_jw"
comparison_type = "times"
column_names = ["street", "county", "statefip"]
[comparison_features.comp_a]
column_name = "street"
comparison_type = "jaro_winkler"
lower_threshold = 0.9
[comparison_features.comp_b]
comparison_type = "and"
column_names = ["county", "statefip"]
[comparison_features.comp_b.comp_a]
column_name = "county"
comparison_type = "equals"
[comparison_features.comp_b.comp_b]
column_name = "statefip"
comparison_type = "equals"

upper_threshold

  • Attributes:

    • upper_threshold – Type: numeric. True when the comparison feature output is not null and is less than or equal to (<=) the given threshold value.

[[comparison_features]]
alias = "sp_caution"
column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
comparison_type = "caution_comp_4"
categorical = true
[comparison_features.comp_a]
column_name = "spouse_bpl"
comparison_type = "not_equals"
[comparison_features.comp_b]
column_name = "spouse_birthyr"
comparison_type = "abs_diff"
gt_threshold = 5
[comparison_features.comp_c]
column_name = "durmarr"
comparison_type = "new_marr"
upper_threshold = 7
[comparison_features.comp_d]
column_name = "sploc"
comparison_type = "present_both_years"

gt_threshold

  • Attributes:

    • gt_threshold – Type: numeric. True when the comparison feature output is not null and is strictly greater than (>) the given threshold value.

[[comparison_features]]
alias = "sp_caution"
column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
comparison_type = "caution_comp_4"
categorical = true
[comparison_features.comp_a]
column_name = "spouse_bpl"
comparison_type = "not_equals"
[comparison_features.comp_b]
column_name = "spouse_birthyr"
comparison_type = "abs_diff"
gt_threshold = 5
[comparison_features.comp_c]
column_name = "durmarr"
comparison_type = "new_marr"
upper_threshold = 7
[comparison_features.comp_d]
column_name = "sploc"
comparison_type = "present_both_years"

btwn_threshold

  • Attributes:

    • btwn_threshold – Type: list of numeric. True when the comparison feature output is greater than or equal to (>=) the first threshold value and less than or equal to (<=) the second threshold value.

[[comparison_features]]
alias = "mardurmatch"
column_name = "durmarr"
not_equals = 99
comparison_type = "b_minus_a"
btwn_threshold = [5, 14]
categorical = true

look_at_addl_var

  • Attributes:

    • look_at_addl_var – Type: boolean. Flags the program to consider an additional column value before reporting the comparison feature value.

    • addl_var – Type: string. The additional column to consider.

    • datasource – Type: string. The dataset (a or b) whose addl_var value should be checked, as in the example below.

    • check_val_expr – Type: expression. The expression to use to evaluate the additional column. For example, check_val_expr = "= 5".

    • else_val – Type: same type as comparison feature output. If the additional column value does not meet the check_val_expr specification, the value to return instead of the comparison feature value.

In the following example, the generated SQL expression for the column would be: CASE WHEN {datasource}.nativity = 5 THEN {yrimmig abs_diff value} ELSE -1 END.

[[comparison_features]]
alias = "immyear_diff"
column_name = "yrimmig"
comparison_type = "abs_diff"
look_at_addl_var = true
addl_var = "nativity"
datasource = "a"
check_val_expr = "= 5"
else_val = -1

Aggregate Features


These features are not configurable. To include them in the generated comparison features, they just need to be included in the [training][independent_vars] section of the config. They are generated using the “aggregate_features” SQL template.

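For example, a sketch of a [training] section that includes all three aggregate features (the other variable shown is illustrative):

[training]
# hits, hits2, and exact_mult are generated automatically;
# listing them in independent_vars is all that is needed
independent_vars = ["namelast_jw", "hits", "hits2", "exact_mult"]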

hits


The number of potential matches generated for the given individual (counted by aggregating on {id_column}_a).


hits2


The value of hits squared (hits * hits).


exact_mult


Indicator for the existence of multiple potential matches with the exact same first and last name as the A sample individual within the B data. Returns numeric boolean (0 or 1).


Household Aggregate Features


These features are not configurable. To include them in the generated comparison features, they just need to be included in the [hh_training][independent_vars] section of the config. They are generated using the “hh_aggregate_features” SQL template.

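For example, a sketch of an [hh_training] section that includes both household aggregate features (the other variable shown is illustrative):

[hh_training]
# jw_max_a and jw_max_b are generated automatically;
# listing them in independent_vars is all that is needed
independent_vars = ["namelast_jw", "jw_max_a", "jw_max_b"]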

jw_max_a


The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns 0 if no other individuals are in the household for comparison.


jw_max_b


The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where sex matches and birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns 0 if no other individuals are in the household for comparison.

\ No newline at end of file
diff --git a/docs/config.html b/docs/config.html
new file mode 100644
index 0000000..e17f749
--- /dev/null
+++ b/docs/config.html
@@ -0,0 +1,929 @@
Configuration — hlink 2.0.0 documentation

Configuration

  1. Basic Config File
  2. Advanced Config File
  3. Top level configs
  4. Data sources
  5. Filter
  6. Column mappings
  7. Substitution columns
  8. Feature selections
  9. Potential matches universe
  10. Blocking
  11. Comparisons
  12. Household comparisons
  13. Comparison features
  14. Pipeline-generated features
  15. Training and models
  16. Household training and models

Basic Config File


The config file tells the hlink program what to link and how to link it. A description of the different sections of a configuration file is below. For reference, here is an example of a relatively basic config file. This config file is used by the examples/tutorial/tutorial.py script for linking, and there is a more detailed discussion of the config file in the README in examples/tutorial.


Note that this config is written in TOML, but hlink is also able to work with JSON config files.

id_column = "id"
feature_selections = []

[datasource_a]
alias = "a"
file = "data/A.csv"

[datasource_b]
alias = "b"
file = "data/B.csv"

[[column_mappings]]
column_name = "NAMEFRST"
transforms = [
    {type = "lowercase_strip"}
]

[[column_mappings]]
column_name = "NAMELAST"
transforms = [
    {type = "lowercase_strip"}
]

[[column_mappings]]
column_name = "AGE"
transforms = [
    {type = "add_to_a", value = 10}
]

[[column_mappings]]
column_name = "SEX"

[[blocking]]
column_name = "SEX"

[[blocking]]
column_name = "AGE_2"
dataset = "a"
derived_from = "AGE"
expand_length = 2
explode = true

[[comparison_features]]
alias = "NAMEFRST_JW"
column_name = "NAMEFRST"
comparison_type = "jaro_winkler"

[[comparison_features]]
alias = "NAMELAST_JW"
column_name = "NAMELAST"
comparison_type = "jaro_winkler"

[comparisons]
operator = "AND"

[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "NAMEFRST_JW"
threshold = 0.79

[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "NAMELAST_JW"
threshold = 0.84

Advanced Config File


Here is an example of a more complex config file that makes use of more of hlink’s features. It uses machine learning to probabilistically link the two datasets.

id_column = "histid"
drop_data_from_scored_matches = false

# --------- DATASOURCES --------------
[datasource_a]
alias = "us1900"
file = "/path/to/us1900m_usa.P.parquet"

[datasource_b]
alias = "us1910"
file = "/path/to/us1910m_usa.P.parquet"

# --------- FILTERS --------------

[[filter]]
expression = "NAMELAST is not null and NAMELAST != ''"

[[filter]]
training_data_subset = true
datasource = "a"

[[filter]]
expression = "age >= 5"
datasource = "b"

# --------- COLUMN MAPPINGS --------------

[[column_mappings]]
column_name = "serialp"

[[column_mappings]]
column_name = "sex"

[[column_mappings]]
column_name = "age"

[[column_mappings]]
column_name = "namelast"

[[column_mappings]]
alias = "namefrst_clean"
column_name = "namefrst"
transforms = [
  { type = "lowercase_strip" },
  { type = "rationalize_name_words" },
  { type = "remove_qmark_hyphen"},
  { type = "replace_apostrophe"},
  { type = "remove_suffixes",  values = ["jr", "sr", "ii", "iii"] },
  { type = "remove_alternate_names"},
  { type = "condense_strip_whitespace"},
]

[[column_mappings]]
alias = "namefrst_split"
column_name = "namefrst_clean"
transforms = [ { type = "split" } ]

[[column_mappings]]
alias = "namefrst_std"
column_name = "namefrst_split"
transforms = [
  { type = "array_index", value = 0 }
]

[[column_mappings]]
alias = "bpl_orig"
column_name = "bpl"
transforms = [
  { type = "divide_by_int", value = 100 },
  { type = "get_floor" }
]

[[column_mappings]]
alias = "statefip"
column_name = "statefip_h"

[[column_mappings]]
column_name = "birthyr"
alias = "clean_birthyr"
[[column_mappings.transforms]]
type = "mapping"
mappings = {9999 = "", 1999 = ""}
output_type = "int"

[[column_mappings]]
alias = "relate_div_100"
column_name = "relate"
transforms = [
  { type = "divide_by_int", value = 100 },
  { type = "get_floor" }
]

# --------- SUBSTITUTIONS --------------

[[substitution_columns]]
column_name = "namefrst_std"

[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "1"
substitution_file = "/path/to/name_std/male.csv"

[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "2"
substitution_file = "/path/to/name_std/female.csv"

# --------- FEATURE SELECTIONS --------------

[[feature_selections]]
input_column = "clean_birthyr"
output_column = "replaced_birthyr"
condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
transform = "sql_condition"

[[feature_selections]]
input_column = "namelast"
output_column = "namelast_bigrams"
transform = "bigrams"

[[feature_selections]]
input_column = "bpl_orig"
output_column = "bpl_clean"
condition = "case when bpl_str == 'washington' and bpl2_str=='washington' then 53 when (bpl_str is null or bpl_str == '') and bpl2_str=='washington' then 53 when bpl_str == 'washington' and (bpl2_str=='' or bpl2_str is null) then 53 else bpl_orig end"
transform = "sql_condition"

[[feature_selections]]
input_column = "bpl_clean"
output_column = "region"
transform = "attach_variable"
region_dict = "/path/to/region.csv"
col_to_join_on = "bpl"
col_to_add = "region"
null_filler = 99
col_type = "float"

# --------- POTENTIAL MATCHES UNIVERSE -------------

[[potential_matches_universe]]
expression = "sex == 1"

# --------- BLOCKING --------------

[[blocking]]
column_name = "sex"

[[blocking]]
column_name = "birthyr_3"
dataset = "a"
derived_from = "replaced_birthyr"
expand_length = 3
explode = true

[[blocking]]
column_name = "namelast_bigrams"
explode = true

# --------- COMPARISONS --------------

[comparisons]
operator = "AND"

[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "namefrst_std_jw"
threshold = 0.8

[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "namelast_jw"
threshold = 0.75

# --------- HOUSEHOLD COMPARISONS (post-blocking filters) -------------

[hh_comparisons]
comparison_type = "threshold"
feature_name = "byrdiff"
threshold_expr = "<= 10"

# --------- COMPARISON FEATURES --------------

[[comparison_features]]
alias = "region"
column_name = "region"
comparison_type = "fetch_a"
categorical = true

[[comparison_features]]
alias = "namefrst_std_jw"
column_name = "namefrst_std"
comparison_type = "jaro_winkler"

[[comparison_features]]
alias = "namelast_jw"
column_name = "namelast"
comparison_type = "jaro_winkler"

[[comparison_features]]
alias = "sex_equals"
column_name = "sex"
comparison_type = "equals"
categorical = true

[[comparison_features]]
alias = "relate_a"
column_name = "relate_div_100"
comparison_type = "fetch_a"

# --------- PIPELINE-GENERATED FEATURES ------------

[[pipeline_features]]
input_columns = ["sex_equals", "region"]
output_column = "sex_region_interaction"
transformer_type = "interaction"

[[pipeline_features]]
input_column = "relate_a"
output_column = "relatetype"
transformer_type = "bucketizer"
categorical = true
splits = [1,3,5,9999]

# --------- TRAINING --------------

[training]

independent_vars = ["namelast_jw", "region", "hits", "sex_region_interaction", "relatetype"]
scale_data = false

dataset = "/path/to/training_data.csv"
dependent_var = "match"
score_with_model = true
use_training_data_features = false
split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 2
output_suspicious_TD = true
param_grid = true
model_parameters = [
    { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
    { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] }
]

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }

# --------- HOUSEHOLD TRAINING --------------

[hh_training]

prediction_col = "prediction"
hh_col = "serialp"

independent_vars = ["namelast_jw", "namefrst_std_jw", "relatetype", "sex_equals"]
scale_data = false

dataset = "/path/to/hh_training_data_1900_1910.csv"
dependent_var = "match"
score_with_model = true
use_training_data_features = false
split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 10
output_suspicious_TD = true
param_grid = false
model_parameters = [
    { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
    { type = "probit", threshold = 0.5, threshold_ratio = 1.0 }
]

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }

Top level configs


These configs should go at the top of your config file under no header:


id_column


Required. Specify the id column that uniquely identifies a record in each dataset.

id_column = "id"

drop_data_from_scored_matches


Optional. Whether or not the scored potential matches should be output with full features data, or just ids and match information.

drop_data_from_scored_matches = false

Data sources

  • Header names: datasource_a, datasource_b

  • Description: Specifies your input data.

  • Required: True

  • Type: Object

  • Attributes:

    • alias – Type: string. The short name for the datasource. Must be alphanumeric with no spaces.

    • file – Type: string. Required. The path to the input file. The file can be csv or parquet.

[datasource_a]
alias = "us1900"
file = "/path/to/my_file.csv"

Filter

  • Header name: filter

  • Description: Specifies filters to apply to your input data.

  • Required: False

  • Type: List

  • Attributes:

    • expression – Type: string. SQL expression to apply to your input datasets. Cannot be combined with training_data_subset in a single filter.

    • training_data_subset – Type: boolean. If set to true, will subset your input data to only include records that are also in your training data. Cannot be combined with expression in a single filter.

    • datasource – Type: string. If you want to limit the filter to operate only on dataset a or b, you can specify that with this attribute.

[[filter]]
training_data_subset = true
datasource = "a"

[[filter]]
expression = "NAMELAST is not null and NAMELAST != ''"

[[filter]]
expression = "age >= 5"
datasource = "b"

Column Mappings

  • Header name: column_mappings

  • Description: Base column mappings and transformations to extract from your input datasets.

  • Required: True

  • Type: List

  • Attributes:

    • alias – Type: string. Optional. The new name of the column; if not specified, the output column name defaults to column_name.

    • column_name – Type: string. Name of the column in the input data. Used as the name of the output column if alias is not specified.

    • transforms – Type: List. Optional. A list of transforms to apply, in order, to the input data. See the column mapping transforms section for more information.

[[column_mappings]]
column_name = "age"

[[column_mappings]]
alias = "namefrst_clean"
column_name = "namefrst"
transforms = [
  { type = "lowercase_strip" },
  { type = "rationalize_name_words" },
  { type = "remove_qmark_hyphen"},
  { type = "replace_apostrophe"},
  { type = "remove_suffixes",  values = ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"] },
  { type = "remove_alternate_names"},
  { type = "condense_strip_whitespace"}
]

Substitution Columns

  • Header name: substitution_columns

  • Description: Substitutions to apply to data after column mappings.

  • Required: False

  • Type: List

  • Attributes:

    • column_name – Type: string. Required. Column to apply substitutions to.

    • substitutions – Type: list. A list of substitutions to apply. See the substitutions section for more information.

[[substitution_columns]]
column_name = "namefrst_std"

[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "1"
substitution_file = "/path/to/name_std/male.csv"

[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "2"
substitution_file = "/path/to/name_std/female.csv"

Feature Selections

  • Header name: feature_selections

  • Description: A list of feature selections to apply to the input data after substitutions and column mappings. See the feature selection transforms section for more information, including information on the specific transforms available.

  • Required: False

  • Type: List

  • Attributes:

    • input_column – Type: string. Required. The name of the input column.

    • output_column – Type: string. Required. The name of the output column.

    • transform – Type: string. The name of the transform to apply to the column.

    • Other attributes vary depending on transform type.

[[feature_selections]]
input_column = "namelast_clean"
output_column = "namelast_clean_bigrams"
transform = "bigrams"

[[feature_selections]]
input_column = "bpl_clean"
output_column = "region"
transform = "attach_variable"
region_dict = "/path/to/region.csv"
col_to_join_on = "bpl"
col_to_add = "region"
null_filler = 99
col_type = "float"

Potential Matches Universe

  • Header name: potential_matches_universe

  • Description: Limits the universe of potential matches created, using an expression fed to a SQL query.

  • Required: False

  • Type: List

  • Attributes:

    • expression – Type: string. Required. The expression used to filter prepped_df_(a/b) before generating potential matches.

[[potential_matches_universe]]
# limits potential matches created to only men
expression = "sex == 1"

Blocking

  • Header name: blocking

  • Description: Describes what columns to block on and how to create the blocks for the potential matches.

  • Required: True

  • Type: List

  • Attributes:

    • column_name – Type: string. Required. The name of the column in the existing data to block on if not exploded; the name of the newly exploded column if explode = true.

    • explode – Type: boolean. Optional. If true, will attempt to “explode” the column by creating duplicate rows for each value in the column. Only works on columns that are arrays of values or when expand_length is set.

    • dataset – Type: string. Optional. Must be a or b, and used in conjunction with explode. Will only explode the column from the a or b dataset when specified.

    • derived_from – Type: string. Used in conjunction with explode = true. Specifies an input column from the existing dataset to be exploded.

    • expand_length – Type: integer. When explode is used on an integer column, this can be specified to create an array with a range of integer values from (original_value minus expand_length) to (original_value plus expand_length). For example, if the input column value for birthyr is 1870, explode is true, and expand_length is 3, the exploded column birthyr_3 value would be the array [1867, 1868, 1869, 1870, 1871, 1872, 1873].

[[blocking]]
column_name = "bpl"

[[blocking]]
column_name = "birthyr_3"
dataset = "a"
derived_from = "birthyr"
expand_length = 3
explode = true

Comparisons

  • Header name: comparisons

  • Description: A list of comparisons to threshold the potential matches on. Only potential matches that pass the thresholds will be created. See comparison types for more information.

  • Required: True

  • Type: Object

  • Attributes:

    • comparison_type – Type: string. Required. See comparison types for more information.

    • feature_name – Type: string. Required. The comparison_feature to use for the comparison threshold. A comparison_feature column by this name must be specified in the comparison_features section.

    • operator – Type: string. Optional. Combines two sub-comparisons, comp_a and comp_b, as in the example below; operator = "AND" requires both sub-comparisons to pass.

[comparisons]
operator = "AND"

[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "namefrst_jw"
threshold = 0.79

[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "namelast_jw"
threshold = 0.79
+

Household Comparisons

  • Header name: hh_comparisons

  • Description: A list of comparisons to threshold the household potential matches on. Also referred to as post-blocking filters: all household potential matches are created, then only potential matches that pass the post-blocking filters are kept for scoring. See comparison types for more information.

  • Required: False

  • Type: Object

  • Attributes:

    • comparison_type – Type: string. Required. See comparison types for more information.

    • feature_name – Type: string. Required. The comparison_feature to use for the comparison threshold. A comparison_feature column by this name must be specified in the comparison_features section.

    • threshold_expr – Type: string. The threshold expression to apply to the feature, as in the example below (threshold_expr = "<= 10").

[hh_comparisons]
# only keep household potential matches with a birth year difference less than or equal to ten years
comparison_type = "threshold"
feature_name = "byrdiff"
threshold_expr = "<= 10"

Comparison Features

  • Header name: comparison_features

  • Description: A list of comparison features to create when comparing records. Comparisons for individual and household linking rounds are both represented here; there is no need to duplicate a comparison used in both rounds. Simply list the feature in the independent_vars of the appropriate training or hh_training section of the config. See the comparison types section for more information.

  • Required: True

  • Type: List

  • Attributes:

    • alias – Type: string. Optional. The name of the comparison feature column to be generated. If not specified, the output column will default to column_name.

    • column_name – Type: string. The name of the column to compare.

    • comparison_type – Type: string. The name of the comparison type to use. See the comparison types section for more information.

    • categorical – Type: boolean. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).

    • Other attributes may be included as well depending on comparison_type. See the comparison types section for details on each comparison type.

[[comparison_features]]
alias = "race"
column_name = "race"
comparison_type = "equals"
categorical = true

[[comparison_features]]
alias = "namefrst_jw"
column_name = "namefrst_unstd"
comparison_type = "jaro_winkler"

[[comparison_features]]
column_name = "durmarr"
comparison_type = "new_marr"
upper_threshold = 7

Pipeline-generated Features

  • Header name: pipeline_features

  • Description: Features to be added in the model pipeline created for scoring a dataset. These features cannot be used in the comparisons section of the config and are for creating more robust ML models. They typically leverage code available in the Spark Pipeline API.

  • Required: False

  • Type: List

  • Attributes:

    • transformer_type – Type: string. Required. See pipeline features for more information on the available transformer types.

    • input_column – Type: string. Either use input_column or input_columns. Used if a single input column is needed for the pipeline feature.

    • input_columns – Type: list of strings. Either use input_column or input_columns. Used if a list of input columns is needed for the pipeline feature.

    • output_column – Type: string. The name of the new pipeline feature column to be generated.

    • categorical – Type: boolean. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).

    • Other attributes may be included as well depending on the particular pipeline feature transformer_type.

[[pipeline_features]]
input_columns = ["sex_equals", "regionf"]
output_column = "sex_regionf_interaction"
transformer_type = "interaction"

[[pipeline_features]]
input_column = "immyear_diff"
output_column = "immyear_caution"
transformer_type = "bucketizer"
categorical = true
splits = [-1,0,6,11,9999]

Training and models

  • Header name: training

  • Description: Specifies the training data set as well as a number of attributes related to training a model, including the dependent variable within that dataset, the independent variables created from the comparison_features section, and the different models you want to use for either model exploration or scoring.

  • Required: False

  • Type: Object

  • Attributes:

    • dataset – Type: string. Location of the training dataset. Must be a csv file.

    • dependent_var – Type: string. Name of the dependent variable in the training dataset.

    • independent_vars – Type: list. List of independent variables to use in the model. These must be either part of pipeline_features or comparison_features.

    • chosen_model – Type: object. The model to train with in the training task and score with in the matching task. See the models section for more information on model specifications.

    • threshold – Type: float. The threshold for which to accept model probability values as true predictions. Can be used to specify a threshold to use for all models, or can be specified within each chosen_model and model_parameters specification.

    • decision – Type: string. Optional. Specifies which decision function to use to create the final prediction. The first option is drop_duplicate_a, which drops any links for which a record in the a data set has a predicted match more than one time. The second option is drop_duplicate_with_threshold_ratio, which keeps only the potential match with the highest probability for each a record, and only when the ratio between its probability and that of the second-best potential match exceeds the threshold_ratio.

    • threshold_ratio – Type: float. Optional. For use when decision is drop_duplicate_with_threshold_ratio. Specifies the smallest acceptable ratio between the best and second-best links for a given record. Can be used to specify a threshold ratio (beta threshold) for all models, or unique threshold ratios can be specified in each individual chosen_model and model_parameters specification.

    • model_parameters – Type: list. Specifies models to test out in the model_exploration task. See the models section for more information on model specifications.

    • param_grid – Type: boolean. Optional. Set this to true to evaluate multiple hyper-parameters for a single model type in your model_parameters specification: give hyper-parameter inputs as arrays of length >= 1 instead of single values, and one model_parameters row will produce one model evaluation per combination.

    • score_with_model – Type: boolean. If set to false, will skip the apply_model step of the matching task. Use this if you want to use the run_all_steps command and are just trying to generate potential links, such as for the creation of training data.

    • n_training_iterations – Type: integer. Optional; default value is 10. The number of training iterations to use during the model_exploration task.

    • scale_data – Type: boolean. Optional. Whether to scale the data as part of the machine learning pipeline.

    • use_training_data_features – Type: boolean. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to true, or training features will not be able to be generated, giving null column errors. For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to true or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data. If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to false, so you can be sure the training features are created from scratch to match your exact current configuration settings; although if you know the features haven’t changed, you could set it to true to save a small amount of processing time.

    • output_suspicious_TD – Type: boolean. Optional. Used in the model_exploration link task. Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data. Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.

    • split_by_id_a – Type: boolean. Optional. Used in the model_exploration link task. When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a “A304BT” has three potential matches in the training data, one each to histid_b “B200”, “C201”, and “D425”, all of those potential matches would end up in either the “train” split or the “test” split when evaluating the model performance.

    • feature_importances – Type: boolean. Optional, and currently not functional. Whether to record feature importances for the training features when training or evaluating an ML model.

[training]
independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"]
scale_data = false
dataset = "/path/to/1900_1910_training_data_20191023.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true

score_with_model = true
feature_importances = true

decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 10
param_grid = false
model_parameters = [
  { type = "random_forest", maxDepth = 6, numTrees = 50 },
  { type = "probit", threshold = 0.5}
]

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }

Household training and models

  • Header name: hh_training

  • Description: Specifies the household training data set as well as a number of attributes related to training a model, including the dependent variable within that data set, the independent variables created from the comparison_features section, and the different models you want to use.

  • Required: False

  • Type: Object

  • Attributes:

    • All of the attributes and models available in training may also be used here.

    • prediction_col – Type: string. Required. The name of the column where the final prediction value is recorded in the individual linking round scoring step.

    • hh_col – Type: string. Required. The name of the column with the household identifier.

[hh_training]
prediction_col = "prediction"
hh_col = "serialp"

independent_vars = ["namelast_jw","namefrst_jw","namefrst_std_jw", "jw_max_a", "jw_max_b", "f1_match", "f2_match", "byrdifcat", "racematch", "imm", "bplmatch", "imm_interacted_bplmatch", "sexmatch", "mardurmatch", "relatetype", "relatematch", "relatetype_interacted_relatematch"]

scale_data = false
dataset = "/path/to/hh_training_data_1900_1910.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true
score_with_model = true
feature_importances = true
decision = "drop_duplicate_with_threshold_ratio"

param_grid = true
n_training_iterations = 10
model_parameters = [
    { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]},
    { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]}
]

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
\ No newline at end of file
diff --git a/docs/feature_selection_transforms.html b/docs/feature_selection_transforms.html
new file mode 100644
index 0000000..914aeda
--- /dev/null
+++ b/docs/feature_selection_transforms.html
@@ -0,0 +1,246 @@
Feature Selection transforms — hlink 2.0.0 documentation

Feature Selection transforms


Each header below represents a feature selection transform. These transforms are used in the context of feature_selections.

[[feature_selections]]
input_column = "clean_birthyr"
output_column = "replaced_birthyr"
condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
transform = "sql_condition"

There are some additional attributes available for all transforms: checkpoint, override_column_a, override_column_b, set_value_column_a, set_value_column_b.

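These shared attributes are not documented further on this page. As a rough sketch of the usage their names suggest (an assumption, not confirmed here), override_column_a swaps in a different input column for dataset A only:

[[feature_selections]]
# assumed semantics: dataset A reads from namelast_alt (a hypothetical column)
# while dataset B still reads from namelast_clean
input_column = "namelast_clean"
override_column_a = "namelast_alt"
output_column = "namelast_bigrams"
transform = "bigrams"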

bigrams


Split the given string column into bigrams.

  • Attributes:

    • input_column - Type: string. Required.

    • output_column - Type: string. Required.

    • no_first_pad - Type: boolean. Optional. If set to true, don’t prepend a space “ ” to the column before splitting it into bigrams. If false or not provided, do prepend the space.

[[feature_selections]]
input_column = "namelast_clean"
output_column = "namelast_clean_bigrams"
transform = "bigrams"

sql_condition


Apply the given SQL.

  • Attributes:

    • condition - Type: string. Required. The SQL condition to apply.

    • output_column - Type: string. Required.

[[feature_selections]]
input_column = "clean_birthyr"
output_column = "replaced_birthyr"
condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
transform = "sql_condition"

array


Combine two input columns into an array output column.

  • Attributes:

    • input_columns - Type: list of strings. Required. The two input columns.

    • output_column - Type: string. Required.

[[feature_selections]]
input_columns = ["namelast_clean_bigrams", "namefrst_unstd_bigrams"]
output_column = "namelast_frst_bigrams"
transform = "array"

union


Take the set union of two columns that are arrays of strings, returning another +array of strings.

  • Attributes:

    • input_columns - Type: list of strings. Required.

    • output_column - Type: string. Required.
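
No example is given for union in the original page; here is a minimal sketch reusing the bigram columns from the array example above:

[[feature_selections]]
input_columns = ["namelast_clean_bigrams", "namefrst_unstd_bigrams"]
output_column = "name_bigrams_union"
transform = "union"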

soundex


Compute the soundex encoding of the input column.

  • Attributes:

    • input_column - Type: string. Required.

    • output_column - Type: string. Required.

[[feature_selections]]
input_column = "namelast_clean"
output_column = "namelast_clean_soundex"
transform = "soundex"

power


Raise the input column to a given power.

  • Attributes:

    • input_col - Type: string. Required.

    • output_col - Type: string. Required.

    • exponent - Type: int. Required. The power to which to raise the input column.

[[feature_selections]]
input_col = "ncount"
output_col = "ncount2"
transform = "power"
exponent = 2
\ No newline at end of file
diff --git a/docs/genindex.html b/docs/genindex.html
new file mode 100644
index 0000000..4559ac5
--- /dev/null
+++ b/docs/genindex.html
@@ -0,0 +1,117 @@
Index — hlink 2.0.0 documentation

Index

\ No newline at end of file
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000..a2a53e7
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,236 @@
Welcome to hlink’s documentation! — hlink 2.0.0 documentation

Configuration API

\ No newline at end of file
diff --git a/docs/installation.html b/docs/installation.html
new file mode 100644
index 0000000..46eceee
--- /dev/null
+++ b/docs/installation.html
@@ -0,0 +1,141 @@
Installation — hlink 2.0.0 documentation

Installation


Requirements


Make sure that you have each of these installed on your system.


Installing the program


In the root project directory, run pip install .


To install hlink for development work, run pip install -e .[dev]. This will install additional development dependencies and install hlink in editable mode so that any changes made to the source code are automatically built.

\ No newline at end of file
diff --git a/docs/introduction.html b/docs/introduction.html
new file mode 100644
index 0000000..968f1f8
--- /dev/null
+++ b/docs/introduction.html
@@ -0,0 +1,141 @@
Introduction — hlink 2.0.0 documentation

Introduction


Overview


hlink is designed to link two datasets. It allows for probabilistic and deterministic record linkage. It provides functionality for the following production tasks:

  1. Preprocessing: preprocess each dataset to clean and transform it in preparation for linking.

  2. Training: train machine learning models on a set of features and compare results between models.

  3. Matching: match two datasets using a model created in training or with deterministic rules.

  4. Household Training: train machine learning models on a set of features for households and compare results between models.

  5. Household Matching: match households between two datasets.

In addition, it provides functionality for the following research/development tasks:

  1. Model Exploration and Household Model Exploration: use a matrix of models and hyper-parameters to evaluate model performance and select a model to be used in the production run. Also generates reports of suspected false positives and false negatives in the specified training data set if the appropriate config flag is set.

  2. Reporting: generate reports on the linked data.
\ No newline at end of file
diff --git a/docs/link_tasks.html b/docs/link_tasks.html
new file mode 100644
index 0000000..1a281be
--- /dev/null
+++ b/docs/link_tasks.html
@@ -0,0 +1,221 @@
Link Tasks — hlink 2.0.0 documentation

\ No newline at end of file
diff --git a/docs/models.html b/docs/models.html
new file mode 100644
index 0000000..fbde7e7
--- /dev/null
+++ b/docs/models.html
@@ -0,0 +1,199 @@
Models — hlink 2.0.0 documentation

Models


These are models available to be used in the model evaluation, training, and household training link tasks.

  • Attributes for all models:

    • threshold – Type: float. Alpha threshold (model hyperparameter).

    • threshold_ratio – Type: float. Beta threshold (de-duplication distance ratio).

    • Any parameters available in the model as defined in the Spark documentation can be passed as params using the label given in the Spark docs. Commonly used parameters are listed below with descriptive explanations from the Spark docs.
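
For example, a sketch passing a standard Spark parameter through to the underlying classifier (maxIter is a pyspark.ml.classification.LogisticRegression parameter; the value is illustrative):

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0, maxIter = 50 }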

random_forest


Uses pyspark.ml.classification.RandomForestClassifier. Returns probability as an array.

  • Parameters:

    • maxDepth – Type: int. Maximum depth of the tree. Spark default value is 5.

    • numTrees – Type: int. The number of trees to train. Spark default value is 20; must be >= 1.

    • featureSubsetStrategy – Type: string. Per the Spark docs: “The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].”

model_parameters = { type = "random_forest", maxDepth = 5, numTrees = 75, featureSubsetStrategy = "sqrt", threshold = 0.15, threshold_ratio = 1.0 }

probit


Uses pyspark.ml.regression.GeneralizedLinearRegression with family="binomial" and link="probit".

model_parameters = { type = "probit", threshold = 0.85, threshold_ratio = 1.2 }

logistic_regression


Uses pyspark.ml.classification.LogisticRegression.

chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }

decision_tree


Uses pyspark.ml.classification.DecisionTreeClassifier.

  • Parameters:

    • maxDepth – Type: int. Maximum depth of the tree.

    • minInstancesPerNode – Type: int. Per the Spark docs: “Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.”

    • maxBins – Type: int. Per the Spark docs: “Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.”

chosen_model = { type = "decision_tree", maxDepth = 6, minInstancesPerNode = 2, maxBins = 4 }

gradient_boosted_trees


Uses pyspark.ml.classification.GBTClassifier.

  • Parameters:

    • maxDepth – Type: int. Maximum depth of the tree.

    • minInstancesPerNode – Type: int. Per the Spark docs: “Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.”

    • maxBins – Type: int. Per the Spark docs: “Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.”

chosen_model = { type = "gradient_boosted_trees", maxDepth = 4, minInstancesPerNode = 1, maxBins = 6, threshold = 0.7, threshold_ratio = 1.3 }
\ No newline at end of file
diff --git a/docs/objects.inv b/docs/objects.inv
new file mode 100644
index 0000000..4b7b5f9
Binary files /dev/null and b/docs/objects.inv differ
diff --git a/docs/pipeline_features.html b/docs/pipeline_features.html
new file mode 100644
index 0000000..24f1261
--- /dev/null
+++ b/docs/pipeline_features.html
@@ -0,0 +1,172 @@
Pipeline generated features — hlink 2.0.0 documentation

Pipeline generated features


Transformer types


Each header below represents a feature created using a transformation available through the Spark Pipeline API. These transforms are used in the context of pipeline_features.

[[pipeline_features]]
input_column = "immyear_diff"
output_column = "immyear_caution"
transformer_type = "bucketizer"
categorical = true
splits = [-1,0,6,11,9999]

[[pipeline_features]]
input_columns = ["race","srace"]
output_column = "race_interacted_srace"
transformer_type = "interaction"

interaction


Interact two or more features, creating a vectorized result.

[[pipeline_features]]
# interact the categorical features for mother caution flag, mother present flag, and mother jaro-winkler score
input_columns = ["m_caution", "m_pres", "jw_m"]
output_column = "m_interacted_jw_m"
transformer_type = "interaction"

bucketizer


From the pyspark.ml.feature.Bucketizer() docs: “Maps a column of continuous features to a column of feature buckets.”

  • Attributes:

    • splits – Type: array of numbers. Required for this transformer_type. Per the pyspark.ml.feature.Bucketizer() docs: “Split points for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. The splits should be of length >= 3 and strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors.”

[[pipeline_features]]
input_column = "relate_a"
output_column = "relatetype"
transformer_type = "bucketizer"
categorical = true
splits = [1,3,5,9999]
\ No newline at end of file
diff --git a/docs/running_the_program.html b/docs/running_the_program.html
new file mode 100644
index 0000000..6c46fde
--- /dev/null
+++ b/docs/running_the_program.html
@@ -0,0 +1,361 @@
Running hlink — hlink 2.0.0 documentation

\ No newline at end of file
diff --git a/docs/search.html b/docs/search.html
new file mode 100644
index 0000000..b747bd8
--- /dev/null
+++ b/docs/search.html
@@ -0,0 +1,136 @@
Search — hlink 2.0.0 documentation

Search

\ No newline at end of file
diff --git a/docs/searchindex.js b/docs/searchindex.js
new file mode 100644
index 0000000..41d498f
--- /dev/null
+++ b/docs/searchindex.js
@@ -0,0 +1 @@
],feature_import:[2,12],feature_nam:2,feature_select:[2,3],featuresubsetstrategi:8,fed:2,femal:[2,11],ferri:0,fetch:1,fetch_a:2,fewer:[1,8],fi:1,file:[1,4,7,10,11,12],filepath:10,fill:1,filter:[1,4,7,11],find:[1,12],finish:10,first:[0,1,2,10,11],first_init_col:1,five:10,fix:7,flag:[1,6,7,9,10,12],floor:[0,4],follow:[0,1,6,10,11,12],foreign:1,forest:[4,8],form:[1,11],fort:0,four:1,framework:12,from:[0,1,2,7,8,9,10,12],from_icpsrctyi:1,from_statefip:1,frt:0,fry:0,fsoundex:[2,12],ft:0,full:[2,12],full_count_1870_1880:10,full_count_1900_1910:12,fullcount_1870_1880:10,further:12,gbtclassifi:8,gen:1,gener:[0,1,4,6,7,10],generalizedlinearregress:8,geo:1,geograph:1,get:[0,1,4,10],get_floor:2,get_set:10,get_step:10,get_tabl:10,get_task:10,give:[0,2],given:[0,1,2,3,8,12],go:[2,10],gradient:[4,8],greater:1,greatest:1,group:2,gt:1,h:10,ha:[1,2,10,12],handl:10,have:[1,5,8,10,12],haven:2,head:0,header:[0,1,2,3,9,11],height:0,help:[2,10],here:[2,10,12],hh:1,hh_blocked_match:10,hh_col:2,hh_comparison:[2,7],hh_match:10,hh_model_eval_repeat_fn:10,hh_model_eval_repeat_fp:10,hh_model_eval_training_data:10,hh_model_eval_training_featur:10,hh_model_eval_training_result:10,hh_model_eval_training_vector:10,hh_model_explor:[7,10],hh_potential_match:10,hh_potential_matchs_prep:10,hh_predicted_match:10,hh_repeat_fn:12,hh_repeat_fp:12,hh_scored_potential_match:10,hh_train:[1,2,7,10,12],hh_training_data:10,hh_training_data_1900_1910:2,hh_training_featur:[10,12],hh_training_result:12,hidden:10,hierarchi:10,high:10,highest:[1,2],histid:[1,2,12],histid_col:1,histor:10,hit:[2,10,12],hits2:[2,12],hlink:[1,2,5,6,12],hold:9,hot:2,household:[0,4,6,8,10,12],how:2,howev:12,ht:0,hundr:0,hyper:[2,6,12],hyperparamet:[8,12],hyphen:[0,4],i:10,id:[1,2],id_column:[1,2],ident:12,identifi:[2,12],if_valu:0,ii:[0,2],iii:[0,2],imm:[1,2,12],imm_interacted_bplmatch:2,imm_interacted_immyear_caut:[2,12],immigr:1,immyear_caut:[2,9],immyear_diff:[1,2,9,12],implement:12,includ:[1,2,9,10],increas:[2,9],independ:2,independent_var:[1,2,12],index:[0,4],indic:[0,1,12],individu:[1,2,7,12],inf:9,inform:[0,1,2,10],ingest:7,initi:[0,1,10],input:[0,1,2,3,10,11],input_col:3,input_column:[2,3,9],input_table_nam:10,instal:4,instanc:[0,8],instead:[1,2],instruct:10,integ:[0,1,2,9],interact:[2,4,12],interfac:10,intermedi:10,introduct:4,invalid:[1,8],ipum:0,ipython:10,island:0,isn:12,isrdi:10,istemporari:10,item:0,iter:2,its:10,iv:2,jaro:[1,9],jaro_winkl:2,java:5,job:10,join:[1,11],join_column:[2,11],join_valu:[2,11],jr:[0,2],json:[2,10],just:[0,1,2,10,12],jw:1,jw_f:[1,2,12],jw_m:[2,9,12],jw_max_a:2,jw_max_b:2,jw_sp:[2,12],jw_street:1,jw_threshold:1,keep:2,kei:[0,1,7,10],kept:2,key_count:1,know:2,known:0,ky:0,label:8,last:[0,1,9],launch:[10,12],law:0,ldg:0,lead:0,learn:[2,6,7,10,12],least:0,leav:0,left:8,length:[1,2,9],less:[1,2],let:10,letter:[0,4],level:[1,4,10],leverag:2,librari:4,like:[0,2,10],limit:2,line:10,link:[0,1,2,4,6,8],link_run:10,linkag:6,linkrun:10,list:[0,1,2,3,4,8,10,11],ll:10,load:10,load_conf_fil:10,load_config:10,loc_a:1,loc_a_0:1,loc_a_1:1,loc_b:1,loc_b_0:1,loc_b_1:1,locat:[1,2,10],lodg:0,log2:8,logic:0,logist:[4,8],logistic_regress:[2,12],logisticregress:8,look:[1,10,11],lookup:1,lower:[0,1],lowercas:[0,4],lowercase_strip:2,lowest:0,lr:10,lsoundex:[2,12],m:1,m_caution:[1,2,9,12],m_interacted_jw_m:[2,9,12],m_pre:[2,9,12],machin:[2,6,7,10,12],made:5,mai:[0,2,10],main:10,mainli:1,major:[1,10],make:[0,2,5,12],male:[2,11],mani:10,manual:12,map:[4,7,9],mardurmatch:[1,2],marriag:1,match:[0,1,4,6,10,11,12],matches_df:10,matrix:[6,12],max:[1,8,10],max
bin:8,maxdepth:[2,8,12],maximum:[1,8],maximum_jw:1,mbpl:1,mbplmatch:[2,12],mean:0,meet:1,member:[1,7],memori:10,men:2,meso:10,messag:10,method:[10,12],mi0:1,mi1:1,mi:[2,12],mid_init_col:1,mid_init_match:1,middl:[0,1],might:12,mill:0,minimum:[1,8],mininstancespernod:8,minu:[1,2],mismatch:1,miss:10,ml:[0,2,4,8,9],mode:[4,5,12],model:[4,6,10],model_eval_repeat_fn:10,model_eval_repeat_fp:10,model_eval_training_data:10,model_eval_training_featur:10,model_eval_training_result:10,model_eval_training_vector:10,model_explor:[2,7,10,12],model_paramet:[2,7,8,12],modul:10,momloc:1,more:[0,2,7,9,10,12],most:[0,1,10],mother:9,mother_birthyr:1,mount:0,mountain:0,mountin:0,mr:0,ms:0,mt:0,mtn:0,much:1,mult:1,multipl:[0,1,2,10],multipli:1,must:[2,8,9,10,11],my:[10,12],my_conf:10,my_fil:2,myriad:2,n:[0,8,9],n_training_iter:[2,7,12],name:[0,1,2,4,10,11],name_col:1,name_std:[2,11],namefrst:[0,1,2],namefrst_a:1,namefrst_b:1,namefrst_clean:[0,2],namefrst_init:1,namefrst_jw:[1,2,12],namefrst_mid_init:[0,1],namefrst_mid_init_2:1,namefrst_rel:1,namefrst_related_row:1,namefrst_split:[0,2],namefrst_std:[2,11],namefrst_std_jw:[2,12],namefrst_unstd:[1,2],namefrst_unstd_bigram:3,namelast:[1,2],namelast_a:1,namelast_b:1,namelast_bigram:2,namelast_clean:[1,2,3],namelast_clean_bigram:[2,3],namelast_clean_soundex:3,namelast_equal_as_int:1,namelast_frst_bigram:3,namelast_jw:[2,12],namelast_neighbor:1,namelast_popular:1,namelast_popularity_sum:1,nativ:1,nbor:[1,2,12],ncount2:[2,3,12],ncount:[2,3,12],nearest:0,necessari:12,need:[0,1,2,7,10,12],neg:[2,4,6],neighbor:1,neighborhood:1,neither:1,new_marr:[1,2],newli:2,no_first_pad:3,node:8,non:0,nor:1,north:0,note:2,noth:0,now:10,null_fil:2,num:10,number:[0,1,2,7,8,10],numer:1,numtre:[2,8,12],object:[1,2,10],often:[0,10],onc:[1,10],one:[0,1,2,4],ones:0,onethird:8,onli:[0,1,2,12],ons:4,oper:2,option:[1,2,3,8,10,12],order:[0,2],organ:10,original_valu:2,oth:[2,12],other:[1,2,12],otherwis:[0,1,9,12],our:10,out:[2,12],output:[0,1,2,3,7,10,12],output_col:3,output_column:[2,3,9],output_suspicious_td:[2,12],output_table_nam:10,output_typ:2,outsid:9,override_column_a:3,override_column_b:3,overview:4,p:2,page:[1,10],pair:12,param:[8,12],param_grid:[2,12],paramet:[2,6,8,10,12],parent:[0,1,11],parent_step_chang:1,park:0,parkwai:0,parquet:[2,7],part:2,particular:2,particularli:1,partit:12,pass:[2,8],path:[1,2,10,11,12],pattern:10,peopl:[0,1,10],per:[1,2,8,9,10],percent:7,percentag:1,perform:[2,6,7,11],persist:10,person:[0,1],pike:0,pip:5,pipelin:4,pipeline_featur:[2,9],piplin:2,pkwy:0,pl:0,place:0,placehold:1,pleas:10,plu:2,point:[0,9,12],popul:7,posit:[0,2,4,6],possibl:2,post:2,potenti:[1,4,7],potential_match:[7,10],potential_matches_prep:10,potential_matches_univers:2,power:4,pr:0,prairi:0,predict:[2,12],predicted_match:10,prediction_col:2,preexist:10,prefer:12,prefix:[0,4],prep:2,prep_step:10,prepar:[6,7,10],prepend:3,prepped_df_a:10,prepped_df_b:10,preprocess:[4,6,10,12],present:[1,2,9],primari:1,print:10,prk:0,probabalist:[],probabilist:[2,6],probabl:[0,2,8],probit:[2,4],proceed:11,process:[2,10],produc:[0,10],product:[1,6,12],program:[1,2,4,7,12],project:5,prompt:10,provid:[3,6,9,10],pt:0,pull:7,punctuat:[0,4],put:[10,12],py:[1,2],pyspark:[8,9,10],python:[5,10],q:[10,12],qmark:[0,4],queri:[1,2],r:[],race:[0,1,2,9,12],race_interacted_srac:[2,9,12],racematch:2,rad:0,radial:0,rais:[1,3],random:[4,8],random_forest:[2,12],randomforestclassifi:8,rang:[2,9],rate:1,ratio:[2,8],ration:[0,4],rationalize_name_word:2,raw:[0,2,7,10],raw_df_a:10,raw_df_b:10,rd:0,re:0,read:[1,7,10],readm:2,recod:0,record
:[1,2,6],refer:[0,2],regex:4,regex_word_replac:11,region:[2,12],region_dict:2,regionf:2,regist:[7,10],regress:[4,8],regular:12,rel:[1,2,12],relat:[0,1,2],relate_a:[2,9],relate_col:1,relate_div_100:[0,1,2],related_individual_max_jw:1,related_individual_row:1,relatematch:[1,2],relatetyp:[2,9],relatetype_interacted_relatematch:2,relev:12,reload:10,remain:7,remov:[0,4],remove_alternate_nam:2,remove_qmark_hyphen:2,remove_suffix:2,repeat_fn:12,repeat_fp:12,repeatedli:2,replac:[0,4],replace_apostroph:2,replaced_birthyr:[1,2,3],report:[1,4,6,10],repres:[0,1,2,3,9,10],represent:[2,7],reproduc:10,request:7,requir:[1,2,3,4,9,10,11],research:6,respect:7,rest:0,result:[0,1,6,9,10,12],reus:4,right:[8,10],risk:1,riv:0,river:0,road:0,robust:2,root:5,round:[0,2,7],row:2,rst:0,rule:6,run:[4,5,6,7,12],run_all_step:[2,10,12],run_step:10,s:[0,2,10,11],sai:10,same:[1,2,7,10],sampl:1,save:[2,7,12],scala:1,scale:2,scale_data:[2,12],scenario:12,score:[1,2,7,9],score_with_model:[2,12],scored_potential_match:10,scratch:2,script:[2,10],second:[0,1,2,11],secondari:1,secondary_distance_col:1,secondary_distances_fil:1,secondary_key_count:1,secondary_loc_a:1,secondary_loc_b:1,secondary_source_column:1,secondary_table_nam:1,section:[0,1,2,7,12],see:[1,2,7,10,12],seen:1,select:[0,1,4,6,10,12],seri:10,serialp:2,serv:1,set:[0,1,2,3,6,7,10,12],set_executor_memori:10,set_link_task:10,set_loc:10,set_num_cor:10,set_preexisting_t:10,set_print_sql:10,set_value_column_a:3,set_value_column_b:3,sever:1,sex:[1,2,11],sex_equ:2,sex_region_interact:2,sex_regionf_interact:2,sexmatch:2,sgen:[1,2,12],should:[1,2,8,9,10],show:10,showf:10,shut:12,sibl:0,sign:1,signific:0,similar:1,simpli:[0,2],sinc:[0,2],singl:[0,2,10,12],size:1,skip:[2,7],small:2,smallest:2,so:[0,1,2,5,12],some:[0,1,3,10],someth:10,soundex:4,sourc:[1,4,5,7,10,12],source_column_a:1,source_column_b:1,south:0,sp:1,sp_caution:[1,2,12],sp_interacted_jw_sp:[2,12],sp_pre:[2,12],space:[0,2,3,11],span:12,spark:[2,8,9,10,12],sparkfactori:10,sparksess:10,specif:[1,2,10],specifi:[0,1,2,6,7,9,10,11],spg:0,split:[2,3,4,7,8,9,12],split_by_id_a:[2,12],sploc:1,spous:0,spouse_birthyr:1,spouse_bpl:1,spring:0,sq:0,sql:[0,1,2,3,4,10],sql_condit:2,sqrt:8,squar:[0,1],sr:[0,2],srace:[2,9,12],st:0,sta:0,stage:2,standard:[1,11],start:[0,11],state:1,state_1900_1910_distances_km:1,state_dist:1,state_distance_lookup:1,statecode1:1,statecode2:1,statefip:[1,2],statefip_h:2,station:0,step:[0,2],stepmom:1,still:11,stop:[0,4],str:0,street:[0,1],street_jw:[1,2,12],street_unstd:11,strictli:9,string:[0,1,2,3,8,10,11],strip:[0,4],strt:0,sub:1,subhead:11,subset:[2,11],substitut:[4,7],substitution_column:[2,11],substitution_fil:[2,11],substitutions_street_abbrev:11,substr:4,subtract:1,suffix:[0,4],suppli:11,support:8,sure:[2,5,10],surnam:1,suspect:[1,6],swap:11,system:5,t:[1,2,3,12],tabl:[1,2,4,7,10,12],table_nam:1,tablenam:10,tag:7,take:[0,1,2,3,10],taken:1,task:[2,4,6,8,12],task_nam:10,tell:2,templat:1,ten:[1,2],ter:0,terrac:0,test:[2,7,12],text:10,than:[0,1,2,8],thei:[0,1,2,10],them:[0,1],thi:[0,1,2,5,7,9,10,12],those:[1,2,7],though:1,three:2,threshold:[2,8,12],threshold_expr:2,threshold_ratio:[2,8,12],through:[9,10],thu:1,time:[2,10],to_icpsrctyi:1,to_statefip:1,togeth:[1,2],toml:[2,10],top:[1,4],topic:10,track:0,trail:0,train:[1,4,6,8,10],training_data:[2,10],training_data_1900_1910:12,training_data_1900_1910_hlink_featur:12,training_data_subset:2,training_featur:[10,12],training_result:12,transform:[2,4,6,7],transformer_typ:[2,9],treat:[2,9],tree:[4,8],trnpk:0,tune:12,turnpik:0,tutori:[2,10],two:[0,1,2,3,6,7,
9,10,12],txt:[],type:[0,2,3,4,8,10,11,12],typic:2,un:0,under:[0,1,2],union:[0,4],uniqu:2,univers:4,unknown:1,unrel:1,unstabl:10,up:[1,2,10,11],updat:12,upper:1,upper_threshold:2,us1900:2,us1900m_usa:2,us1910:2,us1910m_usa:2,us:[0,1,2,3,4,6,7,8,9,11,12],usag:10,use_potential_matches_featur:12,use_training_data_featur:[2,7,12],user:10,usual:[0,12],v:2,vallei:0,valli:0,valu:[0,1,2,4,8,9,10,11],vari:[0,2],variabl:[0,2,12],variant:11,ve:12,vector:[2,9],veri:12,version:[0,12],vi:2,via:0,viaduct:0,vii:2,viii:2,vill:0,villag:0,villiag:0,volumn:1,w:0,wa:[1,12],wai:[1,10],want:[1,2,10,12],washington:2,we:[0,1,10,12],well:[0,2],were:2,weren:12,west:0,what:[1,2,10,12],when:[0,1,2,3,4,12],where:[1,10,12],whether:[1,2,11],which:[0,1,2,3,9,10,12],white:0,whitespac:[0,4],who:1,whole:0,width:7,winkler:[1,9],within:[1,2,10,11],wl:0,word:[0,4],work:[0,2,5,10,12],workflow:4,would:[1,2,12],write:[10,12],written:2,x:[9,10],x_crosswalk:10,x_hh_tfam:10,x_hh_tfam_2a:10,x_hh_tfam_2b:10,x_load:10,x_parquet_from_csv:10,x_persist:10,x_sql:10,x_sqlf:10,x_summari:10,x_tab:10,x_tfam:10,x_tfam_raw:10,x_union:10,y:9,year:[1,2,3,4],year_b:1,yet:10,you:[1,2,5,10,11,12],your:[2,5,10,12],yrimmig:1,zero:1},titles:["Column mapping transforms","Comparison types, transform add-ons, aggregate features, and household aggregate features","Configuration","Feature Selection transforms","Welcome to hlink\u2019s documentation!","Installation","Introduction","Link Tasks","Models","Pipeline generated features","Running hlink","Substitutions","Advanced Workflow Examples"],titleterms:{"1":11,"export":12,abs_diff:1,add:1,add_to_a:0,advanc:[2,12],after:12,aggreg:1,alia:1,all_equ:1,any_equ:1,api:4,arrai:3,array_index:0,b_minus_a:1,basic:2,bigram:3,block:2,btwn_threshold:1,bucket:9,caution_comp_3:1,caution_comp_4:1,column:[0,2],comparison:[1,2],concat_to_a:0,concat_to_b:0,condense_strip_whitespac:0,config:2,configur:[2,4],data:[2,11,12],decision_tre:8,differ:12,divide_by_int:0,document:4,either_are_0:1,either_are_1:1,equal:1,equals_as_int:1,exact_mult:1,exampl:[10,12],explor:[7,12],extra_children:1,f1_match:1,f2_match:1,fals:12,featur:[1,2,3,9,12],fetch_a:1,fetch_b:1,file:2,filter:2,fn:12,fp:12,gener:[2,9,12],geo_dist:1,get_floor:0,gradient_boosted_tre:8,gt_threshold:1,hit:1,hits2:1,hlink:[4,10],household:[1,2,7],instal:5,interact:[9,10],introduct:6,jaro_winkl:1,jaro_winkler_r:1,jaro_winkler_street:1,jw_max_a:1,jw_max_b:1,length_b:1,level:2,librari:10,link:[7,10,12],list:12,logistic_regress:8,look_at_addl_var:1,lower_threshold:1,lowercase_strip:0,map:[0,2],match:[2,7],max_jaro_winkl:1,maximum_jaro_winkl:1,ml:12,mode:10,model:[2,7,8,12],neg:12,neither_are_nul:1,not_equ:1,ons:1,overview:[6,7],pipelin:[2,9],posit:12,potenti:[2,12],power:[1,3],preprocess:7,present_and_not_equ:1,present_both_year:1,probit:8,program:[5,10],random_forest:8,rationalize_name_word:0,regex:11,rel_jaro_winkl:1,remove_alternate_nam:0,remove_one_letter_nam:0,remove_prefix:0,remove_punctu:0,remove_qmark_hyphen:0,remove_stop_word:0,remove_suffix:0,replac:11,replace_apostroph:0,report:7,requir:5,reus:12,run:10,s:4,second_gen_imm:1,select:[2,3],soundex:3,sourc:2,split:0,sql_condit:3,start:10,step:[7,10],substitut:[2,11],substr:0,sum:1,tabl:11,task:[7,10],threshold:1,time:1,top:2,train:[2,7,12],transform:[0,1,3,9],type:[1,9],union:3,univers:2,upper_threshold:1,us:10,welcom:4,when_valu:0,word:11,workflow:[10,12],year:12}}) \ No newline at end of file diff --git a/docs/substitutions.html b/docs/substitutions.html new file mode 100644 index 0000000..c84bac5 --- /dev/null +++ 
b/docs/substitutions.html @@ -0,0 +1,187 @@
Substitutions — hlink 2.0.0 documentation
Substitutions

  • Parent header: substitution_columns
  • Subheader name: substitutions
  • Type: List
  • Attributes:
    • substitution_file – Type: string. Required. Path to the file containing the look-up table to join against for replacement values.

You must supply a substitution file and either specify regex_word_replace=true or supply a join value.


1:1 substitution by data table


Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.

  • Attributes:
    • join_column – Type: string. Column to filter input data on.
    • join_value – Type: string. Value to filter for in the input data.

    [[substitution_columns]]
    column_name = "namefrst_std"

    [[substitution_columns.substitutions]]
    join_column = "sex"
    join_value = "1"
    substitution_file = "/path/to/name_std/male.csv"

    [[substitution_columns.substitutions]]
    join_column = "sex"
    join_value = "2"
    substitution_file = "/path/to/name_std/female.csv"
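
For intuition, here is a minimal Python sketch of these semantics. The name pairs shown are hypothetical, and hlink actually performs this substitution as a join inside Spark during preprocessing; the sketch only illustrates the lookup rule described above.

    import csv

    # Hypothetical male.csv contents, one "variant,standard" pair per row:
    #   wm,william
    #   benj,benjamin
    with open("/path/to/name_std/male.csv") as f:
        lookup = dict(csv.reader(f))  # assumes exactly two columns per row

    def substitute(namefrst_std, sex):
        # Only records matching the join_value ("1" here) are eligible, and
        # the whole column value must equal the file's first column to swap.
        if sex == "1":
            return lookup.get(namefrst_std, namefrst_std)
        return namefrst_std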

Substitution by regex word replace


Performs word replacement within a column’s data string (such as replacing the abbreviation Ave. in the string 7th Ave. with Avenue to create 7th Avenue).

  • Attributes:
    • regex_word_replace – Type: boolean. Whether or not to use regex matching on the input data to perform replacement. If true, the swap value will still be replaced if it is anywhere in the column data, as long as it is:
      • at the start of the column data string, or preceded by a space
      • at the end of the column data string, or followed by a space

    [[substitution_columns]]
    column_name = "street_unstd"

    [[substitution_columns.substitutions]]
    regex_word_replace = true
    substitution_file = "/path/to/dir/substitutions_street_abbrevs.csv"
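
The word-boundary rule above can be pictured with a short Python sketch. This is not hlink's implementation, just an illustration of the described behavior: the swap value is replaced only where it is bounded by the string edges or by spaces.

    import re

    def word_replace(value, swap, replacement):
        # Match `swap` only at the start of the string or after a space,
        # and only when followed by a space or the end of the string.
        pattern = r"(^| )" + re.escape(swap) + r"(?=( |$))"
        return re.sub(pattern, r"\g<1>" + replacement, value)

    word_replace("7th Ave.", "Ave.", "Avenue")  # -> "7th Avenue"
    word_replace("Avery St.", "Ave", "Avenue")  # -> "Avery St." (unchanged)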
\ No newline at end of file
diff --git a/docs/use_examples.html b/docs/use_examples.html
new file mode 100644
index 0000000..96d2b17
--- /dev/null
+++ b/docs/use_examples.html
@@ -0,0 +1,251 @@
Advanced Workflow Examples — hlink 2.0.0 documentation

Advanced Workflow Examples


Export training data after generating features to reuse in different linking years


It is common to have a single training data set that spans two linked years, which is then used to train a model that is applied to a different set of linked years. For example, we have a training data set that spans linked individuals from the 1900 census to the 1910 census. We use this training data to predict links in the full count 1900-1910 linking run, but we also use this training data to link year pairs 1910-1920, 1920-1930, and 1930-1940.


When this training data set is used for the years it was derived from, the only columns necessary are the HISTIDs identifying the individuals in the data and the dependent variable (usually a boolean match column) for the model training. Features for the machine learning model generation are created from the source data available in the full count run.


However, when this training data set is used for other years, the program does not have access to the source full count files and can’t generate the ML features from the given data. In this scenario, you need to save a copy of the training_features and hh_training_features Spark tables to .csv so that you can point to them in the other year pair runs, and set the use_potential_matches_features = true flag in both the training and hh_training sections of the configuration.


Example training data export with generated ML features

  1. Create a config file and put it in your hlink config directory.
  2. Launch the hlink program in interactive mode:

    hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G

  3. Run the preprocessing and training link tasks:

    hlink $ run_all_steps preprocessing training

  4. Ask the program what the arguments for the csv command are:

    hlink $ ? csv
    Writes a dataframe out to csv.
         Arg 1: dataframe
         Arg 2: path
         Arg 3 (optional): # of partitions

  5. Export the results using the csv command (a library-mode sketch of steps 2 through 5 follows this list):

    hlink $ csv training_features /my/output/training_data_1900_1910_HLINK_FEATURES.csv

  6. Continue with other linking work you might need to do with this year pair; otherwise, shut down the hlink framework for this pair of linking years:

    hlink $ q

  7. In the config file for the new year pairs (1910-1920, 1920-1930, etc.), point to this new file as your dataset, and set the use_training_data_features flag to true:

    # config file for 1910-1920 linking run using the 1900-1910 training data with hlink-generated features
    [training]

    # more configs here...

    dataset = "/path/to/training_data_1900_1910_HLINK_FEATURES.csv"
    dependent_var = "match"

    # This needs to be changed to `true` to use the features we just generated
    use_training_data_features = true

    # configs continue here...

  8. Launch the hlink program using your new config for the new year pair you want to link. Run your link tasks and export relevant data.
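
If you prefer scripting to the interactive shell, steps 2 through 5 can also be driven through the hlink library API demonstrated in examples/tutorial/tutorial.py later in this diff. The following is a minimal sketch, not a supported recipe: it assumes the LinkRun exposes a training task attribute analogous to the preprocessing one used in the tutorial script, and it omits executor memory settings, which this document only shows as a CLI flag.

    from hlink.linking.link_run import LinkRun
    from hlink.spark.factory import SparkFactory
    from hlink.configs.load_config import load_conf_file
    from hlink.scripts.lib.io import write_table_to_csv

    # Load the config (resolved per load_conf_file's lookup rules) and set up
    # a local Spark session, as in the tutorial script.
    config = load_conf_file("full_count_1900_1910")
    spark = SparkFactory().set_local().set_num_cores(50).create()
    link_run = LinkRun(spark, config)

    # Run the preprocessing and training tasks, then export the generated
    # training features table (assumed attribute: link_run.training).
    link_run.preprocessing.run_all_steps()
    link_run.training.run_all_steps()
    write_table_to_csv(
        link_run.spark,
        "training_features",
        "/my/output/training_data_1900_1910_HLINK_FEATURES.csv",
    )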

ML model exploration and export of lists of potential false positives/negatives in training data


hlink accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models. You can see example training and hh_training configuration sections that implement this in the training and household training sections of the configuration documentation.


The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data. This is calculated when running the train/test splits in the regular model exploration tasks if the output_suspicious_TD flag is true.


Example model exploration and FP/FN export workflow

  1. Create a config file that has a training and/or hh_training section with model parameters to explore. For example:

    [training]

    independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"]

    scale_data = false
    dataset = "/path/to/training_data_1900_1910.csv"
    dependent_var = "match"

    # This would need to be changed to `true` in a run between other years if your
    # source data years weren't identical to the linked years of your training data.
    use_training_data_features = false

    # VERY IMPORTANT if you want to output FPs/FNs
    output_suspicious_TD = true

    split_by_id_a = true
    score_with_model = true
    feature_importances = false
    decision = "drop_duplicate_with_threshold_ratio"
    param_grid = true
    n_training_iterations = 10
    model_parameters = [
        { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.0, 1.1]},
        { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]}
    ]

    # The chosen_model is the final selected model to use in the full count production
    # run. This is where you would manually update your config after running model
    # exploration and making decisions about your models and hyperparameters. This
    # section isn't used by the model exploration task.
    chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }

  2. Launch the hlink program in interactive mode:

    hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G

  3. Run the preprocessing and model exploration link tasks:

    hlink $ run_all_steps preprocessing model_exploration

  4. Export the results of the train/test split runs to csv for further analysis. For training params, the results will be in the training_results table, and for hh_training in the hh_training_results table.

    hlink $ csv training_results /my/output/1900_1910_training_results.csv

  5. Export the potential FPs and FNs to csv. For training params, the results will be in the repeat_FPs and repeat_FNs tables, and for hh_training in the hh_repeat_FPs and hh_repeat_FNs tables.

    hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv
    hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv

  6. Use your preferred methods to analyze the data you’ve just exported (a minimal inspection sketch follows this list). Update the chosen_model in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.
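
For the analysis in the last step, here is a minimal pandas sketch for a first look at the exported files. pandas is not part of hlink, and nothing is assumed here about the CSVs' columns beyond what their headers will show.

    import pandas as pd

    # Take a first look at the train/test split results exported in step 4.
    results = pd.read_csv("/my/output/1900_1910_training_results.csv")
    print(results.columns.tolist())  # see which models and metrics were written
    print(results.head())

    # The potential FPs and FNs exported in step 5 can be reviewed the same way.
    repeat_fps = pd.read_csv("/my/output/1900_1910_potential_FPs.csv")
    print(repeat_fps.head())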
+ + + + + + + \ No newline at end of file diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md new file mode 100644 index 0000000..2ccec09 --- /dev/null +++ b/examples/tutorial/README.md @@ -0,0 +1,99 @@ +# HLink Tutorial + +This is an example linking project that uses hlink to link people between two +datasets, `data/A.csv` and `data/B.csv`. Note that these datasets are not +included, so the example script will throw an error if run out of the box. + +## Dataset Overview + +The tutorial script supposes that each of the datasets has the following columns: +id, NAMEFRST, NAMELAST, AGE, SEX. In addition, it supposes that dataset B was +created 10 years after dataset A. Each record in the datasets represents a single +person. + +## The Config File and Linking Strategy + +To link these two datasets, we need to create a configuration file that directs +hlink on what operations to perform and how to determine when a link is found. For +our tutorial example, we'll use deterministic linking, but hlink is also capable +of using machine learning models to classify possible links between the datasets. + +Creating a config file is complicated. See the hlink documentation for a +more detailed explanation of the different config file sections and keys. + +The first step in creating a config file is describing the data to hlink. The +`id_column` key tells hlink the name of the id column which uniquely identifies +each record in the databases. In our case, this is just "id". The `datasource_a` +and `datasource_b` sections give hlink information about where to find the input +files. We give hlink the relative path to our data files in these sections. + +After describing the data to hlink, we need to think about our linking strategy. +How will we determine who links between the two datasets? Do we need to do any +data cleaning or reformatting to allow this? Are there any rules that exclude two +records from linking? + +In our tutorial example, here is the general linking strategy that we'll use: +- First, we say that two records cannot link if they have a different SEX between +A and B. +- We say that two records may link only if the difference between AGE in A and +AGE in B is within 2 years of 10, so 8 to 12. +- Finally, we use the Jaro-Winkler string comparison algorithm to compare each +candidate link's NAMEFRST and NAMELAST between the two datasets. If the names score +sufficiently high, then we have a link! + +The first two bullet points above correspond to the `blocking` section. In this +section, we separate records into different *blocks*. Then each record may link +only with other records in its block. In the case of SEX, this means that we split +the datasets roughly in two, creating two separate blocks where links may occur. + +For AGE, we have a range of allowable differences. So we *explode* each record +in dataset A, creating five new records which go into 5 different blocks. The +five records have five different ages, AGE - 2, AGE - 1, AGE, AGE + 1, and +AGE + 2. Dataset B is blocked on age, and the records from dataset A go into +the block corresponding to their exact age. This allows us to do fuzzy matching +of ages; each record in dataset A is allowed to match with records in dataset B +with an AGE anywhere from 8 to 12 years greater than the AGE in dataset A. + +The last bullet point corresponds to the `comparison_features` and `comparisons` +sections. 
In these sections, we tell hlink to compute the Jaro-Winkler score +between the NAMEFRST string in the dataset A record and the corresponding string +in the dataset B record, then compare the score against a threshold of 0.79 to +determine if it's a link or not. We do the same thing for NAMELAST, with a +threshold of 0.84. If a single record pair reaches both thresholds, then we call +it a link! This pair of records will end up in `potential_matches.csv` when the +script completes. + +It's very likely that the names in dataset A and dataset B are not consistently +formatted. This is where the `column_mappings` section comes in. It tells hlink +to perform some data cleaning in the preprocessing step before matching occurs. +The column mappings in the config file strip whitespace from the names and lowercase +them to remove discrepancies in formatting between the two datasets. + +Now that the config file is written, we can run hlink to generate some links. See +the next section for a description of the tutorial script that runs hlink. + +## The Tutorial Script + +The `tutorial.py` Python script contains code to load in the config file and run +hlink to generate potential links between the two datasets. It creates a `LinkRun`, +which is the main way to communicate with the hlink library. After analyzing the +config file for errors, it runs two link tasks: preprocessing and matching. + +The preprocessing task reads the data from the datasets in and does the data +cleaning and column mapping that we've asked it to do for us in the config file. + +The matching task does the real linking work, finding links between the two datasets. +It stores its results in a `potential_matches` spark table. The script saves this +table to the `potential_matches.csv` file. + +## Getting and Interpreting Results + +After running the tutorial script, we have a `potential_matches.csv` file that +contains data on potential links that hlink identified between the two datasets. +Each record in this dataset identifies a potential link. The id\_a and id\_b +columns identify the records in dataset A and dataset B that have been linked. +There are also some more fields that are useful for reviewing the links and confirming +that they look reasonable. Some links may be more reasonable than others! +Our linking strategy is deterministic and relatively simple, so it may catch +more or less links than another strategy. + diff --git a/examples/tutorial/tutorial.py b/examples/tutorial/tutorial.py new file mode 100644 index 0000000..d4804c9 --- /dev/null +++ b/examples/tutorial/tutorial.py @@ -0,0 +1,47 @@ +from hlink.linking.link_run import LinkRun +from hlink.spark.factory import SparkFactory +from hlink.configs.load_config import load_conf_file +from hlink.scripts.lib.io import write_table_to_csv +from hlink.scripts.lib.conf_validations import analyze_conf + + +def main(): + # First let's create a LinkRun object. This will be the main way that we + # interact with hlink. To create a LinkRun, we need to read in our config + # file and set up spark. + print("=== Loading config file") + config = load_conf_file("tutorial_config.toml") + print("=== Setting up spark") + # Create a SparkSession. Connect to the local machine, simulating a cluster. + spark = SparkFactory().set_local().set_num_cores(4).create() + + print("=== Creating the LinkRun") + link_run = LinkRun(spark, config) + + # Now we've got the LinkRun created. Let's analyze our config file to look + # for errors that could cause hlink to fail. 
+ print("=== Analyzing config file") + analyze_conf(link_run) + + # Alright! Our config file looks good to go. Let's run the steps we need. + # Since we're not using machine learning for our linking, this is fairly + # simple. First we'll preprocess the data and load it into spark by running + # all of the steps in the preprocessing link task. + print("=== Running preprocessing") + link_run.preprocessing.run_all_steps() + + # Now let's do the matching. We only need steps 0 and 1 because the last step + # is only applicable when we're using machine learning. + print("=== Running first two matching steps") + link_run.matching.run_step(0) + link_run.matching.run_step(1) + + # The matching task saves its output in the potential_matches spark table. + # Let's output this table to a CSV so that we can read it in later and look + # at our results! + print("=== Saving potential matches to potential_matches.csv") + write_table_to_csv(link_run.spark, "potential_matches", "potential_matches.csv") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorial/tutorial_config.toml b/examples/tutorial/tutorial_config.toml new file mode 100644 index 0000000..0743279 --- /dev/null +++ b/examples/tutorial/tutorial_config.toml @@ -0,0 +1,64 @@ +id_column = "id" +feature_selections = [] + +[datasource_a] +alias = "a" +file = "data/A.csv" + +[datasource_b] +alias = "b" +file = "data/B.csv" + +[[column_mappings]] +column_name = "NAMEFRST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "NAMELAST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "AGE" +transforms = [ + {type = "add_to_a", value = 10} +] + +[[column_mappings]] +column_name = "SEX" + +[[blocking]] +column_name = "SEX" + +[[blocking]] +column_name = "AGE_2" +dataset = "a" +derived_from = "AGE" +expand_length = 2 +explode = true + +[[comparison_features]] +alias = "NAMEFRST_JW" +column_name = "NAMEFRST" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "NAMELAST_JW" +column_name = "NAMELAST" +comparison_type = "jaro_winkler" + +[comparisons] +operator = "AND" + +[comparisons.comp_a] +comparison_type = "threshold" +feature_name = "NAMEFRST_JW" +threshold = 0.79 + +[comparisons.comp_b] +comparison_type = "threshold" +feature_name = "NAMELAST_JW" +threshold = 0.84 diff --git a/hlink/__init__.py b/hlink/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/configs/__init__.py b/hlink/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/configs/load_config.py b/hlink/configs/load_config.py new file mode 100755 index 0000000..4c62a32 --- /dev/null +++ b/hlink/configs/load_config.py @@ -0,0 +1,64 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pathlib import Path +import json +import toml + +from hlink.errors import UsageError + + +def load_conf_file(conf_name): + """Flexibly load a config file. + + Given a path `conf_name`, look for a file at that path. If that file + exists and has a '.toml' extension or a '.json' extension, load it and + return its contents. If it doesn't exist, look for a file with the same + name with a '.toml' extension added and load it if it exists. Then do the + same for a file with a '.json' extension added. 
+ + After successfully loading a config file, store the absolute path where the + config file was found as the value of the "conf_path" key in the returned + config dictionary. + + Args: + conf_name (str): the file to look for + + Returns: + dict: the contents of the config file + + Raises: + FileNotFoundError: if none of the three checked files exist + UsageError: if the file at path `conf_name` exists, but it doesn't have a '.toml' or '.json' extension + """ + candidate_files = [ + Path(conf_name), + Path(conf_name + ".toml"), + Path(conf_name + ".json"), + ] + + existing_files = filter((lambda file: file.exists()), candidate_files) + + for file in existing_files: + if file.suffix == ".toml": + with open(file) as f: + conf = toml.load(f) + conf["conf_path"] = str(file.resolve()) + return conf + + if file.suffix == ".json": + with open(file) as f: + conf = json.load(f) + conf["conf_path"] = str(file.resolve()) + return conf + + raise UsageError( + f"The file {file} exists, but it doesn't have a '.toml' or '.json' extension." + ) + + candidate_files_str = ", ".join(map(str, candidate_files)) + raise FileNotFoundError( + f"Couldn't find any of these three files: {candidate_files_str}" + ) diff --git a/hlink/errors.py b/hlink/errors.py new file mode 100755 index 0000000..11c3a47 --- /dev/null +++ b/hlink/errors.py @@ -0,0 +1,22 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +class SparkError(Exception): + """Catch any exceptions from Spark""" + + pass + + +class UsageError(Exception): + """ Incorrectly specified options """ + + pass + + +class DataError(Exception): + """ There is an issue in the source data that will cause a problem. """ + + pass diff --git a/hlink/linking/README.md b/hlink/linking/README.md new file mode 100644 index 0000000..eec5dcb --- /dev/null +++ b/hlink/linking/README.md @@ -0,0 +1,10 @@ +## Overview + +There is one base class,`LinkTask`, defined in the file: `link_task.py`. Each subpackage contains a class which inherits from the base `LinkTask`. + +All classes inheriting from `LinkTask`, directly correspond to a task that the user can do. For example, the `Preprocessing` class contains all the code for the `preprocessing` user task. + +## Templates + +The `templates` directory in each subpackage contains SQL file templates that are written using [jinja2](http://jinja.pocoo.org/docs/2.10/templates/). Putting all the sql in this directory allows for a seperation between the python code and the SQL code used. Using jinja2 templating allows for reuse of an sql file with slightly different parameters. + diff --git a/hlink/linking/__init__.py b/hlink/linking/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/linking/core/__init__.py b/hlink/linking/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/linking/core/classifier.py b/hlink/linking/core/classifier.py new file mode 100644 index 0000000..0efaf38 --- /dev/null +++ b/hlink/linking/core/classifier.py @@ -0,0 +1,104 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml.feature import SQLTransformer +from pyspark.ml.regression import GeneralizedLinearRegression +from pyspark.ml.classification import ( + RandomForestClassifier, + LogisticRegression, + DecisionTreeClassifier, + GBTClassifier, +) +import hlink.linking.transformers.rename_prob_column + + +def choose_classifier(model_type, params, dep_var): + """Returns a classifier and a post_classification transformer given model type and params. + + Parameters + ---------- + model_type: string + name of model + params: dictionary + dictionary of parameters for model + dep_var: string + the dependent variable for the model + + Returns + ------- + The classifer and a transformer to be used after classification. + + """ + post_transformer = SQLTransformer(statement="SELECT * FROM __THIS__") + features_vector = "features_vector" + if model_type == "random_forest": + classifier = RandomForestClassifier( + **{ + key: val + for key, val in params.items() + if key not in ["threshold", "threshold_ratio"] + }, + labelCol=dep_var, + featuresCol=features_vector, + seed=2133, + probabilityCol="probability_array", + ) + post_transformer = SQLTransformer( + statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__" + ) + + elif model_type == "probit": + classifier = GeneralizedLinearRegression( + family="binomial", + link="probit", + labelCol=dep_var, + featuresCol=features_vector, + predictionCol="probability", + ) + + elif model_type == "logistic_regression": + classifier = LogisticRegression( + **params, + featuresCol=features_vector, + labelCol=dep_var, + predictionCol="prediction", + probabilityCol="probability_array", + ) + post_transformer = SQLTransformer( + statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__" + ) + + elif model_type == "decision_tree": + classifier = DecisionTreeClassifier( + **params, + featuresCol=features_vector, + labelCol=dep_var, + probabilityCol="probability_array", + seed=2133, + ) + post_transformer = SQLTransformer( + statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__" + ) + + elif model_type == "gradient_boosted_trees": + classifier = GBTClassifier( + **{ + key: val + for key, val in params.items() + if key not in ["threshold", "threshold_ratio"] + }, + featuresCol=features_vector, + labelCol=dep_var, + seed=2133, + ) + post_transformer = ( + hlink.linking.transformers.rename_prob_column.RenameProbColumn() + ) + + else: + raise ValueError( + "Model type not recognized! Please check your config, reload, and try again." + ) + return classifier, post_transformer diff --git a/hlink/linking/core/column_mapping.py b/hlink/linking/core/column_mapping.py new file mode 100755 index 0000000..bd326e2 --- /dev/null +++ b/hlink/linking/core/column_mapping.py @@ -0,0 +1,46 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.functions import * +import hlink.linking.core.transforms as transforms_core + + +def select_column_mapping(column_mapping, df_selected, is_a, column_selects): + name = column_mapping["column_name"] + if "override_column_a" in column_mapping and is_a: + override_name = column_mapping["override_column_a"] + column_select = col(override_name) + if "override_transforms" in column_mapping: + for transform in column_mapping["override_transforms"]: + column_select = transforms_core.apply_transform( + column_select, transform, is_a + ) + elif "override_column_b" in column_mapping and not is_a: + override_name = column_mapping["override_column_b"] + column_select = col(override_name) + if "override_transforms" in column_mapping: + for transform in column_mapping["override_transforms"]: + column_select = transforms_core.apply_transform( + column_select, transform, is_a + ) + elif "set_value_column_a" in column_mapping and is_a: + value_to_set = column_mapping["set_value_column_a"] + column_select = lit(value_to_set) + elif "set_value_column_b" in column_mapping and not is_a: + value_to_set = column_mapping["set_value_column_b"] + column_select = lit(value_to_set) + elif "transforms" in column_mapping: + column_select = col(name) + for transform in column_mapping["transforms"]: + column_select = transforms_core.apply_transform( + column_select, transform, is_a + ) + else: + column_select = col(name) + + alias = column_mapping["alias"] if "alias" in column_mapping else name + + column_selects.append(alias) + return df_selected.withColumn(alias, column_select), column_selects diff --git a/hlink/linking/core/comparison.py b/hlink/linking/core/comparison.py new file mode 100755 index 0000000..d07cc31 --- /dev/null +++ b/hlink/linking/core/comparison.py @@ -0,0 +1,90 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.linking.core.comparison_feature as comparison_feature_core + + +def get_comparison_leaves(comp): + comp_leaves = [] + + def __get_comp_leaf(comp, comp_leaves): + + if "comp_a" in comp: + __get_comp_leaf(comp["comp_a"], comp_leaves) + __get_comp_leaf(comp["comp_b"], comp_leaves) + + else: + comp_leaves.append(comp) + + if "comp_a" in comp: + __get_comp_leaf(comp["comp_a"], comp_leaves) + __get_comp_leaf(comp["comp_b"], comp_leaves) + + elif "secondary" in comp: + __get_comp_leaf(comp["threshold_a"], comp_leaves) + __get_comp_leaf(comp["threshold_b"], comp_leaves) + + else: + __get_comp_leaf(comp, comp_leaves) + + return comp_leaves + + +def generate_comparisons(comp, features, id_col): + """Creates the comparison SQL clause given a comparison and a list of comparison features. + + Parameters + ---------- + comp: dictionary + the config dictionary containing the comparison definition + features: dictionary + the config dictionary containing the comparison features + id_col: string + the id column + + Returns + ------- + A string of the sql clause to be used for comparison + filtering after blocking. 
+ """ + if comp != {}: + if "comp_a" in comp: + comp_a_clause = generate_comparisons(comp["comp_a"], features, id_col) + comp_b_clause = generate_comparisons(comp["comp_b"], features, id_col) + if comp["operator"] == "AND": + return f""" + ({comp_a_clause} AND {comp_b_clause}) + """ + elif comp["operator"] == "OR": + return f""" + ({comp_a_clause} OR {comp_b_clause}) + """ + elif "secondary" in comp: + comp_a = comp["threshold_a"] + comp_a_clause = f"{comp_a['feature_name']} >= {comp_a['threshold']}" + comp_b = comp["threshold_b"] + comp_b_clause = f"{comp_b['feature_name']} >= {comp_b['threshold']}" + if comp["operator"] == "AND": + return f"({comp_a_clause} AND {comp_b_clause})" + + else: + if "column_name" in comp: + col = comp["column_name"] + else: + col = comparison_feature_core.generate_comparison_feature( + [f for f in features if f["alias"] == comp["feature_name"]][0], + id_col, + ) + if "comparison_type" in comp: + comp_type = comp["comparison_type"] + if comp_type == "threshold": + if comp.get("threshold_expr", False): + return f"{col} {comp['threshold_expr']}" + else: + return f"{col} >= {comp['threshold']}" + else: + return f"{col}" + else: + return "" diff --git a/hlink/linking/core/comparison_feature.py b/hlink/linking/core/comparison_feature.py new file mode 100755 index 0000000..c059a1c --- /dev/null +++ b/hlink/linking/core/comparison_feature.py @@ -0,0 +1,547 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import itertools + + +def create_feature_tables( + link_task, t_ctx_def, advanced_comp_features, hh_comp_features, id_col, table_name +): + """Creates the table which contains all the comparison features from a + table that has all the column and feature selections. + + Parameters + ---------- + link_task: LinkTask + the link task that is currently being ran + t_ctx_def: dictionary + the dictionary of values to pass to the sql templates + advanced_comp_features: list + a list of the "advanced" features which + require potential match aggregation + id_col: string + the id column + table_name: string + the name of the output table + + Returns + ------- + The output table with the features. 
+ """ + + has_adv_comp_features = len(advanced_comp_features) > 0 + has_hh_comp_features = len(hh_comp_features) > 0 + gen_comp_feat = has_adv_comp_features or has_hh_comp_features + config = link_task.link_run.config + + tmp_training_features = link_task.run_register_sql( + name=f"tmp_{table_name}", + template="potential_matches_base_features", + t_ctx=t_ctx_def, + persist=gen_comp_feat, + ) + + if has_adv_comp_features and not has_hh_comp_features: + return link_task.run_register_sql( + table_name, + template="aggregate_features", + t_ctx={ + "id": config["id_column"], + "potential_matches": f"tmp_{table_name}", + "advanced_comp_features": advanced_comp_features, + }, + persist=True, + ) + + elif has_hh_comp_features and not has_adv_comp_features: + return link_task.run_register_sql( + table_name, + template="hh_aggregate_features", + t_ctx={ + "id": config["id_column"], + "hh_col": config[f"{link_task.training_conf}"].get("hh_col", "serialp"), + "potential_matches": f"tmp_{table_name}", + "hh_comp_features": hh_comp_features, + }, + persist=True, + ) + + elif has_adv_comp_features and has_hh_comp_features: + af = link_task.run_register_sql( + table_name, + template="aggregate_features", + t_ctx={ + "id": config["id_column"], + "potential_matches": f"tmp_{table_name}", + "advanced_comp_features": advanced_comp_features, + }, + ) + return link_task.run_register_sql( + af, + template="hh_aggregate_features", + t_ctx={ + "id": config["id_column"], + "potential_matches": f"tmp_{table_name}", + "hh_comp_features": hh_comp_features, + }, + persist=True, + ) + + else: + return link_task.run_register_python( + table_name, lambda: tmp_training_features, persist=True + ) + + +def get_features(config, independent_features, pregen_features=[]): + """Splits apart the comparison features into comp_features, + advanced_features, and dist_features. + + Parameters + ---------- + config: dictionary + the base configuration dictionary + independent_features: list + a list of the comparison features to split apart + pregen_features: dictionary + a list of features that have been pregenerated and should be skipped + + Returns + ------- + A 3-tuple of the standard comparison features, + the advanced comparison features, and the distance features. + """ + + aggregate_features = [ + "hits", + "hits2", + "exact_mult", + "exact_all_mult", + "exact_all_mult2", + ] + + hh_aggregate_features = ["jw_max_a", "jw_max_b"] + + all_comp_features = config["comparison_features"] + advanced_comp_features = [ + f for f in all_comp_features if f["alias"] in aggregate_features + ] + [f for f in independent_features if f in aggregate_features] + hh_comp_features = [ + f for f in all_comp_features if f["alias"] in hh_aggregate_features + ] + [f for f in independent_features if f in hh_aggregate_features] + derived_comp_features = [ + f + for f in all_comp_features + if f["alias"] not in aggregate_features + and f["alias"] not in pregen_features + and f["alias"] not in hh_aggregate_features + ] + derived_aliases = [f["alias"] for f in derived_comp_features] + + dist_features = [ + c + for c in all_comp_features + if c["comparison_type"] == "geo_distance" and c["alias"] not in pregen_features + ] + + if len({"exact_mult"} & set(advanced_comp_features)) > 0 and ( + ("exact" not in derived_aliases) + ): + raise KeyError( + 'In order to calculate "exact_mult", "exact" needs to be added to the list of comparison features in your configuration.' 
+ ) + + if len( + set(["exact_all_mult", "exact_all_mult2"]) & set(advanced_comp_features) + ) > 0 and (("exact_all" not in derived_aliases)): + raise KeyError( + 'In order to calculate "exact_all_mult", or "exact_all_mult2", "exact_all" needs to be added to the list of comparison features in your configuration.' + ) + + comp_features = ",\n ".join( + [ + generate_comparison_feature(f, config["id_column"], include_as=True) + for f in derived_comp_features + ] + ) + return comp_features, advanced_comp_features, hh_comp_features, dist_features + + +def generate_comparison_feature(feature, id_col, include_as=False): + """Returns an SQL expression for a given feature. + + Parameters + ---------- + feature: dictionary + a comparison feature from the config + id_col: string + the id column + include_as: boolean + if true, then the expression will include "as {alias}", + where `alias` is the alias of the given feature + + Returns + ------- + A string containing the sql expression. + """ + comp_type = feature["comparison_type"] + + if comp_type == "sql_condition": + expr = feature["condition"] + + elif comp_type == "maximum_jaro_winkler": + columns = feature["column_names"] + comps = ", ".join( + [ + f"jw(nvl(a.{col1}, ''), nvl(b.{col2}, ''))" + for col1, col2 in itertools.product(columns, columns) + ] + ) + expr = f"GREATEST({comps})" + + elif comp_type == "jaro_winkler": + col = feature["column_name"] + expr = f"jw(nvl(a.{col}, ''), nvl(b.{col}, ''))" + + elif comp_type == "jaro_winkler_street": + col = feature["column_name"] + boundary_col = feature["boundary"] + expr = f"IF(a.{boundary_col} = b.{boundary_col}, jw(nvl(a.{col},''), nvl(b.{col}, '')), 0)" + + elif comp_type == "max_jaro_winkler": + col = feature["column_name"] + expr = f"jw_max(a.{col}, b.{col})" + + elif comp_type == "equals": + col = feature["column_name"] + expr = f"a.{col} IS NOT DISTINCT FROM b.{col}" + + elif comp_type == "f1_match": + fi = feature["first_init_col"] + mi0 = feature["mid_init_cols"][0] + mi1 = feature["mid_init_cols"][1] + expr = ( + f"CASE WHEN (" + f"(a.{fi} IS NOT DISTINCT FROM b.{fi}) OR " + f"(a.{fi} IS NOT DISTINCT FROM b.{mi0}) OR " + f"(a.{fi} IS NOT DISTINCT FROM b.{mi1})" + f") THEN 1 ELSE 2 END" + ) + + elif comp_type == "f2_match": + fi = feature["first_init_col"] + mi0 = feature["mid_init_cols"][0] + mi1 = feature["mid_init_cols"][1] + expr = ( + f"CASE WHEN ((a.{mi0} == '') OR (a.{mi0} IS NULL)) THEN 0 WHEN (" + f"(a.{mi0} IS NOT DISTINCT FROM b.{fi}) OR " + f"((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{fi})) OR " + f"(a.{mi0} IS NOT DISTINCT FROM b.{mi0}) OR " + f"(a.{mi0} IS NOT DISTINCT FROM b.{mi1}) OR " + f"((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi0})) OR " + f"((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi1}))" + f") THEN 1 ELSE 2 END" + ) + + elif comp_type == "not_equals": + col = feature["column_name"] + expr = f"a.{col} IS DISTINCT FROM b.{col}" + + elif comp_type == "equals_as_int": + col = feature["column_name"] + expr = f"CAST(a.{col} = b.{col} as INT)" + + elif comp_type == "all_equals": + cols = feature["column_names"] + all_equals = " AND ".join([f"a.{col} = b.{col}" for col in cols]) + expr = f"{all_equals}" + + elif comp_type == "not_zero_and_not_equals": + col = feature["column_name"] + expr = f"a.{col} is not null and b.{col} is not null and a.{col} != 0 AND b.{col} != 0 and a.{col} IS DISTINCT FROM b.{col}" + + elif comp_type == "or": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = 
generate_comparison_feature(feature["comp_b"], id_col) + if "comp_d" in feature: + expr_d = generate_comparison_feature(feature["comp_d"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = f"{expr_a} OR {expr_b} OR {expr_c} OR {expr_d}" + elif "comp_c" in feature: + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = f"{expr_a} OR {expr_b} OR {expr_c}" + else: + expr = f"{expr_a} OR {expr_b}" + + elif comp_type == "and": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + if "comp_d" in feature: + expr_d = generate_comparison_feature(feature["comp_d"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = f"{expr_a} AND {expr_b} AND {expr_c} AND {expr_d}" + elif "comp_c" in feature: + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = f"{expr_a} AND {expr_b} AND {expr_c}" + else: + expr = f"{expr_a} AND {expr_b}" + + elif comp_type == "times": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + expr = f"CAST({expr_a} as float) * CAST({expr_b} as float)" + + elif comp_type == "caution_comp_3": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = f"({expr_a} OR {expr_b}) AND {expr_c}" + + elif comp_type == "caution_comp_4": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr_d = generate_comparison_feature(feature["comp_d"], id_col) + expr = f"({expr_a} OR {expr_b} OR {expr_c}) AND {expr_d}" + + elif comp_type == "caution_comp_3_012": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr = ( + f"CASE WHEN CAST({expr_c} as string) == 'false' THEN 2 " + f"WHEN ({expr_a} OR {expr_b}) AND {expr_c} THEN 1 " + f"ELSE 0 END" + ) + + elif comp_type == "caution_comp_4_012": + expr_a = generate_comparison_feature(feature["comp_a"], id_col) + expr_b = generate_comparison_feature(feature["comp_b"], id_col) + expr_c = generate_comparison_feature(feature["comp_c"], id_col) + expr_d = generate_comparison_feature(feature["comp_d"], id_col) + expr = ( + f"CASE WHEN CAST({expr_d} as string) == 'false' THEN 2 " + f"WHEN (({expr_a}) OR ({expr_b}) OR ({expr_c})) AND ({expr_d}) THEN 1 " + f"ELSE 0 END" + ) + + elif comp_type == "any_equals": + col1, col2 = feature["column_names"] + expr = f""" + ( + ( a.{col1} = b.{col1} OR a.{col1} = b.{col2} ) + AND + nvl(a.{col1}, '') != '' + ) + OR + ( + ( a.{col2} = b.{col1}) + AND + nvl(a.{col2}, '') != '' + ) + """ + + elif comp_type == "either_are_1": + col = feature["column_name"] + expr = f"(a.{col} = 1 OR b.{col} = 1)" + + elif comp_type == "either_are_0": + col = feature["column_name"] + expr = f"(a.{col} = 0 OR b.{col} = 0)" + + elif comp_type == "second_gen_imm": + col = feature["column_name"] + expr = f"(a.{col} = 2 OR a.{col} = 3 OR a.{col} = 4)" + + elif comp_type == "rel_jaro_winkler": + col = feature["column_name"] + if "jw_threshold" in feature: + jw_threshold = feature["jw_threshold"] + else: + jw_threshold = 0.8 + print( + f"WARNING: No 
jw_threshold defined; Setting jw_threshold for rel_jaro_winkler comparison feature for {col} to {jw_threshold}" + ) + if feature.get("age_threshold", False): + age_threshold = feature["age_threshold"] + else: + age_threshold = 5 + print( + f"WARNING: No age_threshold defined; Setting age_threshold for rel_jaro_winkler comparison feature for {col} to {age_threshold}" + ) + histid = feature.get("histid_col", "histid") + name = feature.get("name_col", "namefrst_std") + byr = feature.get("birthyr_col", "birthyr") + sex = feature.get("sex_col", "sex") + expr = f"rel_jw(a.{col}, b.{col}, string({jw_threshold}), string({age_threshold}), map('name','{name}','byr','{byr}','sex','{sex}'))" + + elif comp_type == "extra_children": + col = feature["column_name"] + if "jw_threshold" in feature: + jw_threshold = feature["jw_threshold"] + else: + jw_threshold = 0.8 + print( + f"WARNING: No jw_threshold defined; Setting jw_threshold for extra_children comparison feature for {col} to {jw_threshold}" + ) + if feature.get("age_threshold", False): + age_threshold = feature["age_threshold"] + else: + age_threshold = 5 + print( + f"WARNING: No age_threshold defined; Setting age_threshold for extra_children comparison feature for {col} to {age_threshold}" + ) + year_b = feature.get("year_b", "year_b") + relate = feature.get("relate_col", "relate") + histid = id_col if id_col is not None else "histid" + name = feature.get("name_col", "namefrst_std") + byr = feature.get("birthyr_col", "birthyr") + sex = feature.get("sex_col", "sex") + expr = f"extra_children(a.{col}, b.{col}, string({year_b}), a.{relate}, b.{relate}, string({jw_threshold}), string({age_threshold}), map('histid', '{histid}', 'name','{name}','byr','{byr}','sex','{sex}'))" + + elif comp_type == "jaro_winkler_rate": + col = feature["column_name"] + if "jw_threshold" in feature: + jw_threshold = feature["jw_threshold"] + else: + jw_threshold = 0.8 + print( + f"WARNING: No jw_threshold defined; Setting jw_threshold for jaro_winkler_rate comparison feature for {col} to {jw_threshold}" + ) + expr = f"jw_rate(a.{col}, b.{col}, string({jw_threshold}))" + + elif comp_type == "sum": + col = feature["column_name"] + expr = f"a.{col} + b.{col}" + + elif comp_type == "hh_compare_rate": + col = feature["column_name"] + expr = f"hh_compare_rate(a.{col}, b.{col})" + + elif comp_type == "length_b": + col = feature["column_name"] + expr = f"size(b.{col})" + + elif comp_type == "abs_diff": + col = feature["column_name"] + ne = feature.get("not_equals", False) + if ne: + expr = f"CASE WHEN CAST(b.{col} as INT) != {ne} AND CAST(a.{col} as INT) != {ne} THEN abs(CAST(b.{col} as INT) - CAST(a.{col} as INT)) ELSE -1 END" + else: + expr = f"abs(CAST(b.{col} as INT) - CAST(a.{col} as INT))" + + elif comp_type == "b_minus_a": + col = feature["column_name"] + ne = feature.get("not_equals", False) + if ne: + expr = f"CASE WHEN CAST(b.{col} as INT) != {ne} AND CAST(a.{col} as INT) != {ne} THEN CAST(b.{col} as INT) - CAST(a.{col} as INT) ELSE -1 END" + else: + expr = f"CAST(b.{col} as INT) - CAST(a.{col} as INT)" + + elif comp_type == "has_matching_element": + col = feature["column_name"] + expr = f"has_matching_element(a.{col}, b.{col})" + + elif comp_type == "geo_distance": + distance_col = feature["distance_col"] + dt = feature["table_name"] + st = feature.get("secondary_table_name", False) + if st: + st_distance_col = feature["secondary_distance_col"] + expr = f"IF({dt}.{distance_col} IS NOT NULL, {dt}.{distance_col}, {st}.{st_distance_col})" + else: + expr = 
f"{dt}.{distance_col}" + + elif comp_type == "fetch_a": + col = feature["column_name"] + expr = f"a.{col}" + + elif comp_type == "fetch_b": + col = feature["column_name"] + expr = f"b.{col}" + + elif comp_type == "fetch_td": + col = feature["column_name"] + expr = f"pm.{col}" + + elif comp_type == "new_marr": + col = feature["column_name"] + if "upper_threshold" not in feature: + feature["upper_threshold"] = 10 + print( + f"WARNING: No upper_threshold defined; Setting upper_threshold for new_marr comparison feature for {col} to 10" + ) + expr = f"CAST(b.{col} as INT)" + + elif comp_type == "existing_marr": + col = feature["column_name"] + if "lower_threshold" not in feature: + feature["lower_threshold"] = 10 + print( + f"WARNING: No lower_threshold defined; Setting lower_threshold for existing_marr comparison feature for {col} to 10" + ) + expr = f"CAST(b.{col} as INT)" + + elif comp_type == "parent_step_change": + col = feature["column_name"] + expr = f"(CAST(a.{col} as INT) > 0) IS DISTINCT FROM (CAST(b.{col} as INT) > 0)" + + elif comp_type == "present_both_years": + col = feature["column_name"] + expr = f"a.{col} IS NOT NULL AND a.{col} > 0 AND b.{col} IS NOT NULL AND b.{col} > 0" + + elif comp_type == "neither_are_null": + col = feature["column_name"] + expr = f"a.{col} IS NOT NULL AND b.{col} IS NOT NULL and a.{col} != '' and b.{col} != ''" + + elif comp_type == "present_and_matching_categorical": + col = feature["column_name"] + expr = f"IF(a.{col} IS NOT NULL AND b.{col} IS NOT NULL AND CAST(a.{col} as STRING) != '' and CAST(b.{col} as STRING) != '', IF(a.{col} IS DISTINCT FROM b.{col}, 1, 0), 2)" + + elif comp_type == "present_and_equal_categorical_in_universe": + col = feature["column_name"] + niu = feature["NIU"] + expr = f"IF(a.{col} IS NOT NULL AND b.{col} IS NOT NULL AND a.{col} != {niu} AND b.{col} != {niu} AND CAST(a.{col} as string) != '' and CAST(b.{col} as string) != '', IF(a.{col} IS DISTINCT FROM b.{col}, 0, 1), 0)" + + elif comp_type == "present_and_not_equal": + col = feature["column_name"] + expr = f"IF(a.{col} IS NOT NULL AND b.{col} IS NOT NULL AND cast(a.{col} as string) != '' and cast(b.{col} as string) != '' and a.{col} > 0, IF(a.{col} IS DISTINCT FROM b.{col}, TRUE, FALSE), FALSE)" + + else: + raise ValueError(f"No comparison type: {feature['comparison_type']}") + + if feature.get("power", False): + exponent = feature["power"] + expr = f"POWER(CAST({expr} as INT), {exponent})" + + if feature.get("threshold", False): + threshold = feature["threshold"] + expr = f"{expr} IS NOT NULL and {expr} >= {threshold}" + elif feature.get("lower_threshold", False): + lower_threshold = feature["lower_threshold"] + expr = f"{expr} IS NOT NULL and {expr} >= {lower_threshold}" + elif feature.get("upper_threshold", False): + upper_threshold = feature["upper_threshold"] + expr = f"{expr} IS NOT NULL and {expr} <= {upper_threshold}" + elif feature.get("gt_threshold", False): + gt_threshold = feature["gt_threshold"] + expr = f"{expr} IS NOT NULL and {expr} > {gt_threshold}" + elif feature.get("btwn_threshold", False): + bt0 = feature["btwn_threshold"][0] + bt1 = feature["btwn_threshold"][1] + expr = f"{expr} IS NOT NULL and {expr} >= {bt0} and {expr} <= {bt1}" + + if feature.get("look_at_addl_var", False): + addl_var = feature["addl_var"] + check_val_expr = feature["check_val_expr"] + else_val = feature["else_val"] + datasource = feature["datasource"] + expr = f"CASE WHEN {datasource}.{addl_var} {check_val_expr} then {expr} else {else_val} END" + + if include_as: + full_expr = 
f"({expr})" + f" as {feature['alias']}" + else: + full_expr = expr + + return full_expr diff --git a/hlink/linking/core/dist_table.py b/hlink/linking/core/dist_table.py new file mode 100644 index 0000000..08be40d --- /dev/null +++ b/hlink/linking/core/dist_table.py @@ -0,0 +1,118 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +def register_dist_tables_and_create_sql(link_task, dist_features): + """Given a list of distance table comparison features, + registers the required distance tables and returns + the distance table join sql statements. + + Parameters + ---------- + link_task: LinkTask + the LinkTask used to register the distance tables + dist_features: list + a list of comparison features that use distance tables + + Returns + ------- + A list of the sql join clauses to be used to join in the + distance tables when creating the comparison features. + """ + tables_loaded = [] + join_clauses = [] + for feature in dist_features: + st = feature.get("secondary_table_name", False) + dt = feature["table_name"] + if dt not in tables_loaded: + link_task.run_register_python( + f"{dt}", + lambda: link_task.spark.read.csv( + f"{feature['distances_file']}", header=True, inferSchema=True + ), + persist=True, + ) + tables_loaded.append(dt) + if st: + if st not in tables_loaded: + link_task.run_register_python( + f"{st}", + lambda: link_task.spark.read.csv( + f"{feature['secondary_distances_file']}", + header=True, + inferSchema=True, + ), + persist=True, + ) + tables_loaded.append(st) + if feature["key_count"] == 1: + join_clause = __key_count_1( + dt, feature["column_name"], feature["loc_a"], feature["loc_b"] + ) + if join_clause not in join_clauses: + join_clauses.append(join_clause) + elif feature["key_count"] == 2: + join_clause = __key_count_2( + dt, + feature["source_column_a"], + feature["source_column_b"], + feature["loc_a_0"], + feature["loc_a_1"], + feature["loc_b_0"], + feature["loc_b_1"], + ) + + if join_clause not in join_clauses: + join_clauses.append(join_clause) + if st: + if feature["secondary_key_count"] == 1: + join_clause = __key_count_1( + st, + feature["secondary_source_column"], + feature["secondary_loc_a"], + feature["secondary_loc_b"], + ) + + if join_clause not in join_clauses: + join_clauses.append(join_clause) + elif feature["secondary_key_count"] == 2: + join_clause = ( + st, + feature["secondary_source_column_a"], + feature["secondary_source_column_b"], + feature["secondary_loc_a_0"], + feature["secondary_loc_a_1"], + feature["secondary_loc_b_0"], + feature["secondary_loc_b_1"], + ) + if join_clause not in join_clauses: + join_clauses.append(join_clause) + return join_clauses, tables_loaded + + +def __key_count_1(table, column, loc_a, loc_b): + join_clause = ( + f"LEFT JOIN {table} " + f"ON a.{column} = {table}.{loc_a} " + f"AND b.{column} = {table}.{loc_b}" + ) + return join_clause + + +def __key_count_2(table, column_a, column_b, loc_a_0, loc_a_1, loc_b_0, loc_b_1): + join_clause = ( + f"LEFT JOIN {table} " + f"ON a.{column_a} = {table}.{loc_a_0} " + f"AND a.{column_b} = {table}.{loc_b_0} " + f"AND b.{column_a} = {table}.{loc_a_1} " + f"AND b.{column_b} = {table}.{loc_b_1}" + ) + return join_clause + + +def get_broadcast_hint(tables_loaded): + tables = ", ".join(tables_loaded) + broadcast_hints = f"/*+ BROADCAST({tables}) */" + return broadcast_hints diff --git a/hlink/linking/core/pipeline.py 
b/hlink/linking/core/pipeline.py new file mode 100644 index 0000000..e2a8268 --- /dev/null +++ b/hlink/linking/core/pipeline.py @@ -0,0 +1,211 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml.feature import ( + Imputer, + StandardScaler, + OneHotEncoderEstimator, + VectorAssembler, + Bucketizer, +) +import hlink.linking.transformers.interaction_transformer +import hlink.linking.transformers.float_cast_transformer + + +def generate_pipeline_stages(conf, ind_vars, tf, tconf): + """Creates a Spark ML pipeline from the pipeline features. + Parameters + ---------- + conf: dictionary + the base configuration dictionary + ind_vars: list + a list of columns that are going to be used as independent_vars + tf: DataFrame + a Spark DataFrame for the "training_features" table + tconf: string + the name of the training section of the configuration. + Can either be "training" or "hh_training". + + Returns + ------- + A Spark ML pipeline object. + + """ + pipeline_stages = [] + tf_cols = tf.columns + all_cols = ind_vars + tf_cols + + # Get the input columns that will be used as inputs for pipeline features + pipeline_feature_input_cols = get_pipeline_feature_input_cols( + ind_vars, conf.get("pipeline_features") + ) + pipeline_input_cols = ind_vars + pipeline_feature_input_cols + cols_to_pass = list( + (set(tf_cols) & set(ind_vars)) + | (set(tf_cols) & set(pipeline_feature_input_cols)) + ) + col_names_dict = dict(zip(all_cols, all_cols)) + + ( + categorical_comparison_features, + categorical_pipeline_features, + ) = _calc_categorical_features( + ind_vars, + cols_to_pass, + conf["comparison_features"], + conf.get("pipeline_features"), + ) + + # Cast table columns to float + dep_var = str(conf[tconf]["dependent_var"]) + # id_b = conf["id_column"] + "_b" + # id_a = conf["id_column"] + "_a" + # cols_to_float = list(set(tf_cols) - {id_a, id_b, dep_var}) + float_cast_transformer = ( + hlink.linking.transformers.float_cast_transformer.FloatCastTransformer( + inputCols=cols_to_pass + ) + ) + pipeline_stages.append(float_cast_transformer) + + # Impute null values for remaining non-null columns + features_to_impute = [] + for x in cols_to_pass: + if x in categorical_comparison_features or x == dep_var or "id" in x: + continue + else: + features_to_impute.append(x) + + imputed_output_features = [x + "_imp" for x in features_to_impute] + imputer = Imputer( + inputCols=features_to_impute, + strategy="mean", + outputCols=imputed_output_features, + ) + pipeline_stages.append(imputer) + + for x in features_to_impute: + if x in col_names_dict.keys(): + col_names_dict[x] = x + "_imp" + # feature_names = list((set(ind_vars) - set(features_to_impute)) | set(output_feature_names)) + + if len(categorical_comparison_features) > 0: + encoded_output_cols = [ + x + "_onehotencoded" for x in categorical_comparison_features + ] + encoder = OneHotEncoderEstimator( + inputCols=categorical_comparison_features, + outputCols=encoded_output_cols, + handleInvalid="keep", + dropLast=False, + ) + # feature_names = list((set(feature_names) - set(categorical_comparison_features)) | set(encoded_output_cols)) + for x in categorical_comparison_features: + if x in col_names_dict.keys(): + col_names_dict[x] = x + "_onehotencoded" + pipeline_stages.append(encoder) + + if "pipeline_features" in conf: + for x in conf["pipeline_features"]: + if x["output_column"] in 
pipeline_input_cols: + if x["transformer_type"] == "bucketizer": + splits = x["splits"] + if x["input_column"] in col_names_dict.keys(): + input_col = col_names_dict[x["input_column"]] + else: + input_col = x["input_column"] + bucketizer = Bucketizer( + splits=splits, inputCol=input_col, outputCol=x["output_column"] + ) + pipeline_stages.append(bucketizer) + + elif x["transformer_type"] == "interaction": + input_cols = [] + for key in x["input_columns"]: + if key in col_names_dict.keys(): + input_cols.append(col_names_dict[key]) + else: + input_cols.append(key) + interaction_transformer = hlink.linking.transformers.interaction_transformer.InteractionTransformer( + inputCols=input_cols, outputCol=x["output_column"] + ) + pipeline_stages.append(interaction_transformer) + else: + continue + + if len(categorical_pipeline_features) > 0: + encoded_output_cols = [ + x + "_onehotencoded" for x in categorical_pipeline_features + ] + encoder = OneHotEncoderEstimator( + inputCols=categorical_pipeline_features, + outputCols=encoded_output_cols, + handleInvalid="keep", + dropLast=False, + ) + # feature_names = list((set(feature_names) - set(categorical_pipeline_features)) | set(encoded_output_cols)) + for x in categorical_pipeline_features: + if x in col_names_dict.keys(): + col_names_dict[x] = x + "_onehotencoded" + pipeline_stages.append(encoder) + + vec_cols = [] + for col in ind_vars: + if col in col_names_dict.keys(): + vec_cols.append(col_names_dict[col]) + else: + vec_cols.append(col) + + scale_data = conf[tconf].get("scale_data", False) + output_col = "features_vector_prelim" if scale_data else "features_vector" + vecAssembler = VectorAssembler(inputCols=vec_cols, outputCol=output_col) + pipeline_stages.append(vecAssembler) + if scale_data: + scaler = StandardScaler( + inputCol="features_vector_prelim", outputCol="features_vector" + ) + pipeline_stages.append(scaler) + return pipeline_stages + + +def _calc_categorical_features( + ind_vars, cols_to_pass, comparison_features, pipeline_features, for_hh=False +): + categorical_comparison_features = [] + categorical_pipeline_features = [] + cols = set(cols_to_pass + ind_vars) + + # Check for categorical features in all comparison features + for x in comparison_features: + if x["alias"] in cols: + if "categorical" in x.keys(): + categorical_comparison_features.append(x["alias"]) + else: + continue + + # Check for categorical features in the pipeline-generated features (if exist) + + if pipeline_features is not None: + for pipeline_feature in pipeline_features: + if pipeline_feature["output_column"] in cols and pipeline_feature.get( + "categorical", False + ): + categorical_pipeline_features.append(pipeline_feature["output_column"]) + + return categorical_comparison_features, categorical_pipeline_features + + +def get_pipeline_feature_input_cols(ind_vars, pipeline_features): + pipeline_feature_input_cols = [] + if pipeline_features is not None: + for pipeline_feature in pipeline_features: + if pipeline_feature["output_column"] in ind_vars: + if pipeline_feature.get("input_column", False): + pipeline_feature_input_cols.append(pipeline_feature["input_column"]) + else: + pipeline_feature_input_cols += pipeline_feature["input_columns"] + else: + continue + return pipeline_feature_input_cols diff --git a/hlink/linking/core/substitutions.py b/hlink/linking/core/substitutions.py new file mode 100644 index 0000000..fff8c0d --- /dev/null +++ b/hlink/linking/core/substitutions.py @@ -0,0 +1,97 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from collections import namedtuple +from pyspark.sql.functions import * + + +def generate_substitutions(spark, df_selected, substitution_columns): + for substitution_column in substitution_columns: + column_name = substitution_column["column_name"] + for substitution in substitution_column["substitutions"]: + if ( + "regex_word_replace" in substitution + and substitution["regex_word_replace"] + ): + df_selected = __apply_regex_substitution( + df_selected, column_name, substitution, spark.sparkContext + ) + elif "substitution_file" in substitution: + df_selected = __apply_substitution( + df_selected, column_name, substitution, spark.sparkContext + ) + else: + raise KeyError( + "You must supply a substitution file and either specify regex_word_replace=true or supply a join value." + ) + return df_selected + + +def __load_substitutions(file_name): + """Reads in the substitution file and returns a 2-tuple representing it. + + Parameters + ---------- + file_name: name of substitution file + + Returns + ------- + A 2-tuple where the first value is an array of values to be replaced and the second is an array of values to use + when replacing words in the first array. + """ + sub_froms = [] + sub_tos = [] + with open(file_name, mode="r", encoding="utf-8-sig") as f: + for line in f: + sub_to, sub_from = line.strip().lower().split(",") + sub_froms.append(sub_from) + sub_tos.append(sub_to) + return (sub_froms, sub_tos) + + +def __apply_substitution(df, column_name, substitution, sc): + """Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file.""" + substitution_file = substitution["substitution_file"] + join_value = substitution["join_value"] + join_column = substitution["join_column"] + join_column_alias = join_column + "_sub" + sub_froms, sub_tos = __load_substitutions(substitution_file) + subs = list(zip(sub_froms, sub_tos)) + Sub = namedtuple("Sub", ["sub_from", "sub_to"]) + sub_df = ( + sc.parallelize(subs, 1) + .map(lambda s: Sub(s[0], s[1])) + .toDF() + .withColumn(join_column_alias, lit(join_value)) + ) + join_statement = (sub_df["sub_from"] == split(df[column_name], " ")[0]) & ( + sub_df[join_column_alias] == df[join_column] + ) + df_sub = df.join(sub_df.hint("broadcast"), join_statement, "left_outer").drop( + join_column_alias + ) + df_sub_select = ( + when(df_sub["sub_to"].isNull(), df_sub[column_name]) + .otherwise(concat_ws(" ", df_sub["sub_to"], split(df_sub[column_name], " ")[1])) + .alias(column_name) + ) + df_sub_selects = list(set(df.columns) - set([column_name])) + [df_sub_select] + return df_sub.select(df_sub_selects) + + +def __apply_regex_substitution(df, column_name, substitution, sc): + """Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file.""" + + substitution_file = substitution["substitution_file"] + sub_froms, sub_tos = __load_substitutions(substitution_file) + subs = dict(zip(sub_froms, sub_tos)) + col = column_name + df = df.checkpoint() + + for sub_from, sub_to in subs.items(): + col = regexp_replace( + col, r"(?:(?<=\s)|(?<=^))(" + sub_from + r")(?:(?=\s)|(?=$))", sub_to + ) + return df.withColumn(column_name, col) diff --git a/hlink/linking/core/threshold.py b/hlink/linking/core/threshold.py new file mode 100644 index 0000000..4c193db --- /dev/null +++ 
b/hlink/linking/core/threshold.py @@ -0,0 +1,124 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.window import Window +from pyspark.sql.functions import rank, lead + + +def get_threshold_ratio(training_conf, model_conf, default=1.3): + """Gets the threshold ratio or default from the config using the correct precedence. + + Parameters + ---------- + training_conf: dictionary + the config dictionary containing the training conf + model_conf: dictionary + the config dictionary for a specific model + default: float + the default value to use if the threshold is missing + + Returns + ------- + The threshold ratio. + """ + if "threshold_ratio" in model_conf: + return model_conf["threshold_ratio"] + elif "threshold_ratio" in training_conf: + return training_conf["threshold_ratio"] + else: + return default + + +def predict_using_thresholds( + pred_df, alpha_threshold, threshold_ratio, training_conf, id_col +): + """Adds a prediction column to the given pred_df by applying thresholds. + + Parameters + ---------- + pred_df: DataFrame + a Spark DataFrame of potential matches with a probability column + alpha_threshold: float + the alpha threshold cutoff value. No record with a probability lower than this + value will be considered for prediction = 1. + threshold_ratio: float + the threshold ratio cutoff value. The ratio compares a match's probability + to the "a" record's next best probability value. + Only used with the "drop_duplicate_with_threshold_ratio" + configuration value. + training_conf: dictionary + the training config section + id_col: string + the id column + + Returns + ------- + A Spark DataFrame containing the "prediction" column as well as other intermediate columns generated to create the prediction. + """ + use_threshold_ratio = ( + training_conf.get("decision", "") == "drop_duplicate_with_threshold_ratio" + ) + + if use_threshold_ratio: + return _apply_threshold_ratio( + pred_df.drop("prediction"), alpha_threshold, threshold_ratio, id_col + ) + else: + return _apply_alpha_threshold( + pred_df.drop("prediction"), alpha_threshold, threshold_ratio + ) + + +def _apply_alpha_threshold(pred_df, alpha_threshold, threshold_ratio): + return pred_df.selectExpr( + "*", + f"case when probability >= {alpha_threshold} then 1 else 0 end as prediction", + ) + + +def _apply_threshold_ratio(df, alpha_threshold, threshold_ratio, id_col): + """Apply a decision threshold using the ratio of a match's probability to the next closest match's probability.""" + id_a = id_col + "_a" + id_b = id_col + "_b" + if "probability" not in df.columns: + raise NameError( + 'In order to calculate the threshold ratio based on probabilities, you need to have a "probability" column in your data.'
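+ # Worked example (threshold values assumed for illustration): with + # alpha_threshold = 0.5 and threshold_ratio = 1.3, an "a" record whose best + # match has probability 0.9 and second-best 0.6 gets ratio 0.9 / 0.6 = 1.5, + # which is > 1.3, so the best match gets prediction = 1. If the second-best + # were 0.8, the ratio 0.9 / 0.8 = 1.125 falls below 1.3 and prediction = 0. + # If the second-best fell below alpha_threshold, the ratio would be NULL and + # the best match would still be kept.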
+ ) + else: + windowSpec = Window.partitionBy(df[f"{id_a}"]).orderBy( + df["probability"].desc(), df[f"{id_b}"] + ) + prob_rank = rank().over(windowSpec) + prob_lead = lead(df["probability"], 1).over(windowSpec) + return ( + df.select( + df["*"], + prob_rank.alias("prob_rank"), + prob_lead.alias("second_best_prob"), + ) + .selectExpr( + "*", + f""" + IF( + second_best_prob IS NOT NULL + AND second_best_prob >= {alpha_threshold} + AND prob_rank == 1, + probability / second_best_prob, + NULL) + as ratio + """, + ) + .selectExpr( + "*", + f""" + CAST( + probability >= {alpha_threshold} + AND prob_rank == 1 + AND (ratio > {threshold_ratio} OR ratio is NULL) + as INT) as prediction + """, + ) + .drop("prob_rank") + ) diff --git a/hlink/linking/core/transforms.py b/hlink/linking/core/transforms.py new file mode 100755 index 0000000..48b8950 --- /dev/null +++ b/hlink/linking/core/transforms.py @@ -0,0 +1,498 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.functions import * +from pyspark.sql.types import * +from pyspark.ml import Pipeline +from pyspark.sql import Window +from pyspark.ml.feature import NGram, RegexTokenizer, CountVectorizer, MinHashLSH + + +def generate_transforms( + spark, df_selected, feature_selections, link_task, is_a, id_col +): + not_skipped_feature_selections = [ + c + for c in feature_selections + if ("skip" not in c or not (c["skip"])) + and ("post_agg_feature" not in c or not (c["post_agg_feature"])) + ] + post_agg_feature_selections = [ + c + for c in feature_selections + if ("post_agg_feature" in c) and c["post_agg_feature"] + ] + + def parse_feature_selections(df_selected, feature_selection, is_a): + transform = feature_selection["transform"] + + if not feature_selection.get("output_column", False): + feature_selection["output_column"] = feature_selection["output_col"] + + if "checkpoint" in feature_selection and feature_selection["checkpoint"]: + df_selected = df_selected.checkpoint() + + if "override_column_a" in feature_selection and is_a: + override_name = feature_selection["override_column_a"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], df_selected[override_name] + ) + return df_selected + + elif "override_column_b" in feature_selection and not is_a: + override_name = feature_selection["override_column_b"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], df_selected[override_name] + ) + return df_selected + + elif "set_value_column_a" in feature_selection and is_a: + set_value = feature_selection["set_value_column_a"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], lit(set_value) + ) + return df_selected + + elif "set_value_column_b" in feature_selection and not is_a: + set_value = feature_selection["set_value_column_b"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], lit(set_value) + ) + return df_selected + + elif transform == "bigrams": + input_col = feature_selection["input_column"] + output_col = feature_selection["output_column"] + intermediate_col = input_col + "_tokens" + unsorted_col = input_col + "_unsorted" + if ( + "no_first_pad" in feature_selection + and feature_selection["no_first_pad"] + ): + input_col_space = input_col + else: + input_col_space = input_col + "_space" + df_selected = df_selected.withColumn( + input_col_space, concat(lit(" 
"), input_col) + ) + tokenizer_a = RegexTokenizer( + pattern="", inputCol=input_col_space, outputCol=intermediate_col + ) + ngram_a = NGram(n=2, inputCol=intermediate_col, outputCol=output_col) + pipeline = Pipeline(stages=[tokenizer_a, ngram_a]) + df_selected = pipeline.fit(df_selected).transform(df_selected) + df_selected = df_selected.withColumn(unsorted_col, df_selected[output_col]) + df_selected = df_selected.withColumn( + output_col, sort_array(df_selected[unsorted_col]) + ) + return df_selected + + elif transform == "sql_condition": + cond = feature_selection["condition"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, expr(cond)) + return df_selected + + elif transform == "array": + col1, col2 = feature_selection["input_columns"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, array(col1, col2)) + return df_selected + + elif transform == "union": + col1, col2 = feature_selection["input_columns"] + output_col = feature_selection["output_column"] + + def union_list(list_a, list_b): + return list(set(list_a).union(set(list_b))) + + union_list_udf = udf(union_list, ArrayType(StringType())) + df_selected = df_selected.withColumn(output_col, union_list_udf(col1, col2)) + return df_selected + + elif transform == "hash": + input_col = feature_selection["input_column"] + count_col = feature_selection["output_column"] + "_count" + hash_array_col = feature_selection["output_column"] + df_selected = df_selected.where(f"size({input_col}) > 0") + count_vect = CountVectorizer(inputCol=input_col, outputCol=count_col) + lsh = MinHashLSH( + inputCol=count_col, + outputCol=hash_array_col, + numHashTables=feature_selection["number"], + seed=445123, + ) + # non_zero = udf(lambda v: v.numNonzeros() > 0, BooleanType()) + # hha_count_nonzero = hha_counts.where(non_zero(F.col("word_counts"))) + cv_model = count_vect.fit(df_selected) + df_transformed = cv_model.transform(df_selected) + lsh_model = lsh.fit(df_transformed) + df_selected = lsh_model.transform(df_transformed) + return df_selected + + elif transform == "soundex": + input_col = feature_selection["input_column"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, soundex(input_col)) + return df_selected + + elif transform == "neighbor_aggregate": + return df_selected + # df_selected.createOrReplaceTempView("prepped_df_tmp") + # link_task.run_register_sql("hh_nbor_rank", t_ctx=feature_selection) + # link_task.run_register_sql("hh_nbor", t_ctx=feature_selection) + # df_selected = link_task.run_register_sql( + # None, template="attach_neighbor_col", t_ctx=feature_selection + # ) + # spark.catalog.dropTempView("prepped_df_tmp") + # spark.catalog.dropTempView("hh_nbor") + # spark.catalog.dropTempView("hh_nbor_rank") + + elif transform == "attach_family_col": + return df_selected + + elif transform == "related_individuals": + df_selected.createOrReplaceTempView("prepped_df_tmp") + df_selected = link_task.run_register_sql( + None, + template="attach_related_col", + t_ctx={ + "output_col": feature_selection["output_col"], + "input_col": feature_selection["input_col"], + "prepped_df": "prepped_df_tmp", + "family_id": feature_selection["family_id"], + "relate_col": feature_selection["relate_col"], + "top_code": feature_selection["top_code"], + "bottom_code": feature_selection["bottom_code"], + "id": id_col, + }, + ) + spark.catalog.dropTempView("prepped_df_tmp") + return df_selected + + elif transform == 
"related_individual_rows": + return df_selected + # df_selected.createOrReplaceTempView("prepped_df_tmp") + # relate_filter = ( + # feature_selection["filter_b"] + # if (not (is_a) and "filter_b" in feature_selection) + # else None + # ) + # df_selected = link_task.run_register_sql( + # None, + # template="attach_related_cols_as_rows", + # t_ctx={ + # "output_col": feature_selection["output_col"], + # "input_cols": feature_selection["input_cols"], + # "prepped_df": "prepped_df_tmp", + # "family_id": feature_selection["family_id"], + # "relate_col": feature_selection["relate_col"], + # "top_code": feature_selection["top_code"], + # "bottom_code": feature_selection["bottom_code"], + # "id": id_col, + # "filter": relate_filter, + # }, + # ) + # spark.catalog.dropTempView("prepped_df_tmp") + + elif transform == "popularity": + input_cols = feature_selection.get("input_cols", False) + output_col = feature_selection["output_col"] + + # this should be a dictionary key:col_name, value:integer to be used for range + range_col = feature_selection.get("range_col", False) + range_val = feature_selection.get("range_val", False) + + if range_col and range_val: + if input_cols: + window = ( + Window.partitionBy([df_selected[col] for col in input_cols]) + .orderBy(df_selected[range_col]) + .rangeBetween(-range_val, range_val) + ) + else: + window = Window.orderBy(df_selected[range_col]).rangeBetween( + -range_val, range_val + ) + else: + window = Window.partitionBy([df_selected[col] for col in input_cols]) + + df_selected = df_selected.select( + df_selected["*"], count(lit(1)).over(window).alias(output_col) + ) + return df_selected + + elif transform == "power": + input_col = feature_selection["input_col"] + output_col = feature_selection["output_col"] + exponent = feature_selection["exponent"] + df_selected = df_selected.select( + "*", pow(df_selected[input_col], exponent).alias(output_col) + ) + return df_selected + + elif transform == "attach_variable": + input_col = feature_selection["input_column"] # join key in core data + output_col = feature_selection[ + "output_column" + ] # desired alias for the added variable + col_to_join_on = feature_selection["col_to_join_on"] # join key in csv data + col_to_add = feature_selection["col_to_add"] # column to add from csv data + region_dict = feature_selection["region_dict"] # path to csv data file + null_filler = feature_selection[ + "null_filler" + ] # value to replace null values + col_type = feature_selection["col_type"] + + df_selected.createOrReplaceTempView("prepped_df_tmp") + + # open up csv file + link_task.run_register_python( + name="region_data", + func=lambda: spark.read.csv(region_dict, header=True, inferSchema=True), + # persist=True, + ) + # self.spark.table("region_data").region.cast("int") + + # join the csv file to the dataframe (df_selected) + df_selected = link_task.run_register_sql( + None, + template="attach_variable", + t_ctx={ + "input_col": input_col, + "output_col": output_col, + "prepped_df": "prepped_df_tmp", + "col_to_join_on": col_to_join_on, + "col_to_add": col_to_add, + "region_data": "region_data", + }, + ) + df_selected = df_selected.fillna(null_filler, subset=[output_col]) + df_selected = df_selected.withColumn( + output_col, df_selected[output_col].cast(col_type) + ) + spark.catalog.dropTempView("prepped_df_tmp") + return df_selected + + else: + raise ValueError("Invalid transform type for {}".format(str(transform))) + + for feature_selection in not_skipped_feature_selections: + df_selected = 
parse_feature_selections(df_selected, feature_selection, is_a) + + def get_transforms(name, is_a): + to_process = [] + for f in not_skipped_feature_selections: + if ("override_column_a" in f) and is_a: + pass + elif ("override_column_b" in f) and not is_a: + pass + elif ("set_value_column_a" in f) and is_a: + pass + elif ("set_value_column_b" in f) and not is_a: + pass + elif f["transform"] == name: + to_process.append(f) + + return to_process + + hh_transforms = [ + get_transforms("attach_family_col", is_a), + get_transforms("related_individual_rows", is_a), + get_transforms("neighbor_aggregate", is_a), + ] + if any(hh_transforms): + attach_ts, related_ts, neighbor_ts = hh_transforms + if neighbor_ts: + group_by = [ + neighbor_ts[0]["sort_column"], + neighbor_ts[0]["neighborhood_column"], + ] + elif related_ts: + group_by = [related_ts[0]["family_id"]] + elif attach_ts: + group_by = [attach_ts[0]["family_id"]] + + df_grouped = df_selected.groupBy(*group_by).agg( + collect_list(struct("*")).alias("hh_rows") + ) + neighbor_selects = [] + if neighbor_ts: + for neighbor_t in neighbor_ts: + serial_column = neighbor_t["sort_column"] + window = f""" PARTITION BY {neighbor_t['neighborhood_column']} + ORDER BY {serial_column} + ROWS BETWEEN {neighbor_t['range']} PRECEDING + AND {neighbor_t['range']} FOLLOWING""" + output_column = neighbor_t["output_column"] + output_column_tmp = neighbor_t["output_column"] + "_tmp" + df_grouped = df_grouped.selectExpr( + "*", + f"collect_list(hh_rows_get_first_value(hh_rows, '{serial_column}', 'pernum', '{neighbor_t['input_column']}')) OVER ({window}) as {output_column_tmp}", + ) + df_grouped = df_grouped.selectExpr( + "*", + f"extract_neighbors({output_column_tmp}, {serial_column}) as {output_column}", + ).drop(f"{output_column_tmp}") + neighbor_selects.append(output_column) + if attach_ts: + attach_hh_column = spark._jvm.com.isrdi.udfs.AttachHHColumn() + attach_hh_column.createAttachUDF( + spark._jwrapped, df_grouped._jdf, attach_ts, "attach_hh_scala" + ) + all_cols_but_hh_rows = list(set(df_grouped.columns) - set(["hh_rows"])) + df_grouped_selects = all_cols_but_hh_rows + [ + "attach_hh_scala(hh_rows) as hh_rows" + ] + df_grouped = df_grouped.selectExpr(df_grouped_selects) + if related_ts: + attach_rel_rows = spark._jvm.com.isrdi.udfs.AttachRelatedRows() + a_or_b = "a" if is_a else "b" + attach_rel_rows.createAttachUDF( + spark._jwrapped, + df_grouped._jdf, + related_ts, + id_col, + a_or_b, + "attach_rel_scala", + ) + all_cols_but_hh_rows = list(set(df_grouped.columns) - set(["hh_rows"])) + df_grouped_selects = all_cols_but_hh_rows + [ + "attach_rel_scala(hh_rows) as hh_rows" + ] + df_grouped = df_grouped.selectExpr(df_grouped_selects) + explode_selects = neighbor_selects + ["explode(hh_rows) as tmp_row"] + tmp_row_selects = neighbor_selects + ["tmp_row.*"] + df_selected = df_grouped.selectExpr(*explode_selects).selectExpr( + *tmp_row_selects + ) + + for feature_selection in post_agg_feature_selections: + df_selected = parse_feature_selections(df_selected, feature_selection, is_a) + return df_selected + + +# These apply to the column mappings in the current config +def apply_transform(column_select, transform, is_a): + """Given a PySpark column expression, return a new column expression with the given transform applied. + column_select: a PySpark column + transform: the transform info from the current config + is_a: whether this is running on dataset 'a' (True) or 'b' (False) + + See the json_schema config file in config_schemas/config.json for definitions on each transform type. 
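+ + Example (illustrative; the column name is an assumption, not from a real config): + + apply_transform(df["namefrst"], {"type": "lowercase_strip"}, is_a=True) + + returns lower(trim(df["namefrst"])), so a raw value of " John " is prepared as + "john". Transforms suffixed _to_a or _to_b only apply to the matching dataset + and return the column unchanged for the other one.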
+ """ + transform_type = transform["type"] + if transform_type == "add_to_a": + if is_a: + return column_select + transform["value"] + else: + return column_select + if transform_type == "concat_to_a": + if is_a: + return concat(column_select, lit(transform["value"])) + else: + return column_select + elif transform_type == "concat_to_b": + if is_a: + return column_select + else: + return concat(column_select, lit(transform["value"])) + elif transform_type == "concat_two_cols": + return concat(column_select, transform["column_to_append"]) + elif transform_type == "lowercase_strip": + return lower(trim(column_select)) + elif transform_type == "rationalize_name_words": + return regexp_replace(column_select, r"[^a-z?'\*\-]+", " ") + elif transform_type == "remove_qmark_hyphen": + return regexp_replace(column_select, r"[?\*\-]+", "") + elif transform_type == "remove_punctuation": + return regexp_replace(column_select, r"[?\-\\\/\"\':,.\[\]\{\}]+", "") + elif transform_type == "replace_apostrophe": + return regexp_replace(column_select, r"'+", " ") + elif transform_type == "remove_alternate_names": + return regexp_replace(column_select, r"(\w+)( or \w+)+", "$1") + elif transform_type == "remove_suffixes": + suffixes = "|".join(transform["values"]) + suffix_regex = r"\b(?: " + suffixes + r")\s*$" + return regexp_replace(column_select, suffix_regex, "") + elif transform_type == "remove_stop_words": + words = "|".join(transform["values"]) + suffix_regex = r"\b(?:" + words + r")\b" + return regexp_replace(column_select, suffix_regex, "") + elif transform_type == "remove_prefixes": + prefixes = "|".join(transform["values"]) + prefix_regex = "^(" + prefixes + ") " + return regexp_replace(column_select, prefix_regex, "") + elif transform_type == "condense_prefixes": + prefixes = "|".join(transform["values"]) + prefix_regex = r"^(" + prefixes + ") " + return regexp_replace(column_select, prefix_regex, r"$1") + elif transform_type == "condense_strip_whitespace": + return regexp_replace(trim(column_select), r"\s\s+", " ") + elif transform_type == "remove_one_letter_names": + return regexp_replace(column_select, r"^((?:\w )+)(\w+)", r"$2") + elif transform_type == "split": + return split(column_select, " ") + elif transform_type == "length": + return length(column_select) + elif transform_type == "array_index": + return column_select[transform["value"]] + elif transform_type == "mapping": + mapped_column = column_select + if transform.get("values", False): + print( + "DEPRECATION WARNING: The 'mapping' transform no longer takes the 'values' parameter with a list of mappings in dictionaries; instead each mapping should be its own transform. Please change your config for future releases." 
+ ) + for mapping in transform["values"]: + from_regexp = "|".join(["^" + str(f) + "$" for f in mapping["from"]]) + mapped_column = regexp_replace( + mapped_column, from_regexp, str(mapping["to"]) + ) + else: + for key, value in transform["mappings"].items(): + from_regexp = "^" + str(key) + "$" + mapped_column = regexp_replace(mapped_column, from_regexp, str(value)) + if transform.get("output_type", False) == "int": + mapped_column = mapped_column.cast(LongType()) + return mapped_column + elif transform_type == "swap_words": + mapped_column = column_select + for swap_from, swap_to in transform["values"].items(): + mapped_column = regexp_replace( + mapped_column, + r"(?:(?<=\s)|(?<=^))(" + swap_from + r")(?:(?=\s)|(?=$))", + swap_to, + ) + return mapped_column + elif transform_type == "substring": + if len(transform["values"]) == 2: + sub_from = transform["values"][0] + sub_length = transform["values"][1] + return column_select.substr(sub_from, sub_length) + else: + raise ValueError( + f"Length of substr transform should be 2. You gave: {transform}" + ) + elif transform_type == "expand": + expand_length = transform["value"] + return array( + [column_select + i for i in range(-expand_length, expand_length + 1)] + ) + elif transform_type == "cast_as_int": + return column_select.cast("int") + elif transform_type == "divide_by_int": + divisor = transform["value"] + return column_select.cast("int") / divisor + elif transform_type == "when_value": + threshold = transform["value"] + if_value = transform["if_value"] + else_value = transform["else_value"] + return when(column_select.cast("int") == threshold, if_value).otherwise( + else_value + ) + elif transform_type == "get_floor": + return floor(column_select).cast("int") + else: + raise ValueError("Invalid transform type for {}".format(str(transform))) diff --git a/hlink/linking/hh_matching/__init__.py b/hlink/linking/hh_matching/__init__.py new file mode 100644 index 0000000..a1267a1 --- /dev/null +++ b/hlink/linking/hh_matching/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .hh_matching import HHMatching diff --git a/hlink/linking/hh_matching/hh_matching.py b/hlink/linking/hh_matching/hh_matching.py new file mode 100644 index 0000000..eb64625 --- /dev/null +++ b/hlink/linking/hh_matching/hh_matching.py @@ -0,0 +1,23 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask +from .link_step_block_on_households import LinkStepBlockOnHouseholds +from .link_step_filter import LinkStepFilter +from hlink.linking.matching.link_step_score import LinkStepScore + + +class HHMatching(LinkTask): + def get_steps(self): + return [ + LinkStepBlockOnHouseholds(self), + LinkStepFilter(self), + LinkStepScore(self), + ] + + def __init__(self, link_run): + super().__init__(link_run, display_name="Household Matching") + self.training_conf = "hh_training" + self.table_prefix = "hh_" diff --git a/hlink/linking/hh_matching/link_step_block_on_households.py b/hlink/linking/hh_matching/link_step_block_on_households.py new file mode 100644 index 0000000..98c5127 --- /dev/null +++ b/hlink/linking/hh_matching/link_step_block_on_households.py @@ -0,0 +1,83 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.linking.link_step import LinkStep + + +class LinkStepBlockOnHouseholds(LinkStep): + def __init__(self, task): + super().__init__( + task, + "block on households", + input_table_names=["predicted_matches", "prepped_df_a", "prepped_df_b"], + output_table_names=["hh_blocked_matches"], + ) + + def _run(self): + id_col = self.task.link_run.config["id_column"] + + # Get the IDs for the potential matches that were deemed a match + self.task.run_register_python( + "indiv_matches", + lambda: self.task.spark.table("predicted_matches") + .select(f"{id_col}_a", f"{id_col}_b") + .distinct(), + persist=True, + ) + + pdfa = self.task.spark.table("prepped_df_a") + pdfb = self.task.spark.table("prepped_df_b") + individuals_matched = self.task.spark.table("indiv_matches") + + # Get the HH serial ids for these matched individuals + serials_to_match = ( + individuals_matched.join( + pdfa, on=[individuals_matched[f"{id_col}_a"] == pdfa[f"{id_col}"]] + ) + .select(individuals_matched[f"{id_col}_b"], pdfa.serialp.alias("serialp_a")) + .join(pdfb, on=[individuals_matched[f"{id_col}_b"] == pdfb[f"{id_col}"]]) + .select("serialp_a", pdfb.serialp.alias("serialp_b")) + .distinct() + ) + + self.task.run_register_python("serials_to_match", lambda: serials_to_match) + + # Get the individual IDs and serialps of the people who were NOT matched in the first round + self.task.run_register_python( + "unmatched_a", + lambda: pdfa.join( + individuals_matched, + on=[pdfa[f"{id_col}"] == individuals_matched[f"{id_col}_a"]], + how="left_anti", + ).select( + pdfa[f"{id_col}"].alias(f"{id_col}_a"), pdfa.serialp.alias("serialp_a") + ), + ) + + self.task.run_register_python( + "unmatched_b", + lambda: pdfb.join( + individuals_matched, + on=[pdfb[f"{id_col}"] == individuals_matched[f"{id_col}_b"]], + how="left_anti", + ).select( + pdfb[f"{id_col}"].alias(f"{id_col}_b"), pdfb.serialp.alias("serialp_b") + ), + ) + + uma = self.task.spark.table("unmatched_a") + umb = self.task.spark.table("unmatched_b") + stm = self.task.spark.table("serials_to_match") + + # Generate potential matches with those unmatched people who were in a household (serialp) with a match, blocking only on household id + self.task.run_register_python( + "hh_blocked_matches", + lambda: stm.join(uma, "serialp_a").join(umb, "serialp_b").distinct(), + persist=True, + ) + + print( + 
"Potential matches from households which contained a scored match have been saved to table 'hh_blocked_matches'." + ) diff --git a/hlink/linking/hh_matching/link_step_filter.py b/hlink/linking/hh_matching/link_step_filter.py new file mode 100644 index 0000000..c34f06c --- /dev/null +++ b/hlink/linking/hh_matching/link_step_filter.py @@ -0,0 +1,62 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.comparison as comparison_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepFilter(LinkStep): + def __init__(self, task): + super().__init__( + task, + "filter", + input_table_names=["hh_blocked_matches"], + output_table_names=["hh_potential_matches"], + ) + + def _run(self): + # self.task.spark.sql("set spark.sql.shuffle.partitions=4000") + config = self.task.link_run.config + + # establish empty table context dict to pass to SQL template + t_ctx = {} + t_ctx["id_col"] = config["id_column"] + # get comparison_features + if config.get("hh_comparisons", False): + t_ctx["matching_clause"] = comparison_core.generate_comparisons( + config["hh_comparisons"], + config["comparison_features"], + config["id_column"], + ) + + comps = comparison_core.get_comparison_leaves(config["hh_comparisons"]) + comp_feature_names = [c["feature_name"] for c in comps] + + t_ctx["feature_columns"] = [ + comparison_feature_core.generate_comparison_feature( + f, config["id_column"], include_as=True + ) + for f in config["comparison_features"] + if f["alias"] in comp_feature_names + ] + + self.task.run_register_sql( + "hh_potential_matches", t_ctx=t_ctx, persist=True + ) + + else: + self.task.run_register_python( + "hh_potential_matches", + lambda: self.task.spark.table("hh_blocked_matches"), + persist=True, + ) + + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + + print( + "Potential matches from households which meet hh_comparsions thresholds have been saved to table 'hh_potential_matches'." + ) diff --git a/hlink/linking/hh_matching/templates/hh_blocked_matches.sql b/hlink/linking/hh_matching/templates/hh_blocked_matches.sql new file mode 100644 index 0000000..e8c8bf1 --- /dev/null +++ b/hlink/linking/hh_matching/templates/hh_blocked_matches.sql @@ -0,0 +1,17 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +unmatched_a.*, +unmatched_b.* + +FROM unmatched_a +JOIN unmatched_b +JOIN to_match + +ON +unmatched_a.serialp_a == to_match.serialp_a +AND +unmatched_b.serialp_b == to_match.serialp_b diff --git a/hlink/linking/hh_matching/templates/hh_potential_matches.sql b/hlink/linking/hh_matching/templates/hh_potential_matches.sql new file mode 100644 index 0000000..70a557b --- /dev/null +++ b/hlink/linking/hh_matching/templates/hh_potential_matches.sql @@ -0,0 +1,26 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT DISTINCT +hhbm.* +{% if feature_columns %} + {% for c in feature_columns %} + , {{c}} + {% endfor %} +{% endif %} + +FROM hh_blocked_matches hhbm +JOIN prepped_df_a a + +JOIN prepped_df_b b +ON +a.{{id_col}} == hhbm.{{id_col}}_a +AND +b.{{id_col}} == hhbm.{{id_col}}_b + +{% if matching_clause %} +WHERE +{{ matching_clause }} +{% endif %} diff --git a/hlink/linking/hh_model_exploration/__init__.py b/hlink/linking/hh_model_exploration/__init__.py new file mode 100644 index 0000000..61880fa --- /dev/null +++ b/hlink/linking/hh_model_exploration/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .hh_model_exploration import HHModelExploration diff --git a/hlink/linking/hh_model_exploration/hh_model_exploration.py b/hlink/linking/hh_model_exploration/hh_model_exploration.py new file mode 100644 index 0000000..596f3f5 --- /dev/null +++ b/hlink/linking/hh_model_exploration/hh_model_exploration.py @@ -0,0 +1,31 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from hlink.linking.model_exploration.link_step_ingest_file import LinkStepIngestFile +from hlink.linking.model_exploration.link_step_create_features import ( + LinkStepCreateFeatures, +) +from hlink.linking.model_exploration.link_step_train_test_models import ( + LinkStepTrainTestModels, +) + + +class HHModelExploration(LinkTask): + def __init__(self, link_run): + super().__init__( + link_run, + display_name="Household Model Exploration", + ) + self.training_conf = "hh_training" + self.table_prefix = "hh_model_eval_" + + def get_steps(self): + return [ + LinkStepIngestFile(self), + LinkStepCreateFeatures(self), + LinkStepTrainTestModels(self), + ] diff --git a/hlink/linking/hh_training/__init__.py b/hlink/linking/hh_training/__init__.py new file mode 100644 index 0000000..13021e3 --- /dev/null +++ b/hlink/linking/hh_training/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .hh_training import HHTraining diff --git a/hlink/linking/hh_training/hh_training.py b/hlink/linking/hh_training/hh_training.py new file mode 100644 index 0000000..85996aa --- /dev/null +++ b/hlink/linking/hh_training/hh_training.py @@ -0,0 +1,28 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from hlink.linking.training.link_step_ingest_file import LinkStepIngestFile +from hlink.linking.training.link_step_create_comparison_features import ( + LinkStepCreateComparisonFeatures, +) +from hlink.linking.training.link_step_train_and_save_model import ( + LinkStepTrainAndSaveModel, +) + + +class HHTraining(LinkTask): + def __init__(self, link_run): + super().__init__(link_run, display_name="Household Training") + self.training_conf = "hh_training" + self.table_prefix = "hh_" + + def get_steps(self): + return [ + LinkStepIngestFile(self), + LinkStepCreateComparisonFeatures(self), + LinkStepTrainAndSaveModel(self), + ] diff --git a/hlink/linking/link_run.py b/hlink/linking/link_run.py new file mode 100644 index 0000000..c8c2498 --- /dev/null +++ b/hlink/linking/link_run.py @@ -0,0 +1,118 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pathlib import Path +import pandas as pd + +from hlink.linking.preprocessing import Preprocessing +from hlink.linking.model_exploration import ModelExploration +from hlink.linking.training import Training +from hlink.linking.matching import Matching +from hlink.linking.reporting import Reporting +from hlink.linking.hh_model_exploration import HHModelExploration +from hlink.linking.hh_training import HHTraining +from hlink.linking.hh_matching import HHMatching +from hlink.linking.table import Table + +table_definitions_file = Path(__file__).parent / "table_definitions.csv" + +link_task_choices = { + "preprocessing": Preprocessing, + "training": Training, + "matching": Matching, + "hh_training": HHTraining, + "hh_matching": HHMatching, + "model_exploration": ModelExploration, + "hh_model_exploration": HHModelExploration, + "reporting": Reporting, +} + + +class LinkRun: + """A link run, which manages link tasks, spark tables, and related settings. + + A link run has attributes for each link task in `link_task_choices`. These can + be accessed like normal attributes with dot notation or with the `get_task()` + method. The link run also has a `known_tables` attribute which lists tables + commonly generated during linking and their descriptions. See the `get_table()` + function for a way to get access to `Table` objects by passing a string name. + + The `use_preexisting_tables` and `print_sql` attributes are boolean settings flags. + + The `trained_models` dictionary is used for communication between the matching + and training link tasks and the household matching and household training link + tasks. 
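+
+    Example (illustrative only; assumes a running `spark` session and a
+    parsed `config` dict):
+
+        lr = LinkRun(spark, config)
+        lr.get_task("preprocessing").run_all_steps()
+        lr.matching.run_all_steps()
+        scored = lr.get_table("scored_potential_matches").df()
+        lr.drop_temp_tables()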
+ """ + + def __init__(self, spark, config, use_preexisting_tables=True, print_sql=False): + self.spark = spark + self.config = config + self.use_preexisting_tables = use_preexisting_tables + self.print_sql = print_sql + + self.trained_models = {} + + for task_name in link_task_choices: + link_task = link_task_choices[task_name](self) + setattr(self, task_name, link_task) + + self._init_tables() + + def _init_tables(self): + """Initialize `self.known_tables` from the contents of `table_definitions_file`.""" + table_defs = pd.read_csv(table_definitions_file) + tables = [] + + for table_def in table_defs.itertuples(): + hide = table_def.hide != 0 + tables.append(Table(self.spark, table_def.name, table_def.desc, hide)) + + self.known_tables = {table.name: table for table in tables} + + def get_task(self, task_name: str): + """Get a link task attribute of the link run by name. + + If you have the string name of a link task and want the task itself, + use this method instead of something like `getattr()`. + + Args: + task_name (str): the name of the link task + + Raises: + AttributeError: if `task_name` is not the name of a link task on the link run + + Returns: + LinkTask: the requested link task + """ + if task_name in link_task_choices: + return getattr(self, task_name) + else: + raise AttributeError(f"LinkRun has no task named '{task_name}'") + + def get_table(self, table_name: str): + """Get a `Table` by name. + + If the table is in `self.known_tables`, return it. Otherwise, return a new + table. This method is infallible, so it will always return a table. The + returned table may or may not exist in spark. + + Args: + table_name (str): the name of the table to retrieve + + Returns: + Table: the requested table + """ + if table_name in self.known_tables: + return self.known_tables[table_name] + return Table(self.spark, table_name, "Unknown table", hide=True) + + def drop_temp_tables(self): + """Delete all temporary spark tables.""" + all_tables = self.spark.catalog.listTables() + temp_tables = filter((lambda table: table.tableType == "TEMPORARY"), all_tables) + + for table in temp_tables: + print(f"Dropping {table.name}") + self.spark.catalog.dropTempView(table.name) diff --git a/hlink/linking/link_step.py b/hlink/linking/link_step.py new file mode 100644 index 0000000..fca6b40 --- /dev/null +++ b/hlink/linking/link_step.py @@ -0,0 +1,50 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink +from typing import List + + +class LinkStep: + def __init__( + self, + task, + desc: str, + *, + input_table_names: List[str] = [], + output_table_names: List[str] = [], + input_model_names: List[str] = [], + output_model_names: List[str] = [], + ): + self.task = task + self.desc = desc + self.input_table_names = input_table_names + self.output_table_names = output_table_names + self.input_model_names = input_model_names + self.output_model_names = output_model_names + + def find_missing_input_table_names(self): + tables = map(self.task.link_run.get_table, self.input_table_names) + missing_tables = filter((lambda table: not table.exists()), tables) + return [table.name for table in missing_tables] + + def run(self): + missing_table_names = self.find_missing_input_table_names() + if len(missing_table_names) > 0: + missing_names_str = ", ".join(missing_table_names) + raise RuntimeError( + f"Missing input tables required for link step '{self}': {missing_names_str}" + ) + + self._run() + + def _run(self): + """Run the link step. + + This abstract method must be implemented by concrete subclasses. It is + wrapped by the `run()` method, which makes some additional quick checks. + """ + raise NotImplementedError() + + def __str__(self): + return self.desc diff --git a/hlink/linking/link_task.py b/hlink/linking/link_task.py new file mode 100755 index 0000000..e6b3073 --- /dev/null +++ b/hlink/linking/link_task.py @@ -0,0 +1,187 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from jinja2 import Environment, PackageLoader, ChoiceLoader +from hlink.errors import SparkError +from timeit import default_timer as timer +import logging +from typing import Optional + + +class LinkTask(object): + """Base class for link tasks. + + A link task consists of one or more `LinkStep`s and belongs to one `LinkRun`. + The `get_steps()` function returns a list specifying which steps the task has + and what order they should be run in. + + The `run_all_steps()` and `run_step()` functions are ways to run the link steps + belonging to the task. + + `run_register_python()` and `run_register_sql()` are methods to be used by + concrete subclasses for creating spark tables and performing step work. + """ + + def __init__(self, link_run, display_name: Optional[str] = None): + self.link_run = link_run + loader = ChoiceLoader( + [ + PackageLoader(self.__class__.__module__), + PackageLoader("hlink.linking", "templates/shared"), + ] + ) + self.jinja_env = Environment(loader=loader) + + if display_name is None: + self.display_name = self.__class__.__name__ + else: + self.display_name = display_name + + @property + def spark(self): + return self.link_run.spark + + def __str__(self): + return self.display_name + + def get_steps(self): + """Get a list of the steps that make up the link task. + + This abstract method must be implemented by concrete subclasses. 
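+        For example, the `Matching` task later in this change returns
+        `[LinkStepExplode(self), LinkStepMatch(self), LinkStepScore(self)]`,
+        and `run_all_steps()` executes them in exactly that order.
+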
+ + Returns: + List[LinkStep]: the link steps making up the link task + """ + raise NotImplementedError() + + def run_all_steps(self): + """Run all steps in order.""" + start_all = timer() + for (i, step) in enumerate(self.get_steps()): + print(f"Running step {i}: {step}") + step.run() + end_all = timer() + elapsed_time_all = round(end_all - start_all, 2) + print(f"Finished all in {elapsed_time_all}s") + + def run_step(self, step_num: int): + """Run a particular step. + + Note that running steps out of order may cause errors when later steps + depend on the work done by earlier steps and that work is not available. + + Args: + step_num (int): the step number, used as an index into the `get_steps()` list + """ + steps = self.get_steps() + + if step_num < 0 or step_num >= len(steps): + steps_string = "\n\t".join( + [f"step {i}: {steps[i].desc}" for i in range(len(steps))] + ) + print( + f"Error! Couldn't find step {step_num}. Valid steps are: \n\t{steps_string}" + ) + return + + step = steps[step_num] + step_string = f"step {step_num}: {step}" + print(f"Running {step_string}") + logging.info(f"Starting {step.task.display_name} - {step_string}") + + start = timer() + step.run() + end = timer() + + elapsed_time = round(end - start, 2) + print(f"Finished {step_string} in {elapsed_time}s") + logging.info( + f"Finished {step.task.display_name} - {step_string} in {elapsed_time}s" + ) + + def run_register_python( + self, + name: str, + func, + args=[], + persist=False, + overwrite_preexisting_tables=False, + ): + """Run the given python function `func` and register the returned data + frame with the given table `name`. + """ + if name is not None: + if overwrite_preexisting_tables is False: + df = self._check_preexisting_table(name) + else: + df = None + if df is None: + df = func(*args) + if persist: + df.write.mode("overwrite").saveAsTable(name) + else: + df.createOrReplaceTempView(name) + self.spark.sql(f"REFRESH TABLE {name}") + return self.spark.table(name) + else: + try: + return func(*args) + except Exception as err: + logging.error(err) + raise SparkError(str(err)) + + def run_register_sql( + self, + name: str, + sql=None, + template=None, + t_ctx={}, + persist=False, + overwrite_preexisting_tables=False, + ): + """Run the given sql or template (with context) and register the returned + data frame with the given table `name`. + + Read the table from disk instead of running sql if `use_preexisting_tables` + is set to True on the link task's `LinkRun`. + + Persist the created table if `persist` is True. 
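+
+        Example (illustrative; presumes a `potential_matches.sql` Jinja
+        template on this task's template path, as in the matching task):
+
+            self.run_register_sql(
+                "potential_matches",
+                t_ctx={"blocking_columns": ["bpl", "sex"]},
+                persist=True,
+            )
+
+        With no `sql` or `template` argument, the template file name defaults
+        to the table name, so this renders `potential_matches.sql` with
+        `t_ctx` and saves the result as the `potential_matches` table.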
+ """ + + def run_sql(): + sql_to_run = self._get_sql(name, sql, template, t_ctx) + if self.link_run.print_sql: + print(sql_to_run) + try: + return self.spark.sql(sql_to_run) + except Exception as err: + print(f"Exception in Spark SQL: {sql_to_run}") + + logging.error(str(err)) + raise SparkError(str(err)) + + return self.run_register_python( + name=name, + func=run_sql, + persist=persist, + overwrite_preexisting_tables=overwrite_preexisting_tables, + ) + + def _check_preexisting_table(self, name: str): + table = self.link_run.get_table(name) + if self.link_run.use_preexisting_tables and table.exists(): + print(f"Preexisting table: {name}") + return table.df() + return None + + def _get_sql(self, name: str, sql, template, t_ctx): + if sql is None: + template_file_name = template if template is not None else name + template_path = f"{template_file_name}.sql" + sql = self.jinja_env.get_template(f"{template_path}").render(t_ctx) + print(f"{name} -- {template_path}") + else: + print(name) + return sql diff --git a/hlink/linking/matching/__init__.py b/hlink/linking/matching/__init__.py new file mode 100644 index 0000000..ebc5e4b --- /dev/null +++ b/hlink/linking/matching/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .matching import Matching diff --git a/hlink/linking/matching/_helpers.py b/hlink/linking/matching/_helpers.py new file mode 100644 index 0000000..0dc79e8 --- /dev/null +++ b/hlink/linking/matching/_helpers.py @@ -0,0 +1,14 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +def get_blocking(conf): + if "blocking" in conf: + return conf["blocking"] + else: + print( + "DEPRECATION WARNING: The config value 'blocking_steps' has been renamed to 'blocking' and is now just a single array of objects." + ) + return conf["blocking_steps"][0] diff --git a/hlink/linking/matching/link_step_explode.py b/hlink/linking/matching/link_step_explode.py new file mode 100644 index 0000000..4d5cb3f --- /dev/null +++ b/hlink/linking/matching/link_step_explode.py @@ -0,0 +1,143 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.functions import array, explode, col + +import hlink.linking.core.comparison as comparison_core +from . 
import _helpers as matching_helpers + +from hlink.linking.link_step import LinkStep + + +class LinkStepExplode(LinkStep): + def __init__(self, task): + super().__init__( + task, + "explode", + input_table_names=["prepped_df_a", "prepped_df_b"], + output_table_names=["exploded_df_a", "exploded_df_b"], + ) + + def _run(self): + config = self.task.link_run.config + # filter the universe of potential matches before exploding + t_ctx = {} + universe_conf = config.get("potential_matches_universe", []) + t_ctx["universe_exprs"] = [ + conf_entry["expression"] for conf_entry in universe_conf + ] + for suffix in ("a", "b"): + t_ctx["prepped_df"] = f"prepped_df_{suffix}" + output_table_name = f"match_universe_df_{suffix}" + self.task.run_register_sql( + output_table_name, + template="potential_matches_universe", + t_ctx=t_ctx, + persist=True, + ) + + # self.spark.sql("set spark.sql.shuffle.partitions=4000") + blocking = matching_helpers.get_blocking(config) + + self.task.run_register_python( + name="exploded_df_a", + func=lambda: self._explode( + df=self.task.spark.table("match_universe_df_a"), + comparisons=config["comparisons"], + comparison_features=config["comparison_features"], + blocking=blocking, + id_column=config["id_column"], + is_a=True, + ), + ) + self.task.run_register_python( + name="exploded_df_b", + func=lambda: self._explode( + df=self.task.spark.table("match_universe_df_b"), + comparisons=config["comparisons"], + comparison_features=config["comparison_features"], + blocking=blocking, + id_column=config["id_column"], + is_a=False, + ), + ) + + def _explode(self, df, comparisons, comparison_features, blocking, id_column, is_a): + + # comp_feature_names, dist_features_to_run, feature_columns = comparison_core.get_feature_specs_from_comp( + # comparisons, comparison_features + # ) + feature_columns = [] + if comparisons: + comps = comparison_core.get_comparison_leaves(comparisons) + comparison_feature_names = [c["feature_name"] for c in comps] + comparison_features_to_run = [ + c for c in comparison_features if c["alias"] in comparison_feature_names + ] + for c in comparison_features_to_run: + if c.get("column_name", False): + feature_columns.append(c["column_name"]) + elif c.get("column_names", False): + feature_columns += c["column_names"] + + exploded_df = df + + blocking_columns = [bc["column_name"] for bc in blocking] + + all_column_names = set(blocking_columns + feature_columns + [id_column]) + + all_exploding_columns = [bc for bc in blocking if bc.get("explode", False)] + + for exploding_column in all_exploding_columns: + exploding_column_name = exploding_column["column_name"] + if exploding_column.get("expand_length", False): + expand_length = exploding_column["expand_length"] + derived_from_column = exploding_column["derived_from"] + explode_selects = [ + explode(self._expand(derived_from_column, expand_length)).alias( + exploding_column_name + ) + if exploding_column_name == column + else column + for column in all_column_names + ] + else: + explode_selects = [ + explode(col(exploding_column_name)).alias(exploding_column_name) + if exploding_column_name == c + else c + for c in all_column_names + ] + if "dataset" in exploding_column: + derived_from_column = exploding_column["derived_from"] + explode_selects_with_derived_column = [ + col(derived_from_column).alias(exploding_column_name) + if exploding_column_name == column + else column + for column in all_column_names + ] + if exploding_column["dataset"] == "a": + exploded_df = ( + exploded_df.select(explode_selects) + if is_a 
+ else exploded_df.select(explode_selects_with_derived_column) + ) + elif exploding_column["dataset"] == "b": + exploded_df = ( + exploded_df.select(explode_selects) + if not (is_a) + else exploded_df.select(explode_selects_with_derived_column) + ) + else: + exploded_df = exploded_df.select(explode_selects) + return exploded_df + + def _expand(self, column_name, expand_length): + return array( + [ + col(column_name).cast("int") + i + for i in range(-expand_length, expand_length + 1) + ] + ) diff --git a/hlink/linking/matching/link_step_match.py b/hlink/linking/matching/link_step_match.py new file mode 100644 index 0000000..3868639 --- /dev/null +++ b/hlink/linking/matching/link_step_match.py @@ -0,0 +1,98 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.dist_table as dist_table_core +import hlink.linking.core.comparison as comparison_core +from . import _helpers as matching_helpers + +from hlink.linking.link_step import LinkStep + + +class LinkStepMatch(LinkStep): + def __init__(self, task): + super().__init__( + task, + "match", + input_table_names=["exploded_df_a", "exploded_df_b"], + output_table_names=["potential_matches"], + ) + + def _run(self): + config = self.task.link_run.config + if config.get("mesos", False): + self.task.spark.sql("set spark.sql.shuffle.partitions=4000") + + blocking = matching_helpers.get_blocking(config) + + t_ctx = {} + if config.get("comparisons", False): + if config["comparisons"] != {}: + t_ctx["matching_clause"] = comparison_core.generate_comparisons( + config["comparisons"], + config["comparison_features"], + config["id_column"], + ) + + t_ctx["blocking_columns"] = [bc["column_name"] for bc in blocking] + + blocking_exploded_columns = [ + bc["column_name"] for bc in blocking if "explode" in bc and bc["explode"] + ] + t_ctx["dataset_columns"] = [ + c + for c in self.task.spark.table("exploded_df_a").columns + if c not in blocking_exploded_columns + ] + + # comp_feature_names, dist_features_to_run, features_to_run = comparison_core.get_feature_specs_from_comp( + # config["comparisons"], config["comparison_features"] + # ) + if config.get("comparisons", {}): + comps = comparison_core.get_comparison_leaves(config["comparisons"]) + comp_feature_names = [c["feature_name"] for c in comps] + + t_ctx["feature_columns"] = [ + comparison_feature_core.generate_comparison_feature( + f, config["id_column"], include_as=True + ) + for f in config["comparison_features"] + if f["alias"] in comp_feature_names + ] + + dist_feature_names = [ + c["alias"] + for c in config["comparison_features"] + if c["comparison_type"] in ["geo_distance"] + ] + dist_features_to_run = [ + c["feature_name"] + for c in comps + if c["feature_name"] in dist_feature_names + ] + + if dist_features_to_run: + dist_comps = [ + c + for c in config["comparison_features"] + if c["alias"] in dist_features_to_run + ] + ( + t_ctx["distance_table"], + dist_tables, + ) = dist_table_core.register_dist_tables_and_create_sql( + self.task, dist_comps + ) + if dist_tables: + t_ctx["broadcast_hints"] = dist_table_core.get_broadcast_hint( + dist_tables + ) + + if config.get("streamline_potential_match_generation", False): + t_ctx["dataset_columns"] = [config["id_column"]] + try: + self.task.run_register_sql("potential_matches", t_ctx=t_ctx, 
persist=True) + finally: + self.task.spark.sql("set spark.sql.shuffle.partitions=200") diff --git a/hlink/linking/matching/link_step_score.py b/hlink/linking/matching/link_step_score.py new file mode 100644 index 0000000..13799e9 --- /dev/null +++ b/hlink/linking/matching/link_step_score.py @@ -0,0 +1,219 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql import Row, Window +from pyspark.sql import functions as f + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.threshold as threshold_core +import hlink.linking.core.dist_table as dist_table_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepScore(LinkStep): + def __init__(self, task): + super().__init__( + task, + "score", + input_table_names=[f"{task.table_prefix}potential_matches"], + output_table_names=[ + f"{task.table_prefix}potential_matches_prepped", + f"{task.table_prefix}scored_potential_matches", + f"{task.table_prefix}predicted_matches", + ], + input_model_names=[f"{task.table_prefix}trained_model"], + ) + + def _run(self): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + if training_conf not in config or "chosen_model" not in config[training_conf]: + print( + f"WARNING: Skipping step '{self.desc}'. Your config file either does not contain a '{training_conf}' section or a 'chosen_model' section within the '{training_conf}' section." + ) + return + + if config.get("mesos", False): + self.task.spark.sql("set spark.sql.shuffle.partitions=10000") + self.task.spark.sql("set spark.sql.files.maxPartitionBytes=28000000") + + id_a = config["id_column"] + "_a" + id_b = config["id_column"] + "_b" + chosen_model_params = config[training_conf]["chosen_model"].copy() + self._create_features(config) + pm = self.task.spark.table(f"{table_prefix}potential_matches_prepped") + ind_var_columns = config[training_conf]["independent_vars"] + flatten = lambda l: [item for sublist in l for item in sublist] + if config.get("pipeline_features", False): + pipeline_columns = flatten( + [ + f["input_columns"] if "input_columns" in f else [f["input_column"]] + for f in config["pipeline_features"] + ] + ) + else: + pipeline_columns = [] + required_columns = set( + ind_var_columns + + pipeline_columns + + ["exact", id_a, id_b, "serialp_a", "serialp_b"] + ) & set(pm.columns) + + pre_pipeline = self.task.link_run.trained_models.get( + f"{table_prefix}pre_pipeline" + ) + if pre_pipeline is None: + raise ValueError( + "Missing a temporary table from the training task. This table will not be persisted between sessions of hlink for technical reasons. Please run training before running this step." 
+ ) + + self.task.run_register_python( + f"{table_prefix}potential_matches_pipeline", + lambda: pre_pipeline.transform(pm.select(*required_columns)), + persist=True, + ) + plm = self.task.link_run.trained_models[f"{table_prefix}trained_model"] + pp_required_cols = set(plm.stages[0].getInputCols() + [id_a, id_b]) + pre_pipeline = self.task.spark.table( + f"{table_prefix}potential_matches_pipeline" + ).select(*pp_required_cols) + score_tmp = plm.transform(pre_pipeline) + # TODO: Move save_feature_importances to training or model evaluation step + # _save_feature_importances(self.spark, score_tmp) + + alpha_threshold = chosen_model_params.get("threshold", 0.5) + threshold_ratio = threshold_core.get_threshold_ratio( + config[training_conf], chosen_model_params, default=1.3 + ) + predictions = threshold_core.predict_using_thresholds( + score_tmp, + alpha_threshold, + threshold_ratio, + config[training_conf], + config["id_column"], + ) + predictions.write.mode("overwrite").saveAsTable(f"{table_prefix}predictions") + pmp = self.task.spark.table(f"{table_prefix}potential_matches_pipeline") + self._save_table_with_requested_columns(pm, pmp, predictions, id_a, id_b) + self._save_predicted_matches(config, id_a, id_b) + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + + def _save_feature_importances(self, spark, score_tmp): + config = self.task.link_run.config + if not (config[f"{self.task.training_conf}"].get("feature_importances", False)): + return + cols = ( + score_tmp.select("*").schema["features_vector"].metadata["ml_attr"]["attrs"] + ) + list_extract = [] + for i in cols: + list_extract += cols[i] + + varlist = spark.createDataFrame(Row(**x) for x in list_extract) + varlist.write.mode("overwrite").saveAsTable( + f"{self.task.table_prefix}features_list" + ) + + def _save_table_with_requested_columns(self, pm, pmp, predictions, id_a, id_b): + # merge back in original data for feature verification + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + if config.get("drop_data_from_scored_matches", False): + output_columns = [ + f"{id_a}", + f"{id_b}", + "probability_array", + "probability", + "prediction", + ] + columns_to_select = sorted( + list(set(predictions.columns) & set(output_columns)) + ) + self.task.run_register_python( + f"{table_prefix}scored_potential_matches", + lambda: self.task.spark.table(f"{table_prefix}predictions").select( + columns_to_select + ), + persist=True, + ) + else: + pm_source_cols = list(set(pmp.columns) - set(predictions.columns)) + self.task.run_register_sql( + f"{table_prefix}scored_potential_matches", + template="scored_potential_matches", + t_ctx={ + "pm_source_cols": pm_source_cols, + "id_a": id_a, + "id_b": id_b, + "predictions": f"{table_prefix}predictions", + "potential_matches": f"{table_prefix}potential_matches_pipeline", + }, + persist=True, + ) + print( + f"Scored potential matches have been saved to the Spark table '{table_prefix}scored_potential_matches'." 
+ ) + + def _save_predicted_matches(self, conf, id_a, id_b): + table_prefix = self.task.table_prefix + + spms = self.task.spark.table(f"{table_prefix}scored_potential_matches").filter( + "prediction == 1" + ) + w = Window.partitionBy(f"{id_b}") + spms = spms.select("*", f.count(f"{id_b}").over(w).alias(f"{id_b}_count")) + spms = spms.filter(f"{id_b}_count == 1") + spms = spms.drop(f"{id_b}_count") + spms.write.mode("overwrite").saveAsTable(f"{table_prefix}predicted_matches") + print( + f"Predicted matches with duplicate histid_b removed have been saved to the Spark table '{table_prefix}predicted_matches'." + ) + + def _create_features(self, conf): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + + dep_var = conf[training_conf]["dependent_var"] + potential_matches = f"{table_prefix}potential_matches" + table_name = f"{table_prefix}potential_matches_prepped" + pm_columns = self.task.spark.table(potential_matches).columns + ( + comp_features, + advanced_comp_features, + hh_comp_features, + dist_features, + ) = comparison_feature_core.get_features( + conf, + conf[f"{training_conf}"]["independent_vars"], + pregen_features=pm_columns, + ) + t_ctx_def = { + "comp_features": comp_features, + "match_feature": dep_var, + "advanced_comp_features": advanced_comp_features, + "id": conf["id_column"], + "potential_matches": potential_matches, + } + join_clauses, dist_tables = dist_table_core.register_dist_tables_and_create_sql( + self.task, dist_features + ) + t_ctx_def["distance_table"] = join_clauses + if len(dist_tables): + t_ctx_def["broadcast_hints"] = dist_table_core.get_broadcast_hint( + dist_tables + ) + + comparison_feature_core.create_feature_tables( + self.task, + t_ctx_def, + advanced_comp_features, + hh_comp_features, + conf["id_column"], + table_name=table_name, + ) diff --git a/hlink/linking/matching/matching.py b/hlink/linking/matching/matching.py new file mode 100644 index 0000000..3180283 --- /dev/null +++ b/hlink/linking/matching/matching.py @@ -0,0 +1,20 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from .link_step_explode import LinkStepExplode +from .link_step_match import LinkStepMatch +from .link_step_score import LinkStepScore + + +class Matching(LinkTask): + def __init__(self, link_run): + super().__init__(link_run) + self.training_conf = "training" + self.table_prefix = "" + + def get_steps(self): + return [LinkStepExplode(self), LinkStepMatch(self), LinkStepScore(self)] diff --git a/hlink/linking/matching/templates/potential_matches.sql b/hlink/linking/matching/templates/potential_matches.sql new file mode 100644 index 0000000..bb61c03 --- /dev/null +++ b/hlink/linking/matching/templates/potential_matches.sql @@ -0,0 +1,29 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT DISTINCT +{% for c in dataset_columns %} + {% if not loop.first %},{% endif %}a.{{c}} as {{c}}_a + ,b.{{c}} as {{c}}_b +{% endfor %} +{% if feature_columns %} + {% for c in feature_columns %} + ,{{c}} + {% endfor %} +{% endif %} +FROM exploded_df_a a +JOIN exploded_df_b b ON +{% for col in blocking_columns %} +a.{{ col }} = b.{{ col }} {{ "AND" if not loop.last }} +{% endfor %} +{% if distance_table %} + {% for d in distance_table %} + {{d}} + {% endfor %} +{% endif %} +{% if matching_clause %} +WHERE +{{ matching_clause }} +{% endif %} diff --git a/hlink/linking/matching/templates/potential_matches_count.sql b/hlink/linking/matching/templates/potential_matches_count.sql new file mode 100644 index 0000000..69f66a9 --- /dev/null +++ b/hlink/linking/matching/templates/potential_matches_count.sql @@ -0,0 +1,11 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT COUNT(1) +FROM exploded_df_a a +JOIN exploded_df_b b ON + {% for col in blocking_columns %} + a.{{ col }} = b.{{ col }} {{ "AND" if not loop.last }} + {% endfor %} diff --git a/hlink/linking/matching/templates/potential_matches_universe.sql b/hlink/linking/matching/templates/potential_matches_universe.sql new file mode 100644 index 0000000..4260996 --- /dev/null +++ b/hlink/linking/matching/templates/potential_matches_universe.sql @@ -0,0 +1,14 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT * +FROM {{prepped_df}} +{% if universe_exprs %} + WHERE + {% for expression in universe_exprs %} + {{ expression }} + {{ "AND" if not loop.last }} + {% endfor %} +{% endif %} diff --git a/hlink/linking/model_exploration/__init__.py b/hlink/linking/model_exploration/__init__.py new file mode 100644 index 0000000..ada88f3 --- /dev/null +++ b/hlink/linking/model_exploration/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .model_exploration import ModelExploration diff --git a/hlink/linking/model_exploration/link_step_create_features.py b/hlink/linking/model_exploration/link_step_create_features.py new file mode 100644 index 0000000..cba8a60 --- /dev/null +++ b/hlink/linking/model_exploration/link_step_create_features.py @@ -0,0 +1,125 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml import Pipeline + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.dist_table as dist_table_core +import hlink.linking.core.pipeline as pipeline_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepCreateFeatures(LinkStep): + def __init__(self, task): + super().__init__( + task, + "create features", + input_table_names=[ + "prepped_df_a", + "prepped_df_b", + f"{task.table_prefix}training_data", + ], + output_table_names=[ + f"{task.table_prefix}training_features", + f"{task.table_prefix}training_vectorized", + ], + ) + + def _run(self): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + id_col = config["id_column"] + dep_var = config[training_conf]["dependent_var"] + + if training_conf == "hh_training": + self.task.run_register_python( + f"{table_prefix}training_data_ids", + lambda: self.task.spark.table(f"{table_prefix}training_data").select( + f"{id_col}_a", f"{id_col}_b", "serialp_a", "serialp_b", dep_var + ), + persist=True, + ) + else: + self.task.run_register_python( + f"{table_prefix}training_data_ids", + lambda: self.task.spark.table(f"{table_prefix}training_data").select( + f"{id_col}_a", f"{id_col}_b", dep_var + ), + persist=True, + ) + self._create_training_features(dep_var) + + training_features = self.task.spark.table(f"{table_prefix}training_features") + pipeline = self._create_pipeline(training_features) + model = pipeline.fit(training_features) + prepped_data = model.transform(training_features) + prepped_data.write.mode("overwrite").saveAsTable( + f"{table_prefix}training_vectorized" + ) + self.task.link_run.drop_temp_tables() + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + + def _create_training_features(self, dep_var): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + if config[training_conf].get("use_training_data_features", False): + self.task.run_register_python( + f"{table_prefix}training_features", + lambda: self.task.spark.table(f"{table_prefix}training_data"), + persist=True, + ) + else: + ( + comp_features, + advanced_comp_features, + hh_comp_features, + dist_features, + ) = comparison_feature_core.get_features( + config, config[training_conf]["independent_vars"] + ) + t_ctx_def = { + "comp_features": comp_features, + "match_feature": dep_var, + "advanced_comp_features": advanced_comp_features, + "id": config["id_column"], + "potential_matches": f"{table_prefix}training_data_ids", + } + + ( + join_clauses, + dist_tables, + ) = dist_table_core.register_dist_tables_and_create_sql( + self.task, dist_features + ) + t_ctx_def["distance_table"] = join_clauses + if len(dist_tables) > 0: + t_ctx_def["broadcast_hints"] = dist_table_core.get_broadcast_hint( + dist_tables + ) + + comparison_feature_core.create_feature_tables( + self.task, + t_ctx_def, + advanced_comp_features, + hh_comp_features, + config["id_column"], + table_name=f"{table_prefix}training_features", + ) + + def _create_pipeline(self, training_features): + training_conf = str(self.task.training_conf) + config = self.task.link_run.config + ind_vars = list(config[training_conf]["independent_vars"]) + + pipeline_stages = 
pipeline_core.generate_pipeline_stages(
+            config, ind_vars, training_features, training_conf
+        )
+        return Pipeline(stages=pipeline_stages)
diff --git a/hlink/linking/model_exploration/link_step_get_feature_importances.py b/hlink/linking/model_exploration/link_step_get_feature_importances.py
new file mode 100644
index 0000000..fa4fc46
--- /dev/null
+++ b/hlink/linking/model_exploration/link_step_get_feature_importances.py
@@ -0,0 +1,84 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+from pyspark.ml import PipelineModel
+from hlink.linking.link_step import LinkStep
+
+
+class LinkStepGetFeatureImportances(LinkStep):
+    def __init__(self, task):
+        super().__init__(
+            task,
+            "get feature importances",
+            input_table_names=[
+                f"{task.table_prefix}training_features",
+                f"{task.table_prefix}training_vectorized",
+            ],
+            output_table_names=[f"{task.table_prefix}training_results"],
+        )
+
+    def _run(self):
+        training_conf = str(self.task.training_conf)
+        table_prefix = self.task.table_prefix
+        config = self.task.link_run.config
+
+        self.task.spark.sql("set spark.sql.shuffle.partitions=1")
+
+        if "feature_importances" in config[training_conf]:
+            if config[training_conf]["feature_importances"]:
+
+                # retrieve the saved chosen model
+                print("Loading chosen ML model...")
+                model_path = config["spark_tmp_dir"] + "/chosen_model"
+                try:
+                    plm = PipelineModel.load(model_path)
+                except:
+                    print(
+                        "Model not found! You might need to run step_2 to generate and train the chosen model if you haven't already done so."
+                    )
+
+                # look at the features and their importances
+                print("Retrieving model feature importances or coefficients...")
+                try:
+                    feature_imp = plm.stages[-2].coefficients
+                except:
+                    try:
+                        feature_imp = plm.stages[-2].featureImportances
+                    except:
+                        print(
+                            "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type."
+                        )
+                    else:
+                        label = "Feature importances"
+                else:
+                    feature_imp = feature_imp.round(4)
+                    label = "Coefficients"
+
+                varlist = self.task.spark.table(
+                    f"{table_prefix}features_list"
+                ).toPandas()
+                for i in varlist["idx"]:
+                    varlist.at[i, "score"] = feature_imp[i]
+                varlist.sort_values("score", ascending=False, inplace=True)
+                vl = self.task.spark.createDataFrame(varlist)
+                vl.write.mode("overwrite").saveAsTable(
+                    f"{table_prefix}feature_importances"
+                )
+
+                print(
+                    f"{label} have been saved to the Spark table '{table_prefix}feature_importances'."
+                )
+                print(varlist)
+
+            else:
+                print(
+                    f"'feature_importances' is not set to true in the '{training_conf}' section of the config, so feature importances or coefficients were not calculated."
+                )
+        else:
+            print(
+                f"'feature_importances' is not included and set to true in the '{training_conf}' section of the config, so feature importances or coefficients were not calculated."
+            )
+
+        self.task.spark.sql("set spark.sql.shuffle.partitions=200")
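
A note for readers: the `plm.stages[-2]` lookup above assumes the persisted pipeline ends with the fitted classifier followed by a single post-transformer stage, mirroring the `classifier, post_transformer` pair that `choose_classifier` returns elsewhere in this change. A minimal sketch of reading importances under that assumption, with a hypothetical model path:

    from pyspark.ml import PipelineModel

    plm = PipelineModel.load("/path/to/spark_tmp_dir/chosen_model")  # hypothetical path
    classifier_stage = plm.stages[-2]
    if hasattr(classifier_stage, "coefficients"):
        scores = classifier_stage.coefficients       # e.g. logistic regression
    else:
        scores = classifier_stage.featureImportances  # e.g. tree ensembles

diff --git a/hlink/linking/model_exploration/link_step_ingest_file.py b/hlink/linking/model_exploration/link_step_ingest_file.py
new file mode 100644
index 0000000..d7ad6cf
--- /dev/null
+++ b/hlink/linking/model_exploration/link_step_ingest_file.py
@@ -0,0 +1,27 @@
+# This file is part of the ISRDI's hlink.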
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.linking.link_step import LinkStep + + +class LinkStepIngestFile(LinkStep): + def __init__(self, task): + super().__init__( + task, + "ingest file", + input_table_names=[], + output_table_names=[f"{task.table_prefix}training_data"], + ) + + def _run(self): + self.task.run_register_python( + f"{self.task.table_prefix}training_data", + lambda: self.task.spark.read.csv( + self.task.link_run.config[f"{self.task.training_conf}"]["dataset"], + header=True, + inferSchema=True, + ), + persist=True, + ) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py new file mode 100644 index 0000000..7a07bed --- /dev/null +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -0,0 +1,569 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import itertools +import math +import re +import numpy as np +import pandas as pd +from sklearn.metrics import precision_recall_curve, auc +from pyspark.sql.functions import count, mean + +import hlink.linking.core.threshold as threshold_core +import hlink.linking.core.classifier as classifier_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepTrainTestModels(LinkStep): + def __init__(self, task): + super().__init__( + task, + "train test models", + input_table_names=[ + f"{task.table_prefix}training_features", + f"{task.table_prefix}training_vectorized", + ], + output_table_names=[ + f"{task.table_prefix}training_results", + f"{task.table_prefix}repeat_FPs", + f"{task.table_prefix}repeat_FNs", + ], + ) + + def _run(self): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + self.task.spark.sql("set spark.sql.shuffle.partitions=1") + + dep_var = config[training_conf]["dependent_var"] + id_a = config["id_column"] + "_a" + id_b = config["id_column"] + "_b" + desc_df = _create_desc_df() + columns_to_keep = [id_a, id_b, "features_vector", dep_var] + prepped_data = ( + self.task.spark.table(f"{table_prefix}training_vectorized") + .select(columns_to_keep) + .cache() + ) + + otd_data = self._create_otd_data(id_a, id_b) + + n_training_iterations = config[training_conf].get("n_training_iterations", 10) + seed = config[training_conf].get("seed", 2133) + + splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed) + + model_parameters = self._get_model_parameters(config) + for run in model_parameters: + params = run.copy() + model_type = params.pop("type") + + alpha_threshold = params.pop( + "threshold", config[training_conf].get("threshold", 0.8) + ) + if ( + config[training_conf].get("decision", False) + == "drop_duplicate_with_threshold_ratio" + ): + threshold_ratio = params.pop( + "threshold_ratio", + threshold_core.get_threshold_ratio(config[training_conf], params), + ) + else: + threshold_ratio = False + + threshold_matrix = _calc_threshold_matrix(alpha_threshold, threshold_ratio) + results_dfs = {} + for i in range(len(threshold_matrix)): + results_dfs[i] = _create_results_df() + + first = True + for training_data, test_data in splits: + training_data.cache() + test_data.cache() + + classifier, post_transformer = 
classifier_core.choose_classifier( + model_type, params, dep_var + ) + + model = classifier.fit(training_data) + + predictions_tmp = _get_probability_and_select_pred_columns( + test_data, model, post_transformer, id_a, id_b, dep_var + ).cache() + predict_train_tmp = _get_probability_and_select_pred_columns( + training_data, model, post_transformer, id_a, id_b, dep_var + ).cache() + + test_pred = predictions_tmp.toPandas() + precision, recall, thresholds_raw = precision_recall_curve( + test_pred[f"{dep_var}"], + test_pred["probability"].round(2), + pos_label=1, + ) + + thresholds_plus_1 = np.append(thresholds_raw, [np.nan]) + param_text = np.full(precision.shape, model_type + "_" + str(params)) + + pr_auc = auc(recall, precision) + print(f"Area under PR curve: {pr_auc}") + + if first: + prc = pd.DataFrame( + { + "params": param_text, + "precision": precision, + "recall": recall, + "threshold_gt_eq": thresholds_plus_1, + } + ) + self.task.spark.createDataFrame(prc).write.mode( + "overwrite" + ).saveAsTable( + f"{self.task.table_prefix}precision_recall_curve_" + + re.sub("[^A-Za-z0-9]", "_", model_type + str(params)) + ) + + first = False + + i = 0 + for at, tr in threshold_matrix: + predictions = threshold_core.predict_using_thresholds( + predictions_tmp, + at, + tr, + config[training_conf], + config["id_column"], + ) + predict_train = threshold_core.predict_using_thresholds( + predict_train_tmp, + at, + tr, + config[training_conf], + config["id_column"], + ) + + results_dfs[i] = self._capture_results( + predictions, + predict_train, + dep_var, + model, + results_dfs[i], + otd_data, + at, + tr, + pr_auc, + ) + i += 1 + + training_data.unpersist() + test_data.unpersist() + + for i in range(len(threshold_matrix)): + desc_df = _append_results(desc_df, results_dfs[i], model_type, params) + + _print_desc_df(desc_df) + desc_df = _load_desc_df_params(desc_df) + self._save_training_results(desc_df, self.task.spark) + self._save_otd_data(otd_data, self.task.spark) + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + + def _get_splits(self, prepped_data, id_a, n_training_iterations, seed): + if self.task.link_run.config[f"{self.task.training_conf}"].get( + "split_by_id_a", False + ): + split_ids = [ + prepped_data.select(id_a) + .distinct() + .randomSplit([0.5, 0.5], seed=seed + i) + for i in range(n_training_iterations) + ] + + splits = [] + for ids_a, ids_b in split_ids: + split_a = prepped_data.join(ids_a, on=id_a, how="inner") + split_b = prepped_data.join(ids_b, on=id_a, how="inner") + splits.append([split_a, split_b]) + + else: + splits = [ + prepped_data.randomSplit([0.5, 0.5], seed=seed + i) + for i in range(n_training_iterations) + ] + + return splits + + def _custom_param_grid_builder(self, conf): + print("Building param grid for models") + given_parameters = conf[f"{self.task.training_conf}"]["model_parameters"] + new_params = [] + for run in given_parameters: + params = run.copy() + model_type = params.pop("type") + + # dropping thresholds to prep for scikitlearn model exploration refactor + threshold = params.pop("threshold", False) + threshold_ratio = params.pop("threshold_ratio", False) + + keys = params.keys() + values = params.values() + + params_exploded = [] + for prod in itertools.product(*values): + params_exploded.append(dict(zip(keys, prod))) + + for subdict in params_exploded: + subdict["type"] = model_type + if threshold: + subdict["threshold"] = threshold + if threshold_ratio: + subdict["threshold_ratio"] = threshold_ratio + + new_params.extend(params_exploded) + 
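+        # Illustrative expansion (the model type string is an example, and the
+        # parameter names are drawn from those handled in _load_desc_df_params
+        # below): given
+        #   {"type": "random_forest", "maxDepth": [5, 10], "numTrees": [50, 100]}
+        # itertools.product over the list-valued parameters yields four runs:
+        #   {"maxDepth": 5, "numTrees": 50, "type": "random_forest"}
+        #   {"maxDepth": 5, "numTrees": 100, "type": "random_forest"}
+        #   {"maxDepth": 10, "numTrees": 50, "type": "random_forest"}
+        #   {"maxDepth": 10, "numTrees": 100, "type": "random_forest"}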
return new_params + + def _capture_results( + self, + predictions, + predict_train, + dep_var, + model, + results_df, + otd_data, + at, + tr, + pr_auc, + ): + table_prefix = self.task.table_prefix + + print("Evaluating model performance...") + # write to sql tables for testing + predictions.createOrReplaceTempView(f"{table_prefix}predictions") + predict_train.createOrReplaceTempView(f"{table_prefix}predict_train") + + ( + test_TP_count, + test_FP_count, + test_FN_count, + test_TN_count, + ) = _get_confusion_matrix(predictions, dep_var, otd_data) + test_precision, test_recall, test_mcc = _get_aggregate_metrics( + test_TP_count, test_FP_count, test_FN_count, test_TN_count + ) + + ( + train_TP_count, + train_FP_count, + train_FN_count, + train_TN_count, + ) = _get_confusion_matrix(predict_train, dep_var, otd_data) + train_precision, train_recall, train_mcc = _get_aggregate_metrics( + train_TP_count, train_FP_count, train_FN_count, train_TN_count + ) + + return results_df.append( + { + "precision_test": test_precision, + "recall_test": test_recall, + "precision_train": train_precision, + "recall_train": train_recall, + "pr_auc": pr_auc, + "test_mcc": test_mcc, + "train_mcc": train_mcc, + "model_id": model, + "alpha_threshold": at, + "threshold_ratio": tr, + }, + ignore_index=True, + ) + + def _get_model_parameters(self, conf): + training_conf = str(self.task.training_conf) + + model_parameters = conf[training_conf]["model_parameters"] + if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: + model_parameters = self._custom_param_grid_builder(conf) + elif model_parameters == []: + raise ValueError( + "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." + ) + return model_parameters + + def _save_training_results(self, desc_df, spark): + table_prefix = self.task.table_prefix + + if desc_df.empty: + print("Training results dataframe is empty.") + else: + desc_df.dropna(axis=1, how="all", inplace=True) + spark.createDataFrame(desc_df, samplingRatio=1).write.mode( + "overwrite" + ).saveAsTable(f"{table_prefix}training_results") + print( + f"Training results saved to Spark table '{table_prefix}training_results'." + ) + + def _save_otd_data(self, otd_data, spark): + table_prefix = self.task.table_prefix + + if otd_data is None: + return + id_a = otd_data["id_a"] + id_b = otd_data["id_b"] + if not otd_data["FP_data"].empty: + sp_FPs = spark.createDataFrame(otd_data["FP_data"]) + counted_FPs = ( + sp_FPs.groupBy(id_a, id_b) + .agg( + count("*").alias("count"), + mean("probability").alias("mean_probability"), + ) + .filter("count > 1") + .orderBy(["count", f"{id_a}", f"{id_b}"]) + ) + counted_FPs.write.mode("overwrite").saveAsTable(f"{table_prefix}repeat_FPs") + print( + f"A table of false positives of length {counted_FPs.count()} was saved as '{table_prefix}repeat_FPs' for analysis." + ) + else: + print("There were no false positives recorded.") + if not otd_data["FN_data"].empty: + sp_FNs = spark.createDataFrame(otd_data["FN_data"]) + counted_FNs = ( + sp_FNs.groupBy(id_a, id_b) + .agg( + count("*").alias("count"), + mean("probability").alias("mean_probability"), + ) + .filter("count > 1") + .orderBy(["count", f"{id_a}", f"{id_b}"]) + ) + counted_FNs.write.mode("overwrite").saveAsTable(f"{table_prefix}repeat_FNs") + print( + f"A table of false negatives of length {counted_FNs.count()} was saved as '{table_prefix}repeat_FNs' for analysis." 
+            )
+        else:
+            print("There were no false negatives recorded.")
+
+    def _create_otd_data(self, id_a, id_b):
+        """Output suspicious training data (OTD): check the config to see whether the step should collect suspicious training data that the models routinely misclassify."""
+        training_conf = str(self.task.training_conf)
+        config = self.task.link_run.config
+
+        if (
+            "output_suspicious_TD" in config[training_conf]
+            and config[training_conf]["output_suspicious_TD"]
+        ):
+            return {
+                "FP_data": pd.DataFrame(),
+                "FN_data": pd.DataFrame(),
+                "id_a": id_a,
+                "id_b": id_b,
+            }
+        else:
+            return None
+
+
+def _calc_mcc(TP, TN, FP, FN):
+    if (math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))) != 0:
+        mcc = ((TP * TN) - (FP * FN)) / (
+            math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+        )
+    else:
+        mcc = 0
+    return mcc
+
+
+def _calc_threshold_matrix(alpha_threshold, threshold_ratio):
+    if alpha_threshold and type(alpha_threshold) != list:
+        alpha_threshold = [alpha_threshold]
+
+    if threshold_ratio and type(threshold_ratio) != list:
+        threshold_ratio = [threshold_ratio]
+
+    if threshold_ratio:
+        threshold_matrix = [[a, b] for a in alpha_threshold for b in threshold_ratio]
+    else:
+        threshold_matrix = [[a, np.nan] for a in alpha_threshold]
+
+    return threshold_matrix
+
+
+def _get_probability_and_select_pred_columns(
+    pred_df, model, post_transformer, id_a, id_b, dep_var
+):
+    all_prediction_cols = set(
+        [
+            f"{id_a}",
+            f"{id_b}",
+            dep_var,
+            "probability",
+            "probability_array",
+            "prediction",
+        ]
+    )
+    transformed_df = model.transform(pred_df)
+    post_transform_df = post_transformer.transform(transformed_df)
+    required_col_df = post_transform_df.select(
+        list(all_prediction_cols & set(post_transform_df.columns))
+    )
+    return required_col_df
+
+
+def _get_confusion_matrix(predictions, dep_var, otd_data):
+    TP = predictions.filter((predictions[dep_var] == 1) & (predictions.prediction == 1))
+    TP_count = TP.count()
+
+    FP = predictions.filter((predictions[dep_var] == 0) & (predictions.prediction == 1))
+    FP_count = FP.count()
+
+    FN = predictions.filter((predictions[dep_var] == 1) & (predictions.prediction == 0))
+    FN_count = FN.count()
+
+    TN = predictions.filter((predictions[dep_var] == 0) & (predictions.prediction == 0))
+    TN_count = TN.count()
+
+    if otd_data:
+        new_FP_data = FP.select(
+            otd_data["id_a"], otd_data["id_b"], dep_var, "prediction", "probability"
+        ).toPandas()
+        otd_data["FP_data"] = otd_data["FP_data"].append(new_FP_data)
+
+        new_FN_data = FN.select(
+            otd_data["id_a"], otd_data["id_b"], dep_var, "prediction", "probability"
+        ).toPandas()
+        otd_data["FN_data"] = otd_data["FN_data"].append(new_FN_data)
+    return TP_count, FP_count, FN_count, TN_count
+
+
+def _get_aggregate_metrics(TP_count, FP_count, FN_count, TN_count):
+    if (TP_count + FP_count) == 0:
+        precision = np.nan
+    else:
+        precision = TP_count / (TP_count + FP_count)
+    if (TP_count + FN_count) == 0:
+        recall = np.nan
+    else:
+        recall = TP_count / (TP_count + FN_count)
+    mcc = _calc_mcc(TP_count, TN_count, FP_count, FN_count)
+    return precision, recall, mcc
+
+
+def _create_results_df():
+    return pd.DataFrame(
+        columns=[
+            "precision_test",
+            "recall_test",
+            "precision_train",
+            "recall_train",
+            "pr_auc",
+            "test_mcc",
+            "train_mcc",
+            "model_id",
+            "alpha_threshold",
+            "threshold_ratio",
+        ]
+    )
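
To make the threshold bookkeeping concrete, here is an illustrative trace (the input values are hypothetical) of the two helpers above:

    _calc_threshold_matrix(0.8, [1.2, 1.3])
    # -> [[0.8, 1.2], [0.8, 1.3]]
    _calc_threshold_matrix([0.5, 0.7], False)
    # -> [[0.5, nan], [0.7, nan]]

    _get_aggregate_metrics(TP_count=80, FP_count=20, FN_count=10, TN_count=890)
    # -> precision = 80 / 100 = 0.8, recall = 80 / 90 ~= 0.889,
    #    mcc = _calc_mcc(80, 890, 20, 10)

+
+
+def _append_results(desc_df, results_df, model_type, params):
+    # run.pop("type")
+    print(results_df)
+    desc_df = desc_df.append(
+        {
+            "model": model_type,
+            "parameters": params,
+            "alpha_threshold":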
results_df["alpha_threshold"][0], + "threshold_ratio": results_df["threshold_ratio"][0], + "precision_test_mean": results_df["precision_test"].mean(), + "precision_test_sd": results_df["precision_test"].std(), + "recall_test_mean": results_df["recall_test"].mean(), + "recall_test_sd": results_df["recall_test"].std(), + "pr_auc_mean": results_df["pr_auc"].mean(), + "pr_auc_sd": results_df["pr_auc"].std(), + "mcc_test_mean": results_df["test_mcc"].mean(), + "mcc_test_sd": results_df["test_mcc"].std(), + "precision_train_mean": results_df["precision_train"].mean(), + "precision_train_sd": results_df["precision_train"].std(), + "recall_train_mean": results_df["recall_train"].mean(), + "recall_train_sd": results_df["recall_train"].std(), + "mcc_train_mean": results_df["train_mcc"].mean(), + "mcc_train_sd": results_df["train_mcc"].std(), + }, + ignore_index=True, + ) + _print_desc_df(desc_df) + return desc_df + + +def _print_desc_df(desc_df): + pd.set_option("display.max_colwidth", -1) + print( + desc_df.drop( + [ + "recall_test_sd", + "recall_train_sd", + "precision_test_sd", + "precision_train_sd", + ], + axis=1, + ).iloc[-1] + ) + print("\n") + + +def _load_desc_df_params(desc_df): + params = [ + "maxDepth", + "numTrees", + "featureSubsetStrategy", + "subsample", + "minInstancesPerNode", + "maxBins", + "class_weight", + "C", + "kernel", + "threshold", + "maxIter", + ] + + load_params = lambda j, param: j.get(param, np.nan) + for param in params: + desc_df[param] = desc_df["parameters"].apply(load_params, args=(param,)) + desc_df["class_weight"] = desc_df["class_weight"].apply( + lambda x: str(x) if pd.notnull(x) else x + ) + desc_df["parameters"] = desc_df["parameters"].apply( + lambda t: str(t) if pd.notnull(t) else t + ) + return desc_df + + +def _create_desc_df(): + return pd.DataFrame( + columns=[ + "model", + "parameters", + "alpha_threshold", + "threshold_ratio", + "precision_test_mean", + "precision_test_sd", + "recall_test_mean", + "recall_test_sd", + "mcc_test_mean", + "mcc_test_sd", + "precision_train_mean", + "precision_train_sd", + "recall_train_mean", + "recall_train_sd", + "pr_auc_mean", + "pr_auc_sd", + "mcc_train_mean", + "mcc_train_sd", + ] + ) diff --git a/hlink/linking/model_exploration/model_exploration.py b/hlink/linking/model_exploration/model_exploration.py new file mode 100644 index 0000000..bd5df6a --- /dev/null +++ b/hlink/linking/model_exploration/model_exploration.py @@ -0,0 +1,26 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from .link_step_ingest_file import LinkStepIngestFile +from .link_step_create_features import LinkStepCreateFeatures +from .link_step_train_test_models import LinkStepTrainTestModels +from .link_step_get_feature_importances import LinkStepGetFeatureImportances + + +class ModelExploration(LinkTask): + def __init__(self, link_run): + super().__init__(link_run, display_name="Model Exploration") + self.training_conf = "training" + self.table_prefix = "model_eval_" + + def get_steps(self): + return [ + LinkStepIngestFile(self), + LinkStepCreateFeatures(self), + LinkStepTrainTestModels(self), + LinkStepGetFeatureImportances(self), + ] diff --git a/hlink/linking/preprocessing/__init__.py b/hlink/linking/preprocessing/__init__.py new file mode 100644 index 0000000..9a58a81 --- /dev/null +++ b/hlink/linking/preprocessing/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .preprocessing import Preprocessing diff --git a/hlink/linking/preprocessing/link_step_prep_dataframes.py b/hlink/linking/preprocessing/link_step_prep_dataframes.py new file mode 100644 index 0000000..327b64b --- /dev/null +++ b/hlink/linking/preprocessing/link_step_prep_dataframes.py @@ -0,0 +1,109 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.functions import col + +import hlink.linking.core.column_mapping as column_mapping_core +import hlink.linking.core.substitutions as substitutions_core +import hlink.linking.core.transforms as transforms_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepPrepDataframes(LinkStep): + def __init__(self, task): + super().__init__( + task, + "prepare dataframes", + input_table_names=["raw_df_a", "raw_df_b"], + output_table_names=["prepped_df_a", "prepped_df_b"], + ) + + def _run(self): + config = self.task.link_run.config + if config.get("mesos", False): + self.task.spark.sql("set spark.sql.shuffle.partitions=4000") + + substitution_columns = config.get("substitution_columns", []) + self.task.run_register_python( + name="prepped_df_a", + func=lambda: self._prep_dataframe( + self.task.spark.table("raw_df_a"), + config["column_mappings"], + substitution_columns, + config["feature_selections"], + True, + config["id_column"], + ), + persist=True, + ) + self.task.run_register_python( + name="prepped_df_b", + func=lambda: self._prep_dataframe( + self.task.spark.table("raw_df_b"), + config["column_mappings"], + substitution_columns, + config["feature_selections"], + False, + config["id_column"], + ), + persist=True, + ) + + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + + # Create a function to correctly map and select the columns from the data frames + def _prep_dataframe( + self, + df, + column_definitions, + substitution_columns, + feature_selections, + is_a, + id_column, + ): + """ + Returns a new dataframe after having selected the given columns out with appropriate + transformations and substitutions. 
+
+        Parameters
+        ----------
+        df: the dataframe to operate on
+        column_definitions: config array of columns to select and the transforms to apply to them
+        substitution_columns: config array of substitutions to apply to the selected columns
+        feature_selections: config array of features to generate from the selected columns
+        is_a: True if `df` comes from datasource A, False if it comes from datasource B
+        id_column: unique id column for a record
+
+        Returns
+        -------
+        A new dataframe with the mappings, substitutions, and transforms applied.
+        """
+        df_selected = df
+        spark = self.task.spark
+        column_selects = [col(id_column)]
+        if column_definitions and isinstance(column_definitions[0], list):
+            print(
+                "DEPRECATION WARNING: The config value 'column_mappings' is no longer a nested (double) array and is now an array of objects. Please change your config for future releases."
+            )
+            flat_column_mappings = [
+                item for sublist in column_definitions for item in sublist
+            ]
+        else:
+            flat_column_mappings = column_definitions
+
+        for column_mapping in flat_column_mappings:
+            df_selected, column_selects = column_mapping_core.select_column_mapping(
+                column_mapping, df_selected, is_a, column_selects
+            )
+
+        df_selected = df_selected.select(column_selects)
+
+        df_selected = substitutions_core.generate_substitutions(
+            spark, df_selected, substitution_columns
+        )
+
+        df_selected = transforms_core.generate_transforms(
+            spark, df_selected, feature_selections, self.task, is_a, id_column
+        )
+        return df_selected
diff --git a/hlink/linking/preprocessing/link_step_register_raw_dfs.py b/hlink/linking/preprocessing/link_step_register_raw_dfs.py
new file mode 100644
index 0000000..d4060e8
--- /dev/null
+++ b/hlink/linking/preprocessing/link_step_register_raw_dfs.py
@@ -0,0 +1,173 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import os.path
+
+from hlink.errors import DataError
+from hlink.linking.link_step import LinkStep
+
+
+def handle_paths(datasource, a_or_b):
+    if "parquet_file" in datasource:
+        path = os.path.realpath(datasource["parquet_file"])
+        file_type = "parquet"
+        return path, file_type
+    elif "file" in datasource:
+        file_str = datasource["file"]
+        filename, file_extension = os.path.splitext(file_str)
+        if file_extension == ".csv" or file_extension == ".parquet":
+            path = os.path.realpath(datasource["file"])
+            file_type = file_extension.strip(".")
+            return path, file_type
+        else:
+            raise ValueError(
+                f"The file given for datasource {a_or_b} must be either a CSV or parquet file. You provided a {file_extension} file."
+            )
+    else:
+        raise ValueError(
+            f"You must specify either a parquet or csv file to be used as datasource {a_or_b}. This should be a property of 'datasource_{a_or_b}' in the config file."
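+            # For reference, a datasource section that satisfies this check
+            # might look like (paths and aliases hypothetical):
+            #   config["datasource_a"] == {"alias": "us1900", "file": "data/us1900.csv"}
+            #   config["datasource_b"] == {"alias": "us1910", "parquet_file": "data/us1910.parquet"}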
+ ) + + +class LinkStepRegisterRawDfs(LinkStep): + def __init__(self, task): + super().__init__( + task, + "register raw dataframes", + input_table_names=[], + output_table_names=["raw_df_a", "raw_df_b"], + ) + + def _run(self): + config = self.task.link_run.config + path_a, file_type_a = handle_paths(config["datasource_a"], "a") + path_b, file_type_b = handle_paths(config["datasource_b"], "b") + + self._load_unpartitioned(file_type_a, "_a", path_a) + self._load_unpartitioned(file_type_b, "_b", path_b) + + self.task.run_register_python( + name="raw_df_a", + func=lambda: self._filter_dataframe(config, "a"), + persist=True, + ) + self.task.run_register_python( + name="raw_df_b", + func=lambda: self._filter_dataframe(config, "b"), + persist=True, + ) + + self._check_for_all_spaces_unrestricted_file("raw_df_a") + self._check_for_all_spaces_unrestricted_file("raw_df_b") + + def _load_unpartitioned(self, file_type, a_or_b, path): + if file_type == "parquet": + self.task.run_register_python( + "raw_df_unpartitioned" + a_or_b, + lambda: self.task.spark.read.parquet(path), + ) + elif file_type == "csv": + self.task.run_register_python( + "raw_df_unpartitioned" + a_or_b, + lambda: self.task.spark.read.csv(path, header=True, inferSchema=True), + ) + else: + raise ValueError( + f"{file_type} is not a valid file type for this operation." + ) + + def _filter_dataframe(self, config, a_or_b): + spark = self.task.spark + table_name = f"raw_df_unpartitioned_{a_or_b}" + filtered_df = spark.table(table_name) + if "filter" in config: + for dataset_filter in config["filter"]: + if "expression" in dataset_filter: + filter_expression = dataset_filter["expression"] + if ( + "datasource" not in dataset_filter + or dataset_filter["datasource"] == a_or_b + ): + if ( + "household" in dataset_filter + and dataset_filter["household"] + ): + serial_a = dataset_filter["serial_a"] + serial_b = dataset_filter["serial_b"] + ser = serial_a if a_or_b == "a" else serial_b + serials_df = ( + filtered_df.filter(filter_expression) + .select(ser) + .distinct() + ) + filtered_df = filtered_df.join(serials_df, on=[ser]) + else: + filtered_df = filtered_df.filter(filter_expression) + elif "training_data_subset" in dataset_filter: + if dataset_filter["training_data_subset"]: + if ( + "datasource" not in dataset_filter + or dataset_filter["datasource"] == a_or_b + ): + if "training_data" not in str( + self.task.spark.catalog.listTables() + ): + self.task.run_register_python( + "training_data", + lambda: self.task.spark.read.csv( + config["training"]["dataset"], + header=True, + inferSchema=True, + ), + persist=True, + ) + filtered_df.createOrReplaceTempView("temp_filtered_df") + filtered_df = self.task.run_register_sql( + name=None, + template="training_data_subset", + t_ctx={ + "table_name": "temp_filtered_df", + "a_or_b": a_or_b, + "id": config["id_column"], + }, + ) + spark.catalog.dropTempView("temp_filtered_df") + else: + pass + else: + pass + else: + raise ValueError(f"Invalid filter: {dataset_filter}") + return filtered_df + + def _check_for_all_spaces_unrestricted_file(self, df_name): + df = self.task.spark.table(df_name) + col_types = dict(df.dtypes) + string_cols = [name for name, type in col_types.items() if type == "string"] + + space_columns = [] + + df_len = df.count() + + for column_name in string_cols: + + if ("name" in str.lower(column_name)) or ( + "street" in str.lower(column_name) + ): + if ( + self.task.spark.sql( + f"SELECT count(*) from {df_name} where {column_name} rlike '^ +$'" + ).first()[0] + == df_len + ): + 
space_columns.append(column_name) + + if space_columns: + col_names = ", ".join(space_columns) + raise DataError( + f"The following columns in the {df_name} table contain data which consist of all spaces, as exported in unrestricted data files: {col_names}.\nPlease point to data files with restricted versions of the data in your configuration file." + ) + else: + pass diff --git a/hlink/linking/preprocessing/preprocessing.py b/hlink/linking/preprocessing/preprocessing.py new file mode 100644 index 0000000..288d6ad --- /dev/null +++ b/hlink/linking/preprocessing/preprocessing.py @@ -0,0 +1,14 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from .link_step_register_raw_dfs import LinkStepRegisterRawDfs +from .link_step_prep_dataframes import LinkStepPrepDataframes + + +class Preprocessing(LinkTask): + def get_steps(self): + return [LinkStepRegisterRawDfs(self), LinkStepPrepDataframes(self)] diff --git a/hlink/linking/preprocessing/templates/attach_family_col.sql b/hlink/linking/preprocessing/templates/attach_family_col.sql new file mode 100644 index 0000000..6254ed9 --- /dev/null +++ b/hlink/linking/preprocessing/templates/attach_family_col.sql @@ -0,0 +1,8 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT pd.*, pd_fam.{{other_col}} as {{output_col}} +FROM {{prepped_df}} pd +LEFT JOIN {{prepped_df}} pd_fam ON pd_fam.{{family_id}} = pd.{{family_id}} AND pd_fam.{{person_id}} = pd.{{person_pointer}} diff --git a/hlink/linking/preprocessing/templates/attach_neighbor_col.sql b/hlink/linking/preprocessing/templates/attach_neighbor_col.sql new file mode 100644 index 0000000..516ddb7 --- /dev/null +++ b/hlink/linking/preprocessing/templates/attach_neighbor_col.sql @@ -0,0 +1,8 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT p.*, hh.neighbor_names as {{output_column}} +FROM prepped_df_tmp p +LEFT JOIN hh_nbor hh ON p.{{sort_column}} = hh.serial diff --git a/hlink/linking/preprocessing/templates/attach_related_col.sql b/hlink/linking/preprocessing/templates/attach_related_col.sql new file mode 100644 index 0000000..f3a026c --- /dev/null +++ b/hlink/linking/preprocessing/templates/attach_related_col.sql @@ -0,0 +1,13 @@ +{# This file is part of the ISRDI's hlink. 
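+
+Note: for each person, this template collects the input_col values of other
+members of the same family (same family_id, different id) whose relate code
+falls between bottom_code and top_code, and attaches them as a set.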
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT pd.*, pd_rel.{{output_col}} +FROM {{prepped_df}} pd +LEFT JOIN ( + SELECT pd.{{id}}, collect_set(nvl(pd_fam.{{input_col}}, NULL)) as {{output_col}} + FROM {{prepped_df}} pd + LEFT JOIN {{prepped_df}} pd_fam ON pd_fam.{{family_id}} = pd.{{family_id}} AND pd_fam.{{id}} != pd.{{id}} AND pd_fam.{{relate_col}} <= {{top_code}} AND pd_fam.{{relate_col}} >= {{bottom_code}} + GROUP BY pd.{{id}} +) pd_rel ON pd.{{id}} = pd_rel.{{id}} diff --git a/hlink/linking/preprocessing/templates/attach_related_cols_as_rows.sql b/hlink/linking/preprocessing/templates/attach_related_cols_as_rows.sql new file mode 100644 index 0000000..25b2c57 --- /dev/null +++ b/hlink/linking/preprocessing/templates/attach_related_cols_as_rows.sql @@ -0,0 +1,25 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT pd.*, pd_rel.{{output_col}} +FROM {{prepped_df}} pd +LEFT JOIN ( + SELECT pd.{{id}}, collect_set( + if(pd_fam.{{input_cols[0]}} IS NOT NULL, + named_struct( + {% for c in input_cols %} + '{{c}}', pd_fam.{{c}} {% if not loop.last %}, {% endif%} + {% endfor %} + ), NULL) + ) as {{output_col}} + FROM {{prepped_df}} pd + LEFT JOIN ( + SELECT * + FROM {{prepped_df}} pd_fam + WHERE pd_fam.{{relate_col}} <= {{top_code}} AND pd_fam.{{relate_col}} >= {{bottom_code}} + {% if filter %} AND {{filter}} {% endif %} + ) pd_fam ON pd_fam.{{family_id}} = pd.{{family_id}} AND pd_fam.{{id}} != pd.{{id}} + GROUP BY pd.{{id}} +) pd_rel ON pd.{{id}} = pd_rel.{{id}} diff --git a/hlink/linking/preprocessing/templates/hh_nbor.sql b/hlink/linking/preprocessing/templates/hh_nbor.sql new file mode 100644 index 0000000..c2c8b3d --- /dev/null +++ b/hlink/linking/preprocessing/templates/hh_nbor.sql @@ -0,0 +1,11 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT hh.serial, collect_list(hh2.hh_name) as neighbor_names +FROM hh_nbor_rank hh +LEFT JOIN hh_nbor_rank hh2 ON hh2.neighborhood = hh.neighborhood + AND hh2.serial != hh.serial + AND abs(hh2.num - hh.num) <= {{range}} +GROUP BY hh.serial diff --git a/hlink/linking/preprocessing/templates/hh_nbor_rank.sql b/hlink/linking/preprocessing/templates/hh_nbor_rank.sql new file mode 100644 index 0000000..16c213a --- /dev/null +++ b/hlink/linking/preprocessing/templates/hh_nbor_rank.sql @@ -0,0 +1,16 @@ +{# This file is part of the ISRDI's hlink. 
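+
+Note: hh_nbor_rank (below) selects one representative row per household (the
+first person by PERNUM) and assigns each household a sequential position (num)
+within its neighborhood; hh_nbor (above) then collects the names of households
+at most the configured range of positions away as neighbor_names.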
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT hh.serial, p_num.hh_name, p_num.neighborhood, row_number() OVER (PARTITION BY p_num.neighborhood ORDER BY p_num.serial) as num +FROM ( + SELECT {{sort_column}} as serial + FROM prepped_df_tmp + GROUP BY {{sort_column}} +) hh +LEFT JOIN ( + SELECT p.{{sort_column}} as serial, p.{{input_column}} as hh_name, p.{{neighborhood_column}} as neighborhood, row_number() OVER (PARTITION BY p.{{sort_column}} ORDER BY p.PERNUM) as num + FROM prepped_df_tmp p +) p_num ON p_num.num = 1 AND p_num.serial = hh.serial + diff --git a/hlink/linking/preprocessing/templates/training_data_subset.sql b/hlink/linking/preprocessing/templates/training_data_subset.sql new file mode 100644 index 0000000..b862546 --- /dev/null +++ b/hlink/linking/preprocessing/templates/training_data_subset.sql @@ -0,0 +1,12 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT /*+ BROADCAST(s) */ t.* +FROM {{table_name}} t +JOIN ( + SELECT DISTINCT t.serialp + FROM {{table_name}} t + JOIN training_data td ON t.{{id}} = td.{{id}}_{{a_or_b}} +) s ON s.serialp = t.serialp diff --git a/hlink/linking/reporting/__init__.py b/hlink/linking/reporting/__init__.py new file mode 100644 index 0000000..9429241 --- /dev/null +++ b/hlink/linking/reporting/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .reporting import Reporting diff --git a/hlink/linking/reporting/link_step_export_crosswalk.py b/hlink/linking/reporting/link_step_export_crosswalk.py new file mode 100644 index 0000000..1ff06d0 --- /dev/null +++ b/hlink/linking/reporting/link_step_export_crosswalk.py @@ -0,0 +1,75 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink +import pyspark.sql.functions as f +import os + +from hlink.linking.link_step import LinkStep + + +class LinkStepExportCrosswalk(LinkStep): + def __init__(self, task): + super().__init__( + task, + "export crosswalk", + input_table_names=[ + "raw_df_a", + "raw_df_b", + "predicted_matches", + "hh_predicted_matches", + ], + output_table_names=[], + ) + + def _run(self): + """ Pull in key demographic data for linked individuals and export a fixed-width crosswalk file. 
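+
+        The crosswalk is currently written as CSV; the TODO at the end of
+        this step asks for fixed-width output instead. A minimal sketch of
+        one way to do that with pandas (hypothetical helper; widths come
+        from the comment below and would apply to both the _a and _b
+        variant of each column):
+
+            def write_fixed_width(pandas_df, widths, path):
+                # Pad each value to its column width, then concatenate the
+                # padded columns into one fixed-width record per row.
+                padded = [
+                    pandas_df[col].astype(str).str.ljust(width)
+                    for col, width in widths.items()
+                ]
+                with open(path, "w") as out:
+                    for values in zip(*padded):
+                        print("".join(values), file=out)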
""" + config = self.task.link_run.config + + pm = ( + self.task.spark.table("predicted_matches") + .select("histid_a", "histid_b") + .withColumn("linked_round", f.lit(1)) + ) + hhpm = ( + self.task.spark.table("hh_predicted_matches") + .select("histid_a", "histid_b") + .withColumn("linked_round", f.lit(2)) + ) + pm.unionByName(hhpm).write.mode("overwrite").saveAsTable( + "all_predicted_matches" + ) + + raw_cols = ["histid", "serialp", "pernum", "age", "sex", "statefip", "bpl"] + + raw_cols_sql = "select " + raw_cols_sql += ", ".join([f"raw_a.{col} as {col}_a" for col in raw_cols]) + raw_cols_sql += ", " + raw_cols_sql += ", ".join([f"raw_b.{col} as {col}_b" for col in raw_cols]) + raw_cols_sql += "from all_predicted_matches left join raw_df_a raw_a on histid_a left join raw_df_b raw_b on histid_b" + + joined_predictions_with_demog = self.task.spark.sql(raw_cols_sql) + joined_predictions_with_demog.write.mode("overwrite").saveAsTable( + "joined_predictions" + ) + jp = self.task.spark.table("joined_predictions") + + year_a = config["datasource_a"]["alias"] + year_b = config["datasource_b"]["alias"] + + this_path = os.path.dirname(__file__) + reports_path = os.path.join(this_path, "../../../output_data") + folder_path = os.path.join(this_path, "../../../output_data/crosswalks") + csv_path = os.path.join( + this_path, + f"../../../output_data/crosswalks/{year_a}_{year_b}_predicted_matches_crosswalk.csv", + ) + if not os.path.exists(reports_path): + os.mkdir(reports_path) + if not os.path.exists(folder_path): + os.mkdir(folder_path) + + jp.toPandas().to_csv(csv_path) + + # TODO: generate crosswalk output as fixed width instead of CSV (modify code from NHGIS) + # column widths = {"histid": 36, "serialp": 8, "pernum": 4, "age": 3, "sex": 1, "statefip": 2, "bpl": 5} diff --git a/hlink/linking/reporting/link_step_report_r2_percent_linked.py b/hlink/linking/reporting/link_step_report_r2_percent_linked.py new file mode 100644 index 0000000..b0df12b --- /dev/null +++ b/hlink/linking/reporting/link_step_report_r2_percent_linked.py @@ -0,0 +1,85 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pyspark.sql.functions as f +from pyspark.sql.window import Window + +from hlink.linking.link_step import LinkStep + + +class LinkStepReportR2PercentLinked(LinkStep): + def __init__(self, task): + super().__init__( + task, + "report round 2 percent linked", + input_table_names=[ + "prepped_df_a", + "predicted_matches", + "hh_predicted_matches", + ], + output_table_names=[], + ) + + def _run(self): + """ For households with anyone linked in round 1, report percent of remaining household members linked in round 2. 
""" + + if self.task.link_run.config.get("mesos", False): + self.task.spark.sql("set spark.sql.shuffle.partitions=4000") + + pdfa = self.task.spark.table("prepped_df_a").select("serialp", "histid") + pm = ( + self.task.spark.table("predicted_matches") + .select("histid_a") + .withColumn("linked_round", f.lit(1)) + ) + hhpm = ( + self.task.spark.table("hh_predicted_matches") + .select("histid_a") + .withColumn("linked_round", f.lit(2)) + ) + linked_rnds = ( + pdfa.join(pm, pdfa["histid"] == pm["histid_a"], "left") + .drop("histid_a") + .join(hhpm, pdfa["histid"] == hhpm["histid_a"], "left") + .drop("histid_a") + .select( + "serialp", + "histid", + f.when(~f.isnull(pm["linked_round"]), pm["linked_round"]) + .otherwise(hhpm["linked_round"]) + .alias("linked_round"), + ) + .fillna(0) + ) + + linked_rnds.cache().createOrReplaceTempView("linked_rounds") + + window = Window.partitionBy(linked_rnds["serialp"]) + df = linked_rnds.withColumn("histid_ct_total", f.count("serialp").over(window)) + df0 = df.withColumn( + "R1count", f.count(f.when(f.col("linked_round") == 1, True)).over(window) + ) + df1 = df0.withColumn( + "R2count", f.count(f.when(f.col("linked_round") == 2, True)).over(window) + ) + + dfu = df1.select("serialp", "histid_ct_total", "R1count", "R2count").distinct() + df2 = dfu.withColumn("R1_pct", dfu["R1count"] / dfu["histid_ct_total"]) + df3 = df2.withColumn( + "R2_pct", df2["R2count"] / (df2["histid_ct_total"] - df2["R1count"]) + ) + + df3.cache().createOrReplaceTempView("counted_links") + + print( + "Round 1 match rate: " + + str(df3.agg({"R1_pct": "avg"}).collect()[0]["avg(R1_pct)"]) + ) + print( + "Round 2 match rate of remaining HH members: " + + str(df3.agg({"R2_pct": "avg"}).collect()[0]["avg(R2_pct)"]) + ) + + self.task.spark.sql("set spark.sql.shuffle.partitions=200") diff --git a/hlink/linking/reporting/link_step_report_representivity.py b/hlink/linking/reporting/link_step_report_representivity.py new file mode 100644 index 0000000..f89c3ef --- /dev/null +++ b/hlink/linking/reporting/link_step_report_representivity.py @@ -0,0 +1,602 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import csv +import os +from timeit import default_timer as timer +import pyspark.sql.functions as f +from pyspark.sql.window import Window + +from hlink.linking.link_step import LinkStep + + +class LinkStepReportRepresentivity(LinkStep): + def __init__(self, task): + super().__init__( + task, + "report representivity", + input_table_names=[ + "raw_df_a", + "raw_df_b", + "prepped_df_a", + "prepped_df_b", + "predicted_matches", + "hh_predicted_matches", + ], + output_table_names=[], + ) + + def _run(self): + """ Report on representivity of linked data compared to source populations for 1900, 1910, linked round 1, and linked round 2. 
""" + spark = self.task.spark + config = self.task.link_run.config + + # check to make sure columns are in data + raw_cols_wishlist = { + "histid", + "serialp", + "sex", + "age", + "marst", + "durmarr", + "sei", + } + raw_a_cols_present = set([x.lower() for x in spark.table("raw_df_a").columns]) + raw_b_cols_present = set([x.lower() for x in spark.table("raw_df_b").columns]) + raw_cols = list(raw_cols_wishlist & raw_a_cols_present & raw_b_cols_present) + + prepped_cols_wishlist = { + "histid", + "race_div_100", + "relate_div_100", + "region", + "bpl_clean", + "namefrst_unstd", + "namefrst_std", + "namelast_clean", + "statefip", + } + pdfa_cols_present = set( + [x.lower() for x in spark.table("prepped_df_a").columns] + ) + pdfb_cols_present = set( + [x.lower() for x in spark.table("prepped_df_b").columns] + ) + prepped_cols = list( + prepped_cols_wishlist & pdfa_cols_present & pdfb_cols_present + ) + + rdfa = spark.table("raw_df_a").select(raw_cols) + rdfb = spark.table("raw_df_b").select(raw_cols) + pdfa = spark.table("prepped_df_a").select(prepped_cols) + pdfb = spark.table("prepped_df_b").select(prepped_cols) + pm = ( + spark.table("predicted_matches") + .select("histid_a", "histid_b") + .withColumn("linked_round", f.lit(1)) + ) + hhpm = ( + spark.table("hh_predicted_matches") + .select("histid_a", "histid_b") + .withColumn("linked_round", f.lit(2)) + ) + + source_data_a = ( + rdfa.join(pm, rdfa["histid"] == pm["histid_a"], "left") + .drop("histid_a") + .join(hhpm, rdfa["histid"] == hhpm["histid_a"], "left") + .drop("histid_a") + .select( + "*", + f.when(~f.isnull(pm["linked_round"]), pm["linked_round"]) + .when(~f.isnull(hhpm["linked_round"]), hhpm["linked_round"]) + .otherwise(0) + .alias("linked_round_all"), + ) + .drop("linked_round", "histid_b") + .join(pdfa, "histid", "left") + ) + + source_data_b = ( + rdfb.join(pm, rdfb["histid"] == pm["histid_b"], "left") + .drop("histid_b") + .join(hhpm, rdfb["histid"] == hhpm["histid_b"], "left") + .drop("histid_b") + .select( + "*", + f.when(~f.isnull(pm["linked_round"]), pm["linked_round"]) + .when(~f.isnull(hhpm["linked_round"]), hhpm["linked_round"]) + .otherwise(0) + .alias("linked_round_all"), + ) + .drop("linked_round", "histid_a") + .join(pdfb, "histid", "left") + ) + + source_data_a.createOrReplaceTempView("source_data_a_pre0") + source_data_b.createOrReplaceTempView("source_data_b_pre0") + + sda_ct = source_data_a.count() + sdb_ct = source_data_b.count() + sda_r1_ct = source_data_a.filter("linked_round_all = 1").count() + sda_r2_ct = source_data_a.filter("linked_round_all = 2").count() + sdb_r1_ct = source_data_b.filter("linked_round_all = 1").count() + sdb_r2_ct = source_data_b.filter("linked_round_all = 2").count() + + # Add the region of residence col + input_col = "statefip" # join key in core data + output_col = "region_of_residence" + col_to_join_on = "bpl" + col_to_add = "region" + this_path = os.path.dirname(__file__) + region_dict = os.path.join(this_path, "../../tests/input_data/region.csv") + null_filler = 99 + col_type = "integer" + + # open up csv file + self.task.run_register_python( + name="region_data", + func=lambda: spark.read.csv(region_dict, header=True, inferSchema=True), + persist=True, + ) + + # join the csv file to the dataframe (df_selected) + source_data_a = self.task.run_register_sql( + "source_data_a_pre1", + template="attach_variable", + t_ctx={ + "input_col": input_col, + "output_col": output_col, + "prepped_df": "source_data_a_pre0", + "col_to_join_on": col_to_join_on, + "col_to_add": col_to_add, + 
"region_data": "region_data", + }, + overwrite_preexisting_tables=True, + persist=True, + ) + source_data_b = self.task.run_register_sql( + "source_data_b_pre1", + template="attach_variable", + t_ctx={ + "input_col": input_col, + "output_col": output_col, + "prepped_df": "source_data_b_pre0", + "col_to_join_on": col_to_join_on, + "col_to_add": col_to_add, + "region_data": "region_data", + }, + overwrite_preexisting_tables=True, + persist=True, + ) + source_data_a = source_data_a.fillna(null_filler, subset=[output_col]) + source_data_b = source_data_b.fillna(null_filler, subset=[output_col]) + + source_data_a.withColumn( + output_col, source_data_a[output_col].cast(col_type) + ).write.mode("overwrite").saveAsTable("source_data_a") + source_data_b.withColumn( + output_col, source_data_b[output_col].cast(col_type) + ).write.mode("overwrite").saveAsTable("source_data_b") + + source_data_a = spark.table("source_data_a") + source_data_b = spark.table("source_data_b") + + reports_path = os.path.join(this_path, "../../../output_data") + folder_path = os.path.join(this_path, "../../../output_data/reports") + csv_path = os.path.join( + this_path, "../../../output_data/reports/representivity.csv" + ) + if not os.path.exists(reports_path): + os.mkdir(reports_path) + if not os.path.exists(folder_path): + os.mkdir(folder_path) + + alias_source_a = config["datasource_a"]["alias"] + alias_source_b = config["datasource_b"]["alias"] + + with open(csv_path, "w", newline="") as csvfile: + csvwriter = csv.writer(csvfile) + header = [ + "feature", + "values", + f"{alias_source_a} all count", + f"{alias_source_a} all percent", + f"{alias_source_a} round 1 count", + f"{alias_source_a} round 1 percent", + f"{alias_source_a} round 2 count", + f"{alias_source_a} round 2 percent", + f"{alias_source_b} all count", + f"{alias_source_b} all percent", + f"{alias_source_b} round 1 count", + f"{alias_source_b} round 1 percent", + f"{alias_source_b} round 2 count", + f"{alias_source_b} round 2 percent", + ] + csvwriter.writerow(header) + csvwriter.writerow( + [ + "Total count", + "", + sda_ct, + "", + sda_r1_ct, + "", + sda_r2_ct, + "", + sdb_ct, + "", + sdb_r1_ct, + "", + sdb_r2_ct, + "", + ] + ) + csvwriter.writerow([]) + + def _groupby_cascade( + feature, + label=False, + comp_type="other", + interval=False, + dni=False, + gt_threshold=False, + lt_threshold=False, + second_feature=False, + groupby_feat=False, + csvwriter=csvwriter, + source_data_a=source_data_a, + source_data_b=source_data_b, + sda_ct=sda_ct, + sda_r1_ct=sda_r1_ct, + sda_r2_ct=sda_r2_ct, + sdb_ct=sdb_ct, + sdb_r1_ct=sdb_r1_ct, + sdb_r2_ct=sdb_r2_ct, + ): + start = timer() + + def _get_withColumn(sd): + if comp_type == "keep_low_by_group": + with_col = "rb" + sd = sd.withColumn( + with_col, + f.when( + sd[feature].cast("int") >= lt_threshold, "null" + ).otherwise( + sd[groupby_feat].cast("int") + - (sd[groupby_feat].cast("int") % interval) + ), + ) + + elif comp_type == "keep_high_by_group": + with_col = "rb" + if dni: + sd = sd.withColumn( + with_col, + f.when( + (sd[feature].cast("int") > gt_threshold) + & (sd[feature].cast("int") != dni), + sd[groupby_feat].cast("int") + - (sd[groupby_feat].cast("int") % interval), + ).otherwise("null"), + ) + + else: + sd = sd.withColumn( + with_col, + f.when( + sd[feature].cast("int") > gt_threshold, + sd[groupby_feat].cast("int") + - (sd[groupby_feat].cast("int") % interval), + ).otherwise("null"), + ) + + elif comp_type == "not_equals": + with_col = "rb" + sd = sd.withColumn( + with_col, + ( + 
sd[feature].cast("int") + != sd[second_feature].cast("int") + ).cast("int"), + ) + + elif comp_type == "not_equals_by_group": + with_col = "rb" + sd = sd.withColumn( + with_col, + f.when( + ( + sd[feature].cast("int") + != sd[second_feature].cast("int") + ), + sd[groupby_feat].cast("int") + - (sd[groupby_feat].cast("int") % interval), + ).otherwise("null"), + ) + + elif comp_type == "groupby_then_bucketize_name_count": + with_col = "range" + w = Window.partitionBy(feature) + sd = sd.withColumn("n", f.count(feature).over(w)) + sd = sd.withColumn( + with_col, + f.when(sd["n"] < 6, "1-5") + .when((sd["n"] >= 6) & (sd["n"] < 21), "6-20") + .when((sd["n"] >= 21) & (sd["n"] < 61), "21-60") + .otherwise("61-"), + ) + + elif interval and not gt_threshold: + with_col = "range" + sd = sd.withColumn( + with_col, + sd[feature].cast("int") + - (sd[feature].cast("int") % interval), + ) + + elif gt_threshold and not interval: + with_col = "bool" + if dni: + sd = sd.withColumn( + with_col, + f.when( + sd[feature] != dni, + (sd[feature].cast("int") > gt_threshold).cast( + "int" + ), + ).otherwise(0), + ) + + else: + sd = sd.withColumn( + with_col, + (sd[feature].cast("int") > gt_threshold).cast("int"), + ) + + elif lt_threshold and not interval: + with_col = "bool" + if dni: + sd = sd.withColumn( + with_col, + f.when( + sd[feature] != dni, + (sd[feature].cast("int") < lt_threshold).cast( + "int" + ), + ).otherwise(0), + ) + + else: + sd = sd.withColumn( + with_col, + (sd[feature].cast("int") < lt_threshold).cast("int"), + ) + + else: + with_col = feature + return sd, with_col + + if feature in source_data_a.columns: + source_data_a, with_col = _get_withColumn(source_data_a) + source_data_b, with_col = _get_withColumn(source_data_b) + + data_sources = [ + source_data_a, + source_data_a.filter("linked_round_all = 1"), + source_data_a.filter("linked_round_all = 2"), + source_data_b, + source_data_b.filter("linked_round_all = 1"), + source_data_b.filter("linked_round_all = 2"), + ] + data_outputs = ["all_a", "r1_a", "r2_a", "all_b", "r1_b", "r2_b"] + data = {} + + for ds, do in zip(data_sources, data_outputs): + data[do] = _get_dict_from_window_rows( + ds.groupby(with_col).count().collect(), with_col + ) + rows = {} + if label: + lb = label + else: + lb = feature + keys = sorted(set(data["all_a"].keys()) | set(data["all_b"].keys())) + dfs = ["r1_a", "r2_a", "all_b", "r1_b", "r2_b"] + cts = [sda_r1_ct, sda_r2_ct, sdb_ct, sdb_r1_ct, sdb_r2_ct] + for key in keys: + rows[key] = [ + lb, + key, + data["all_a"].get(key, 0), + data["all_a"].get(key, 0) / sda_ct, + ] + for df, ct in zip(dfs, cts): + for key in keys: + rows[key] = rows[key] + [ + data[df].get(key, 0), + data[df].get(key, 0) / ct, + ] + csvwriter.writerows(rows.values()) + csvwriter.writerow([]) + end = timer() + elapsed_time = round(end - start, 2) + print(f"Finished generating {lb}: {elapsed_time}s") + else: + print(f"Not comparing {feature}: not present in source data.") + + def _serialp_children_over_10_window( + label, + csvwriter=csvwriter, + source_data_a=source_data_a, + source_data_b=source_data_b, + sda_ct=sda_ct, + sda_r1_ct=sda_r1_ct, + sda_r2_ct=sda_r2_ct, + sdb_ct=sdb_ct, + sdb_r1_ct=sdb_r1_ct, + sdb_r2_ct=sdb_r2_ct, + ): + start = timer() + + # use window to get count of persons in serialp household who have age > 10 and relate code 3XX for data a and b + window = Window.partitionBy("serialp") + + source_data_a = source_data_a.withColumn( + "count_of_children_over_10", + f.sum( + ( + (source_data_a["relate_div_100"].cast("int") == 3) + & 
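+                        # relate_div_100 == 3 covers the 3XX "child of head"
+                        # codes; only count children who are older than 10.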
(source_data_a["age"].cast("int") > 10) + ).cast("int") + ).over(window), + ) + source_data_b = source_data_b.withColumn( + "count_of_children_over_10", + f.sum( + ( + (source_data_b["relate_div_100"].cast("int") == 3) + & (source_data_b["age"].cast("int") > 10) + ).cast("int") + ).over(window), + ) + + # bucketize by count + coc = "count_of_children_over_10" + source_data_a = source_data_a.withColumn( + "child_count_bucketized", + f.when(source_data_a[coc] == 0, "0") + .when((source_data_a[coc] >= 1) & (source_data_a[coc] < 3), "1-2") + .when((source_data_a[coc] >= 3) & (source_data_a[coc] < 6), "3-5") + .otherwise("6-"), + ) + source_data_b = source_data_b.withColumn( + "child_count_bucketized", + f.when(source_data_b[coc] == 0, "0") + .when((source_data_b[coc] >= 1) & (source_data_b[coc] < 3), "1-2") + .when((source_data_b[coc] >= 3) & (source_data_b[coc] < 6), "3-5") + .otherwise("6-"), + ) + + # for each datasource, get counts and percentages and write to CSV + data_sources = [ + source_data_a, + source_data_a.filter("linked_round_all = 1"), + source_data_a.filter("linked_round_all = 2"), + source_data_b, + source_data_b.filter("linked_round_all = 1"), + source_data_b.filter("linked_round_all = 2"), + ] + + data_outputs = ["all_a", "r1_a", "r2_a", "all_b", "r1_b", "r2_b"] + data = {} + + for ds, do in zip(data_sources, data_outputs): + data[do] = _get_dict_from_window_rows( + ds.groupby("child_count_bucketized").count().collect(), + "child_count_bucketized", + ) + rows = {} + + keys = sorted(set(data["all_a"].keys()) | set(data["all_b"].keys())) + dfs = ["r1_a", "r2_a", "all_b", "r1_b", "r2_b"] + cts = [sda_r1_ct, sda_r2_ct, sdb_ct, sdb_r1_ct, sdb_r2_ct] + for key in keys: + rows[key] = [ + label, + key, + data["all_a"].get(key, 0), + data["all_a"].get(key, 0) / sda_ct, + ] + for df, ct in zip(dfs, cts): + for key in keys: + rows[key] = rows[key] + [ + data[df].get(key, 0), + data[df].get(key, 0) / ct, + ] + csvwriter.writerows(rows.values()) + csvwriter.writerow([]) + end = timer() + elapsed_time = round(end - start, 2) + print(f"Finished generating {label}: {elapsed_time}s") + + _serialp_children_over_10_window( + label="presence of children over the age of 10 in the household" + ) + + # TODO: bucketize specific codes to text + # TODO: window of household, are children over the age of 10 present + + _groupby_cascade(feature="sex") + _groupby_cascade(feature="age", interval=10, label="age") + _groupby_cascade(feature="race_div_100", label="race") + _groupby_cascade( + feature="relate_div_100", label="relationship to household head" + ) + _groupby_cascade(feature="marst") + _groupby_cascade(feature="marst", lt_threshold=3, label="married") + _groupby_cascade( + feature="marst", + comp_type="keep_low_by_group", + lt_threshold=3, + groupby_feat="age", + interval=10, + label="married, by age", + ) + _groupby_cascade( + feature="durmarr", + gt_threshold=9, + dni=99, + label="marriage duration at least ten years", + ) + _groupby_cascade( + feature="durmarr", + comp_type="keep_high_by_group", + gt_threshold=9, + groupby_feat="age", + interval=10, + label="marriage duration at least 10 years, by age", + dni=99, + ) + _groupby_cascade(feature="region_of_residence", label="region of residence") + _groupby_cascade(feature="region", label="region of birth") + _groupby_cascade(feature="sei", interval=15, label="socioeconomic status") + _groupby_cascade( + feature="bpl_clean", + second_feature="statefip", + label="lifetime migrant", + comp_type="not_equals", + ) + _groupby_cascade( + 
feature="bpl_clean", + second_feature="statefip", + groupby_feat="age", + label="lifetime migrant by age", + comp_type="not_equals_by_group", + interval=10, + ) + _groupby_cascade( + feature="namefrst_unstd", + label="namefrst_unstd commonality", + comp_type="groupby_then_bucketize_name_count", + ) + _groupby_cascade( + feature="namefrst_std", + label="namefrst_std commonality", + comp_type="groupby_then_bucketize_name_count", + ) + _groupby_cascade( + feature="namelast_clean", + label="namelast_clean commonality", + comp_type="groupby_then_bucketize_name_count", + ) + + +def _get_dict_from_window_rows(collected_rows, new_col): + new_dict = {} + for row in collected_rows: + new_dict[row[new_col]] = row["count"] + return new_dict diff --git a/hlink/linking/reporting/reporting.py b/hlink/linking/reporting/reporting.py new file mode 100644 index 0000000..564c1d2 --- /dev/null +++ b/hlink/linking/reporting/reporting.py @@ -0,0 +1,19 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from .link_step_report_r2_percent_linked import LinkStepReportR2PercentLinked +from .link_step_report_representivity import LinkStepReportRepresentivity +from .link_step_export_crosswalk import LinkStepExportCrosswalk + + +class Reporting(LinkTask): + def get_steps(self): + return [ + LinkStepReportR2PercentLinked(self), + LinkStepReportRepresentivity(self), + LinkStepExportCrosswalk(self), + ] diff --git a/hlink/linking/table.py b/hlink/linking/table.py new file mode 100644 index 0000000..9993fd8 --- /dev/null +++ b/hlink/linking/table.py @@ -0,0 +1,53 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +class Table: + """Represents a spark table which may or may not currently exist. + + It's possible to pass table names that aren't valid spark table names to + this class (for example, "@@@"). In this case, this class does not throw + errors; it just treats the tables like any other spark tables that don't + exist. + """ + + def __init__(self, spark, name: str, desc: str, hide: bool = False): + self.spark = spark + # User-facing name + self.name = name + # Name used to interact with spark + self._name_lower = name.lower() + self.desc = desc + self.hide = hide + + def exists(self) -> bool: + """Check whether the table currently exists in spark.""" + return self._name_lower in [ + table.name for table in self.spark.catalog.listTables() + ] + + def drop(self): + """Drop the table if it `exists()`. + + If the table doesn't exist, then don't do anything. + """ + if self.exists(): + self.spark.sql(f"DROP TABLE {self._name_lower}") + assert ( + not self.exists() + ), f"table '{self.name}' has been dropped but still exists" + + def df(self): + """Get the DataFrame of the table from spark. 
+
+        Returns:
+            DataFrame or None: the DataFrame of the table, or None if the table doesn't exist
+        """
+        if self.exists():
+            return self.spark.table(self._name_lower)
+        return None
+
+    def __str__(self):
+        return f"Table '{self.name}' <- {self.desc}"
diff --git a/hlink/linking/table_definitions.csv b/hlink/linking/table_definitions.csv
new file mode 100644
index 0000000..3a64621
--- /dev/null
+++ b/hlink/linking/table_definitions.csv
@@ -0,0 +1,34 @@
+name,desc,hide
+raw_df_a,Preprocessing: Raw data read in from datasource A,0
+raw_df_b,Preprocessing: Raw data read in from datasource B,0
+prepped_df_a,Preprocessing: Preprocessed data from source A with selected columns and features,0
+prepped_df_b,Preprocessing: Preprocessed data from source B with selected columns and features,0
+exploded_df_a,Matching: Exploded preprocessed data,1
+exploded_df_b,Matching: Exploded preprocessed data,1
+potential_matches,Matching: Potential matches,0
+unique_matches,Matching: Only potential matches without another close potential match,1
+potential_matches_prepped,Matching: Potential matches with ML features generated,1
+hh_potential_matches_prepped,Household Matching: HH potential matches with ML features generated,1
+predictions,Matching: Raw predictions,1
+scored_potential_matches,Training: Potential matches scored by ML model,0
+hh_blocked_matches,Household Matching: Unfiltered potential matches generated by blocking on linked individuals' households,1
+hh_potential_matches,Household Matching: Potential matches generated by filtering matches created by blocking linked households,0
+hh_scored_potential_matches,Household Matching: HH potential matches scored by HH ML model,0
+training_data,Training: Raw training data as read from specified file,1
+hh_training_data,Household Training: Raw HH training data as read from specified file,1
+training_features,Training: Transformed data with features,1
+hh_training_features,Household Training: Transformed HH training data with features,1
+model_eval_training_data,Model Exploration: Raw training data as read from specified file,1
+model_eval_training_vectorized,Model Exploration: Training data after applying comparison feature and pipeline transformations,1
+model_eval_training_results,Model Exploration: Results of ML model exploration train/test splits,0
+model_eval_training_features,Model Exploration: Transformed data with features,1
+model_eval_repeat_FPs,Model Exploration: Potential false positives repeated in training data during train/test split,1
+model_eval_repeat_FNs,Model Exploration: Potential false negatives repeated in training data during train/test split,1
+hh_model_eval_training_features,Household Model Exploration: Transformed data with features,1
+hh_model_eval_training_vectorized,Household Model Exploration: Training data after applying comparison feature and pipeline transformations,1
+hh_model_eval_training_results,Household Model Exploration: Results of household ML model exploration train/test splits,0
+hh_model_eval_training_data,Household Model Exploration: Raw training data as read from specified file,1
+hh_model_eval_repeat_FPs,Household Model Exploration: Potential false positives repeated in household training data during train/test split,1
+hh_model_eval_repeat_FNs,Household Model Exploration: Potential false negatives repeated in household training data during train/test split,1
+predicted_matches,Matching: Matches predicted by algorithm after thresholds and removal of duplicate histid_b,0
+hh_predicted_matches,Household Matching:
Matches predicted by algorithm after thresholds and removal of duplicate histid_b,0 diff --git a/hlink/linking/templates/shared/aggregate_features.sql b/hlink/linking/templates/shared/aggregate_features.sql new file mode 100644 index 0000000..885e131 --- /dev/null +++ b/hlink/linking/templates/shared/aggregate_features.sql @@ -0,0 +1,22 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT pm.* + {% for feature in advanced_comp_features %} + , agg.{{feature}} + {% endfor %} +FROM {{ potential_matches }} pm +JOIN ( + SELECT + pm.{{id}}_a + {% if "hits" in advanced_comp_features %}, COUNT(pm.{{id}}_b) as hits {% endif %} + {% if "hits2" in advanced_comp_features %}, pow(COUNT(pm.{{id}}_b), 2) as hits2 {% endif %} + {% if "exact_mult" in advanced_comp_features %}, SUM(CAST(pm.exact as INT)) > 1 as exact_mult {% endif %} + {% if "exact_all_mult" in advanced_comp_features %}, SUM(CAST(pm.exact_all as INT)) as exact_all_mult {% endif %} + {% if "exact_all_mult2" in advanced_comp_features %}, pow(SUM(CAST(pm.exact_all AS INT)), 2) as exact_all_mult2 {% endif %} + FROM {{ potential_matches }} pm + GROUP BY pm.{{id}}_a +) agg ON agg.{{id}}_a = pm.{{id}}_a + diff --git a/hlink/linking/templates/shared/all_household_members.sql b/hlink/linking/templates/shared/all_household_members.sql new file mode 100644 index 0000000..b6ae193 --- /dev/null +++ b/hlink/linking/templates/shared/all_household_members.sql @@ -0,0 +1,32 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +{% if selects_a is defined %} + {{ ",\n".join(selects_a) }} +{% else %} + {% include 'shared/includes/all_household_members_selects_a.sql' %} +{% endif %} +FROM raw_df_a ra +JOIN {{ hh_keeps if hh_keeps is defined else "hh_keeps" }} hc ON ra.NEW_SERIAL_a = hc.NEW_SERIAL_a +LEFT JOIN {{ plinks if plinks is defined else "plinks_round_1_2_3" }} pl ON pl.id_a = ra.id AND pl.SERIAL_b = hc.SERIAL_b + +UNION ALL + +SELECT +{% if selects_b is defined %} + {{ ",\n".join(selects_b) }} +{% else %} + {% include 'shared/includes/all_household_members_selects_b.sql' %} +{% endif %} +FROM raw_df_b rb +JOIN {{ hh_keeps if hh_keeps is defined else "hh_keeps" }} hc ON rb.SERIAL = hc.SERIAL_b +LEFT JOIN {{ plinks if plinks is defined else "plinks_round_1_2_3" }} pl ON pl.id_b = rb.id AND pl.NEW_SERIAL_a = hc.NEW_SERIAL_a +WHERE pl.SERIAL_a IS NULL +{% if order_bys %} + ORDER BY {{ ", ".join(order_bys) }} +{% else %} + ORDER BY NEW_SERIAL_a, SERIAL_b, NAMEFRST_MAX_JW DESC +{% endif %} diff --git a/hlink/linking/templates/shared/attach_variable.sql b/hlink/linking/templates/shared/attach_variable.sql new file mode 100644 index 0000000..38cd9ac --- /dev/null +++ b/hlink/linking/templates/shared/attach_variable.sql @@ -0,0 +1,8 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT p.*, rd.{{col_to_add}} as {{output_col}} +FROM {{prepped_df}} p +LEFT JOIN {{region_data}} rd ON rd.{{col_to_join_on}} = p.{{input_col}} diff --git a/hlink/linking/templates/shared/drop_links.sql b/hlink/linking/templates/shared/drop_links.sql new file mode 100644 index 0000000..9868784 --- /dev/null +++ b/hlink/linking/templates/shared/drop_links.sql @@ -0,0 +1,15 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT starting.* +FROM {{ starting_links }} starting +{% for links_to_drop in links_to_drop_list %} +LEFT JOIN {{ links_to_drop }} dropping_a_{{ loop.index }} ON dropping_a_{{ loop.index }}.id_a = starting.id_a AND dropping_a_{{ loop.index }}.NEW_SERIAL_a = starting.NEW_SERIAL_a AND dropping_a_{{ loop.index }}.SERIAL_b = starting.SERIAL_b +LEFT JOIN {{ links_to_drop }} dropping_b_{{ loop.index }} ON dropping_b_{{ loop.index }}.id_b = starting.id_b AND dropping_b_{{ loop.index }}.NEW_SERIAL_a = starting.NEW_SERIAL_a AND dropping_b_{{ loop.index }}.SERIAL_b = starting.SERIAL_b +{% endfor %} +WHERE +{% for links_to_drop in links_to_drop_list %} +{{ 'AND' if not loop.first else '' }} dropping_a_{{ loop.index }}.PERNUM_a IS NULL AND dropping_b_{{ loop.index }}.PERNUM_b IS NULL +{% endfor %} diff --git a/hlink/linking/templates/shared/hh_aggregate_features.sql b/hlink/linking/templates/shared/hh_aggregate_features.sql new file mode 100644 index 0000000..eb594a0 --- /dev/null +++ b/hlink/linking/templates/shared/hh_aggregate_features.sql @@ -0,0 +1,27 @@ +{# This file is part of the ISRDI's hlink. 
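+
+Note: for a given person b and household pairing, jw_max_a is the best
+namefrst Jaro-Winkler score among the *other* candidate rows with
+byrdiff <= 10, computed as the greatest of two one-sided window maxima so
+that the current row is excluded; jw_max_b additionally requires sexmatch.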
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT pm.* + {% if "jw_max_a" in hh_comp_features %} + , coalesce( + greatest( + max(case when pm.byrdiff <= 10 then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b ORDER BY pm.{{id}}_a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), + max(case when pm.byrdiff <= 10 then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b ORDER BY pm.{{id}}_a ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) + ), + max(case when pm.byrdiff <= 10 then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b ORDER BY pm.{{id}}_a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), + max(case when pm.byrdiff <= 10 then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b ORDER BY pm.{{id}}_a ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING), + 0 + ) as jw_max_a {% endif %} + {% if "jw_max_b" in hh_comp_features %} + , coalesce( + greatest( + max(case when pm.byrdiff <= 10 and pm.sexmatch then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b order by pm.{{id}}_a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), + max(case when pm.byrdiff <= 10 and pm.sexmatch then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b order by pm.{{id}}_a ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) + ), + max(case when pm.byrdiff <= 10 and pm.sexmatch then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b order by pm.{{id}}_a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), + max(case when pm.byrdiff <= 10 and pm.sexmatch then pm.namefrst_jw else 0 end) over(PARTITION BY pm.{{id}}_b, pm.{{hh_col}}_a, pm.{{hh_col}}_b order by pm.{{id}}_a ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING), + 0 + ) as jw_max_b {% endif %} +FROM {{potential_matches}} pm diff --git a/hlink/linking/templates/shared/includes/all_household_members_selects_a.sql b/hlink/linking/templates/shared/includes/all_household_members_selects_a.sql new file mode 100644 index 0000000..d937112 --- /dev/null +++ b/hlink/linking/templates/shared/includes/all_household_members_selects_a.sql @@ -0,0 +1,75 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +hc.hh_round, +pl.top_a_to_b, +pl.link_type_a1, +pl.link_type_a2, +ra.id as id_a, +hc.NEW_SERIAL_a, +ra.SERIAL as SERIAL_a, +ra.PERNUM as PERNUM_a, +ra.NAMEFRST as NAMEFRST_a, +ra.NAMELAST as NAMELAST_a, +ra.AGE as AGE_a, +ra.BPL as BPL_a, +ra.SEX as SEX_a, +pl.top_b_to_a, +pl.link_type_b1, +pl.link_type_b2, +pl.id_b, +hc.SERIAL_b, +pl.PERNUM_b, +pl.NAMEFRST_b, +pl.NAMELAST_b, +pl.AGE_b, +pl.BPL_b, +pl.SEX_b, +pl.NAMEFRST_UNSTD_a_UNSTD_b_JW, +pl.NAMEFRST_UNSTD_a_STD_b_JW, +pl.NAMEFRST_STD_a_UNSTD_b_JW, +pl.NAMEFRST_STD_a_STD_b_JW, +pl.NAMELAST_JW, +pl.neighbor_ge9_count_close, +pl.neighbor_ge9_count_far, +pl.NAMEFRST_UNSTD2_a, +pl.NAMEFRST_STD_a, +pl.NAMEFRST_UNSTD_a, +pl.NAMEFRST_UNSTD3_a, +pl.NAMEFRST_UNSTD2_b, +pl.NAMEFRST_STD_b, +pl.NAMEFRST_UNSTD_b, +pl.NAMEFRST_UNSTD3_b, +pl.count_a, +pl.count_b, +pl.count_ge9_a, +pl.count_ge9_b, +pl.sum_count, +pl.sum_count_ge9, +pl.race_code_b, +ra.SELF_EMPTY_INFO_RACE as SELF_EMPTY_INFO_RACE_a, +pl.marst_code_b, +pl.rel_gen_b, +pl.rel_det_b, +pl.nuke80_count, +pl.ext80_count, +pl.unrel80_count, +pl.STATE_a, +pl.COUNTY_a, +pl.TOWNSHIP_a, +pl.CITY_a, +pl.stateicp_b, +pl.countyicp_b, +pl.stdcity_coded_b, +pl.stdtownship_coded_b, +pl.SURNAME_SERIAL_a, +pl.SURNAME_ORDER_a, +pl.SURNAME_SIZE_a, +pl.NAMEFRST_MAX_JW, +pl.AGE_DIFF, + +hc.close_sum_count as hhf_sum_count, +hc.has_shared_neighbors as hhf_has_shared_neighbors, +hc.unlinked_percent as hhf_unlinked_percent diff --git a/hlink/linking/templates/shared/includes/all_household_members_selects_b.sql b/hlink/linking/templates/shared/includes/all_household_members_selects_b.sql new file mode 100644 index 0000000..49f2ea4 --- /dev/null +++ b/hlink/linking/templates/shared/includes/all_household_members_selects_b.sql @@ -0,0 +1,74 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +hc.hh_round, +pl.top_a_to_b, +pl.link_type_a1, +pl.link_type_a2, +pl.id_a, +hc.NEW_SERIAL_a, +pl.SERIAL_a, +pl.PERNUM_a, +pl.NAMEFRST_a, +pl.NAMELAST_a, +pl.AGE_a, +pl.BPL_a, +pl.SEX_a, +pl.top_b_to_a, +pl.link_type_b1, +pl.link_type_b2, +rb.id as id_b, +hc.SERIAL_b, +rb.PERNUM as PERNUM_b, +rb.NAMEFRST as NAMEFRST_b, +rb.NAMELAST as NAMELAST_b, +rb.AGE as AGE_b, +rb.BPL as BPL_b, +rb.SEX as SEX_b, +pl.NAMEFRST_UNSTD_a_UNSTD_b_JW, +pl.NAMEFRST_UNSTD_a_STD_b_JW, +pl.NAMEFRST_STD_a_UNSTD_b_JW, +pl.NAMEFRST_STD_a_STD_b_JW, +pl.NAMELAST_JW, +pl.neighbor_ge9_count_close, +pl.neighbor_ge9_count_far, +pl.NAMEFRST_UNSTD2_a, +pl.NAMEFRST_STD_a, +pl.NAMEFRST_UNSTD_a, +pl.NAMEFRST_UNSTD3_a, +pl.NAMEFRST_UNSTD2_b, +pl.NAMEFRST_STD_b, +pl.NAMEFRST_UNSTD_b, +pl.NAMEFRST_UNSTD3_b, +pl.count_a, +pl.count_b, +pl.count_ge9_a, +pl.count_ge9_b, +pl.sum_count, +pl.sum_count_ge9, +rb.race_code as race_code_b, +pl.SELF_EMPTY_INFO_RACE_a, +rb.marst_code as marst_code_b, +rb.rel_gen as rel_gen_b, +rb.rel_det as rel_det_b, +pl.nuke80_count, +pl.ext80_count, +pl.unrel80_count, +pl.STATE_a, +pl.COUNTY_a, +pl.TOWNSHIP_a, +pl.CITY_a, +pl.stateicp_b, +pl.countyicp_b, +pl.stdcity_coded_b, +pl.stdtownship_coded_b, +pl.SURNAME_SERIAL_a, +pl.SURNAME_ORDER_a, +pl.SURNAME_SIZE_a, +pl.NAMEFRST_MAX_JW, +pl.AGE_DIFF, +hc.close_sum_count as hhf_sum_count, +hc.has_shared_neighbors as hhf_has_shared_neighbors, +hc.unlinked_percent as hhf_unlinked_percent diff --git a/hlink/linking/templates/shared/pl_easy_features.sql b/hlink/linking/templates/shared/pl_easy_features.sql new file mode 100644 index 0000000..38dd270 --- /dev/null +++ b/hlink/linking/templates/shared/pl_easy_features.sql @@ -0,0 +1,9 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT * + , GREATEST(FLOAT(NAMEFRST_UNSTD_a_UNSTD_b_JW), FLOAT(NAMEFRST_UNSTD_a_STD_b_JW), FLOAT(NAMEFRST_STD_a_UNSTD_b_JW), FLOAT(NAMEFRST_STD_a_STD_b_JW)) as NAMEFRST_MAX_JW + , ABS(AGE_a + 10 - AGE_b) as AGE_DIFF +FROM {{ plinks_accepted_ab_neighbors }} diff --git a/hlink/linking/templates/shared/potential_matches_base_features.sql b/hlink/linking/templates/shared/potential_matches_base_features.sql new file mode 100644 index 0000000..ccba7b5 --- /dev/null +++ b/hlink/linking/templates/shared/potential_matches_base_features.sql @@ -0,0 +1,29 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +{% if broadcast_hints %} + {{broadcast_hints}} +{% endif %} +{% if broadcast_a_b %} +/*+ BROADCAST(a) */ +/*+ BROADCAST(b) */ +{% endif %} + +pm.* + +{% if comp_features %} +,{{comp_features}} +{% endif %} + +FROM {{ potential_matches }} pm +JOIN prepped_df_a a ON a.{{id}} = pm.{{id}}_a +JOIN prepped_df_b b ON b.{{id}} = pm.{{id}}_b + +{% if distance_table %} + {% for d in distance_table %} + {{d}} + {% endfor %} +{% endif %} diff --git a/hlink/linking/templates/shared/scored_potential_matches.sql b/hlink/linking/templates/shared/scored_potential_matches.sql new file mode 100644 index 0000000..62fd4e5 --- /dev/null +++ b/hlink/linking/templates/shared/scored_potential_matches.sql @@ -0,0 +1,13 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +pr.* +{% for c in pm_source_cols %} + , pm.{{c}} +{% endfor %} +FROM {{predictions}} pr +JOIN {{potential_matches}} pm +ON pr.{{id_a}} = pm.{{id_a}} AND pr.{{id_b}} = pm.{{id_b}} diff --git a/hlink/linking/templates/shared/select_columns.sql b/hlink/linking/templates/shared/select_columns.sql new file mode 100644 index 0000000..546770f --- /dev/null +++ b/hlink/linking/templates/shared/select_columns.sql @@ -0,0 +1,13 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +{% for c in id_columns %} + {% if not loop.first %},{% endif %} {{c}} as {{c}} +{% endfor %} +{% for c in selected_columns %} + , CAST({{c}} as FLOAT) as {{c}} +{% endfor %} +FROM {{df}} diff --git a/hlink/linking/templates/shared/tfam_tables.sql b/hlink/linking/templates/shared/tfam_tables.sql new file mode 100644 index 0000000..42843c3 --- /dev/null +++ b/hlink/linking/templates/shared/tfam_tables.sql @@ -0,0 +1,23 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT distinct + {% for feature in cols %} + {% if not loop.first %},{% endif %} rdf.{{feature}} + {% endfor %} +FROM +( + select distinct + rdfs.serialp as serialp_{{a_or_b}} + from + {{ source_table}} hhpm + JOIN + raw_df_{{a_or_b}} rdfs + ON + hhpm.{{id}}_{{a_or_b}} = rdfs.{{id}} +) j +LEFT JOIN +raw_df_{{a_or_b}} rdf +ON rdf.serialp = j.serialp_{{a_or_b}} diff --git a/hlink/linking/templates/shared/training_features.sql b/hlink/linking/templates/shared/training_features.sql new file mode 100644 index 0000000..3b7e8ce --- /dev/null +++ b/hlink/linking/templates/shared/training_features.sql @@ -0,0 +1,29 @@ +{# This file is part of the ISRDI's hlink. 
#} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +{% if a_selects %} +{% for sel in a_selects %} + a.{{sel}} as {{sel}}_a, +{% endfor %} +{% for sel in b_selects %} + b.{{sel}} as {{sel}}_b, +{% endfor %} +{% else %} +a.{{id}} as {{id}}_a, +b.{{id}} as {{id}}_b, +{% endif %} +{{comp_features}}, +{{match_feature}} + +FROM training_data td +JOIN prepped_df_a a ON a.{{id}} = td.{{id}}_a +JOIN prepped_df_b b ON b.{{id}} = td.{{id}}_b + +{% if distance_table %} + {% for d in distance_table %} + {{d}} + {% endfor %} +{% endif %} diff --git a/hlink/linking/templates/shared/training_prepped.sql b/hlink/linking/templates/shared/training_prepped.sql new file mode 100644 index 0000000..9df5fcf --- /dev/null +++ b/hlink/linking/templates/shared/training_prepped.sql @@ -0,0 +1,16 @@ +{# This file is part of the ISRDI's hlink. #} +{# For copyright and licensing information, see the NOTICE and LICENSE files #} +{# in this project's top-level directory, and also on-line at: #} +{# https://github.com/ipums/hlink #} + +SELECT +{% for sel in a_selects %} + pa.{{sel}} as {{sel}}_a, +{% endfor %} +{% for sel in b_selects %} + pb.{{sel}} as {{sel}}_b{% if not(loop.last) %},{% endif %} +{% endfor %} + +FROM training_data td +JOIN prepped_df_a pa ON pa.id = td.id_a +JOIN prepped_df_b pb ON pb.id = td.id_b diff --git a/hlink/linking/training/__init__.py b/hlink/linking/training/__init__.py new file mode 100644 index 0000000..1f4e4a0 --- /dev/null +++ b/hlink/linking/training/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from .training import Training diff --git a/hlink/linking/training/link_step_create_comparison_features.py b/hlink/linking/training/link_step_create_comparison_features.py new file mode 100644 index 0000000..153017a --- /dev/null +++ b/hlink/linking/training/link_step_create_comparison_features.py @@ -0,0 +1,96 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.dist_table as dist_table_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepCreateComparisonFeatures(LinkStep): + def __init__(self, task): + super().__init__( + task, + "create comparison features", + input_table_names=[ + "prepped_df_a", + "prepped_df_b", + f"{task.table_prefix}training_data", + ], + output_table_names=[f"{task.table_prefix}training_features"], + ) + + def _run(self): + self.task.spark.sql("set spark.sql.shuffle.partitions=200") + self.__create_training_features() + + def __create_training_features(self): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + if config[training_conf].get("use_training_data_features", False): + return self.task.run_register_python( + f"{table_prefix}training_features", + lambda: self.task.spark.table(f"{table_prefix}training_data"), + persist=True, + ) + id_col = config["id_column"] + dep_var = config[training_conf]["dependent_var"] + if training_conf == "hh_training": + hh_col = config[training_conf].get("hh_col", "serialp") + tdl = self.task.spark.sql( + f"""SELECT + td.{id_col}_a, + td.{id_col}_b, + td.{dep_var}, + pdfa.{hh_col} as {hh_col}_a, + pdfb.{hh_col} as {hh_col}_b + from + {table_prefix}training_data td + left join + prepped_df_a pdfa + on pdfa.{id_col} = td.{id_col}_a + left join + prepped_df_b pdfb + on pdfb.{id_col} = td.{id_col}_b + """ + ) + else: + tdl = self.task.spark.table(f"{table_prefix}training_data").select( + f"{id_col}_a", f"{id_col}_b", dep_var + ) + self.task.run_register_python(f"{table_prefix}training_data_ids", lambda: tdl) + ( + comp_features, + advanced_comp_features, + hh_comp_features, + dist_features, + ) = comparison_feature_core.get_features( + config, config[training_conf]["independent_vars"] + ) + t_ctx_def = { + "comp_features": comp_features, + "match_feature": config[training_conf]["dependent_var"], + "id": config["id_column"], + "potential_matches": f"{table_prefix}training_data_ids", + } + join_clauses, dist_tables = dist_table_core.register_dist_tables_and_create_sql( + self.task, dist_features + ) + t_ctx_def["distance_table"] = join_clauses + if len(dist_tables) > 0: + t_ctx_def["broadcast_hints"] = dist_table_core.get_broadcast_hint( + dist_tables + ) + + comparison_feature_core.create_feature_tables( + self.task, + t_ctx_def, + advanced_comp_features, + hh_comp_features, + config["id_column"], + table_name=f"{table_prefix}training_features", + ) diff --git a/hlink/linking/training/link_step_ingest_file.py b/hlink/linking/training/link_step_ingest_file.py new file mode 100644 index 0000000..d7ad6cf --- /dev/null +++ b/hlink/linking/training/link_step_ingest_file.py @@ -0,0 +1,27 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.linking.link_step import LinkStep + + +class LinkStepIngestFile(LinkStep): + def __init__(self, task): + super().__init__( + task, + "ingest file", + input_table_names=[], + output_table_names=[f"{task.table_prefix}training_data"], + ) + + def _run(self): + self.task.run_register_python( + f"{self.task.table_prefix}training_data", + lambda: self.task.spark.read.csv( + self.task.link_run.config[f"{self.task.training_conf}"]["dataset"], + header=True, + inferSchema=True, + ), + persist=True, + ) diff --git a/hlink/linking/training/link_step_train_and_save_model.py b/hlink/linking/training/link_step_train_and_save_model.py new file mode 100644 index 0000000..ed05f76 --- /dev/null +++ b/hlink/linking/training/link_step_train_and_save_model.py @@ -0,0 +1,76 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml import Pipeline + +import hlink.linking.core.classifier as classifier_core +import hlink.linking.core.pipeline as pipeline_core +import hlink.linking.core.threshold as threshold_core + +from hlink.linking.link_step import LinkStep + + +class LinkStepTrainAndSaveModel(LinkStep): + def __init__(self, task): + super().__init__( + task, + "train and save the model", + input_table_names=[f"{task.table_prefix}training_features"], + output_table_names=[], + output_model_names=[f"{task.table_prefix}trained_model"], + ) + + def _run(self): + training_conf = str(self.task.training_conf) + table_prefix = self.task.table_prefix + config = self.task.link_run.config + + if not (config[training_conf].get("score_with_model", False)): + raise ValueError( + f"'score_with_model' is either missing or not set to true in the '{training_conf}' section of the config! No model will be trained or used for scoring!"
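+ # fail fast (inferred intent): with score_with_model turned off, any model trained here would never be used for scoring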
+ ) + + chosen_model_params = config[training_conf]["chosen_model"].copy() + chosen_model_type = chosen_model_params.pop("type") + chosen_model_params.pop( + "threshold", config[training_conf].get("threshold", 0.8) + ) + chosen_model_params.pop( + "threshold_ratio", + threshold_core.get_threshold_ratio( + config[training_conf], chosen_model_params + ), + ) + + ind_vars = config[training_conf]["independent_vars"] + dep_var = config[training_conf]["dependent_var"] + + tf = self.task.spark.table(f"{table_prefix}training_features") + + # Create pipeline + pipeline_stages = pipeline_core.generate_pipeline_stages( + config, ind_vars, tf, training_conf + ) + # TODO: Test if this will break if the scaler is used + vector_assembler = pipeline_stages[-1] + + pre_pipeline = Pipeline(stages=pipeline_stages[:-1]).fit(tf) + self.task.link_run.trained_models[f"{table_prefix}pre_pipeline"] = pre_pipeline + tf_prepped = pre_pipeline.transform(tf) + + classifier, post_transformer = classifier_core.choose_classifier( + chosen_model_type, chosen_model_params, dep_var + ) + + # Train and save pipeline + pipeline = Pipeline(stages=[vector_assembler, classifier, post_transformer]) + + model = pipeline.fit(tf_prepped) + # model_path = config["spark_tmp_dir"] + "/chosen_model" + self.task.link_run.trained_models[f"{table_prefix}trained_model"] = model + # model.write().overwrite().save(model_path) + # model.transform(pre_pipeline.transform(tf)).write.mode("overwrite").saveAsTable("training_features_scored") + + # model.transform(tf).write.mode("overwrite").saveAsTable("training_features_pipelined") diff --git a/hlink/linking/training/training.py b/hlink/linking/training/training.py new file mode 100644 index 0000000..a17cf95 --- /dev/null +++ b/hlink/linking/training/training.py @@ -0,0 +1,24 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from ..link_task import LinkTask + +from .link_step_ingest_file import LinkStepIngestFile +from .link_step_create_comparison_features import LinkStepCreateComparisonFeatures +from .link_step_train_and_save_model import LinkStepTrainAndSaveModel + + +class Training(LinkTask): + def __init__(self, link_run): + super().__init__(link_run) + self.training_conf = "training" + self.table_prefix = "" + + def get_steps(self): + return [ + LinkStepIngestFile(self), + LinkStepCreateComparisonFeatures(self), + LinkStepTrainAndSaveModel(self), + ] diff --git a/hlink/linking/transformers/__init__.py b/hlink/linking/transformers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/linking/transformers/float_cast_transformer.py b/hlink/linking/transformers/float_cast_transformer.py new file mode 100644 index 0000000..8b14401 --- /dev/null +++ b/hlink/linking/transformers/float_cast_transformer.py @@ -0,0 +1,36 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml import Transformer +from pyspark.ml.param.shared import HasInputCols +from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable +from pyspark import keyword_only + + +class FloatCastTransformer( + Transformer, HasInputCols, DefaultParamsReadable, DefaultParamsWritable +): + """ + A custom Transformer which casts the input column to a float. 
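+ + Illustrative usage (the DataFrame and column names are only examples): + FloatCastTransformer(inputCols=["age", "bpl"]).transform(df) + returns df with "age" and "bpl" cast to float; the remaining columns pass through unchanged, + though their order may change because they are collected into a set in _transform below.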
+ """ + + @keyword_only + def __init__(self, inputCols=None): + super(FloatCastTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCols=None): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def _transform(self, df): + other_cols = set(df.columns) - set(self.getInputCols()) + casted_cols = [ + f"CAST({inputCol} as float) as {inputCol}" + for inputCol in self.getInputCols() + ] + return df.selectExpr(list(other_cols) + casted_cols) diff --git a/hlink/linking/transformers/interaction_transformer.py b/hlink/linking/transformers/interaction_transformer.py new file mode 100644 index 0000000..e9d4851 --- /dev/null +++ b/hlink/linking/transformers/interaction_transformer.py @@ -0,0 +1,62 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml.util import JavaMLReadable, JavaMLWritable +from pyspark.ml.param.shared import HasInputCols, HasOutputCol +from pyspark import keyword_only +from pyspark.ml.wrapper import JavaTransformer + + +class InteractionTransformer( + JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable +): + """ + from https://github.com/apache/spark/commit/5bf5d9d854db53541956dedb03e2de8eecf65b81: + Implements the feature interaction transform. This transformer takes in Double and Vector type + columns and outputs a flattened vector of their feature interactions. To handle interaction, + we first one-hot encode any nominal features. Then, a vector of the feature cross-products is + produced. + For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be + `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal + with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`. + df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"]) + interaction = Interaction(inputCols=["a", "b"], outputCol="ab") + interaction.transform(df).show() + +---+---+-----+ + | a| b| ab| + +---+---+-----+ + |0.0|1.0|[0.0]| + |2.0|3.0|[6.0]| + +---+---+-----+ + ... + interactionPath = temp_path + "/interaction" + interaction.save(interactionPath) + loadedInteraction = Interaction.load(interactionPath) + loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab + True + .. versionadded:: 3.0.0 + """ + + @keyword_only + def __init__(self, inputCols=None, outputCol=None): + """ + __init__(self, inputCols=None, outputCol=None): + """ + super(InteractionTransformer, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.feature.Interaction", self.uid + ) + self._setDefault() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCols=None, outputCol=None): + """ + setParams(self, inputCols=None, outputCol=None) + for this Interaction. + """ + kwargs = self._input_kwargs + return self._set(**kwargs) diff --git a/hlink/linking/transformers/rename_prob_column.py b/hlink/linking/transformers/rename_prob_column.py new file mode 100644 index 0000000..eae95af --- /dev/null +++ b/hlink/linking/transformers/rename_prob_column.py @@ -0,0 +1,14 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml import Transformer +from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable + + +class RenameProbColumn(Transformer, DefaultParamsWritable, DefaultParamsReadable): + def _transform(self, dataset): + return dataset.withColumnRenamed("probability", "probability_array").selectExpr( + "*", "parseProbVector(probability_array, 1) as probability" + ) diff --git a/hlink/scripts/__init__.py b/hlink/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/scripts/lib/__init__.py b/hlink/scripts/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py new file mode 100644 index 0000000..f8c7da1 --- /dev/null +++ b/hlink/scripts/lib/conf_validations.py @@ -0,0 +1,342 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.utils import AnalysisException +from os import path +import colorama + + +def print_checking(section: str): + print(f"Checking {section}...", end=" ") + + +def print_ok(): + print(colorama.Fore.GREEN + "OK" + colorama.Style.RESET_ALL) + + +def analyze_conf(link_run): + """Print an analysis of the configuration of the `link_run`.""" + colorama.init() + + try: + print_checking("datasource_a") + df_a = parse_datasource(link_run, "datasource_a") + print_ok() + + print_checking("datasource_b") + df_b = parse_datasource(link_run, "datasource_b") + print_ok() + + print_checking("filters") + check_filters(link_run.config, df_a, df_b) + print_ok() + + print_checking("column_mappings") + columns_available = check_column_mappings(link_run.config, df_a, df_b) + print_ok() + + print_checking("substitution_columns") + check_substitution_columns(link_run.config, columns_available) + print_ok() + + print_checking("feature_selections") + check_feature_selections(link_run.config, columns_available) + print_ok() + + print_checking("blocking") + check_blocking(link_run.config, columns_available) + print_ok() + + print_checking("comparison_features") + comp_features = check_comparison_features(link_run.config, columns_available) + print_ok() + + print_checking("comparisons") + check_comparisons(link_run.config, comp_features) + print_ok() + + print_checking("pipeline_features") + check_pipeline_features(link_run.config, comp_features) + print_ok() + + print_checking("training") + check_training(link_run.config, comp_features) + print_ok() + + print_checking("hh_training") + check_hh_training(link_run.config, comp_features) + print_ok() + finally: + colorama.deinit() + + +def check_hh_training(config, comp_features): + comp_features += ["jw_max_a", "jw_max_b"] + hh_training = config.get("hh_training") + if hh_training is None: + return + independent_vars = hh_training.get("independent_vars") + if independent_vars is None: + raise ValueError( + "No independent_vars value specified in the [hh_training] section." + ) + for var in independent_vars: + if var not in comp_features: + raise ValueError( + f"Within [hh_training] the independent_var: '{var}' does not exist. Please add a specification as a [[comparison_features]] or a [[pipeline_features]]."
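+ # note: jw_max_a and jw_max_b are accepted above since they appear to be generated by the household matching rounds themselves rather than declared as [[comparison_features]]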
+ ) + + +def check_training(config, comp_features): + comp_features += ["hits", "hits2", "exact_mult"] + training = config.get("training") + if training is None: + return + independent_vars = training.get("independent_vars") + if independent_vars is None: + raise ValueError( + "No independent_vars value specified in the [training] section." + ) + for var in independent_vars: + if var not in comp_features: + raise ValueError( + f"Within [training] the independent_var: '{var}' does not exist. Please add a specification as a [[comparison_features]] or a [[pipeline_features]]." + ) + + +def check_pipeline_features(config, comp_features): + pipeline_features = config.get("pipeline_features") + if pipeline_features is None: + return + for p in pipeline_features: + input_column = p.get("input_column") + if (input_column is not None) and (input_column not in comp_features): + raise ValueError( + f"Within [[pipeline_features]] the input_column: '{input_column}' is not available from a previous [[comparison_features]] or [[pipeline_features]] section. \n Available columns: \n {comp_features}" + ) + input_columns = p.get("input_columns") + if input_columns is not None: + for c in input_columns: + if c not in comp_features: + raise ValueError( + f"Within [[pipeline_features]] the input_columns entry '{c}' is not available from a previous [[comparison_features]] or [[pipeline_features]] section. \n Available columns: \n {comp_features}" + ) + output_column = p.get("output_column") + if output_column is None: + raise ValueError( + f"Within [[pipeline_features]] no 'output_column' is specified for {p}." + ) + comp_features.append(output_column) + + +def check_comparisons(config, comp_features): + comparisons = config.get("comparisons") + if comparisons is None: + raise ValueError( + "No [comparisons] section exists. Please add a [comparisons] section." + ) + comp_a = comparisons.get("comp_a") + comp_b = comparisons.get("comp_b") + if comp_a is not None: + feature_name = comp_a.get("feature_name") + if (feature_name is not None) and (feature_name not in comp_features): + raise ValueError( + f"Within [comparisons] the feature_name '{feature_name}' is not available. Please add a corresponding feature in the [[comparison_features]] section. \n Available features: \n {comp_features}" + ) + if comp_b is not None: + feature_name = comp_b.get("feature_name") + if (feature_name is not None) and (feature_name not in comp_features): + raise ValueError( + f"Within [comparisons] the feature_name '{feature_name}' is not available. Please add a corresponding feature in the [[comparison_features]] section. \n Available features: \n {comp_features}" + ) + + +def check_comparison_features(config, columns_available): + comps = [] + comparison_features = config.get("comparison_features") + if comparison_features is None: + raise ValueError( + "No [[comparison_features]] exist. Please add [[comparison_features]]." + ) + for c in comparison_features: + alias = c.get("alias") + if alias is None: + raise ValueError( + f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'."
+ ) + column_name = c.get("column_name") or c.get("first_init_col") + column_names = c.get("column_names") or c.get("mid_init_cols") + if column_name is not None: + if column_name not in columns_available: + raise ValueError( + f"Within [[comparison_features]] the 'column_name' {column_name} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}" + ) + if column_names is not None: + for cname in column_names: + if cname not in columns_available: + raise ValueError( + f"Within [[comparison_features]] the 'column_names' entry {cname} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}" + ) + comps.append(alias) + return comps + + +def check_blocking(config, columns_available): + blockings = config.get("blocking") + if blockings is None: + raise ValueError("No [[blocking]] sections exist. Please add a [[blocking]] section.") + for b in blockings: + column_name = b.get("derived_from") or b.get("column_name") + if column_name is None: + raise ValueError(f"Within [[blocking]] no column name is specified: {b}.") + if column_name not in columns_available: + raise ValueError( + f"Within [[blocking]] the column_name of '{column_name}' is not available from an earlier [[column_mappings]] or [[feature_selections]]. \n Available columns: \n {columns_available}" + ) + + +def check_feature_selections(config, columns_available): + feature_selections = config.get("feature_selections") + if feature_selections is None: + return + for f in feature_selections: + input_column = f.get("input_column") + output_column = f.get("output_column") or f.get("output_col") + other_col = f.get("other_col") + if input_column is not None and input_column not in columns_available: + raise ValueError( + f"Within [[feature_selections]] the input_column: '{input_column}' is not created by an earlier [[column_mappings]] or [[feature_selections]]. \n Available Columns: \n {columns_available}." + ) + if other_col is not None and other_col not in columns_available: + raise ValueError( + f"Within [[feature_selections]] the other_col: '{other_col}' is not created by an earlier [[column_mappings]] or [[feature_selections]]. \n Available Columns: \n {columns_available}." + ) + if output_column is None: + raise ValueError( + f"No 'output_column' or 'output_col' value for [[feature_selections]]: {f}" + ) + columns_available.append(output_column) + + +def check_substitution_columns(config, columns_available): + substitution_columns = config.get("substitution_columns") + if substitution_columns is None: + return + for s in substitution_columns: + column_name = s.get("column_name") + substitutions = s.get("substitutions") + if column_name is None: + raise ValueError("Within [[substitution_columns]] no 'column_name' exists.") + if substitutions is None: + raise ValueError( + "Within [[substitution_columns]] no [[substitution_columns.substitutions]] exists." + ) + for sub in substitutions: + join_column = sub.get("join_column") + join_value = sub.get("join_value") + f = sub.get("substitution_file") + if join_column is None or join_column not in columns_available: + raise ValueError( + f"Within [[substitution_columns.substitutions]] the join_column '{join_column}' does not exist or is not available within columns specified within [[column_mappings]]. \nList of available columns: \n {columns_available}" + ) + if join_value is None: + raise ValueError( + "Within [[substitution_columns.substitutions]] no 'join_value' exists."
+ ) + if f is None or not path.exists(f): + raise ValueError( + f"Within [[substitution_columns.substitutions]] the 'substitution_file' is missing or does not point to an existing file: {f}" + ) + + +def check_column_mappings(config, df_a, df_b): + column_mappings = config.get("column_mappings") + if not column_mappings: + raise ValueError("No [[column_mappings]] exist in the conf file.") + columns_available = [] + for c in column_mappings: + alias = c.get("alias") + column_name = c.get("column_name") + set_value_column_a = c.get("set_value_column_a") + set_value_column_b = c.get("set_value_column_b") + if not column_name: + raise ValueError( + f"The following [[column_mappings]] has no 'column_name' attribute: {c}" + ) + if set_value_column_a is None: + if column_name.lower() not in [c.lower() for c in df_a.columns]: + if column_name not in columns_available: + raise ValueError( + f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_a and no previous [[column_mapping]] alias exists for it. \nColumn mapping: {c}. \nAvailable columns: \n {df_a.columns}" + ) + if set_value_column_b is None: + if column_name.lower() not in [c.lower() for c in df_b.columns]: + if column_name not in columns_available: + raise ValueError( + f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}" + ) + if alias: + columns_available.append(alias) + else: + columns_available.append(column_name) + return columns_available + + +def check_filters(config, df_a, df_b): + filters = config.get("filter") + if not filters: + return + for f in filters: + expression = f.get("expression") + if not expression: + raise ValueError("A [[filter]] has no expression value in the config.") + try: + df_a.where(expression) + df_b.where(expression) + except AnalysisException as e: + raise ValueError( + f"Within a [[filter]] the expression '{expression}' is not valid. Spark gives the following error: {e}." + ) + + +def parse_datasource(link_run, section_name: str): + datasource = link_run.config.get(section_name) + + if not datasource: + raise ValueError(f"Section [{section_name}] does not exist in config.") + + parquet_file = datasource.get("parquet_file") + file = datasource.get("file") + + if not parquet_file and not file: + raise ValueError( + f"Within [{section_name}] neither 'parquet_file' nor 'file' exist." + ) + if parquet_file and file: + raise ValueError( + f"Within [{section_name}] both 'parquet_file' and 'file' exist." + ) + + # Now we know that either file or parquet_file was provided, but not both. + if parquet_file: + if not path.exists(parquet_file): + raise ValueError( + f"Within [{section_name}] path of parquet file {parquet_file} does not exist." + ) + return link_run.spark.read.parquet(parquet_file) + else: + if not path.exists(file): + raise ValueError( + f"Within [{section_name}] path of file {file} does not exist." + ) + _, file_extension = path.splitext(file) + if file_extension == ".csv": + return link_run.spark.read.csv(file, header=True) + elif file_extension == ".parquet": + return link_run.spark.read.parquet(file) + else: + raise ValueError( + f"Within [{section_name}] file {file} is neither a CSV file nor a parquet file."
+ ) diff --git a/hlink/scripts/lib/experimental/__init__.py b/hlink/scripts/lib/experimental/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/scripts/lib/experimental/reporting.py b/hlink/scripts/lib/experimental/reporting.py new file mode 100755 index 0000000..ce472c4 --- /dev/null +++ b/hlink/scripts/lib/experimental/reporting.py @@ -0,0 +1,112 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import subprocess +import pyspark.sql.functions as pyspark_funcs + + +def export_crosswalk(spark, output_path, variables, include_round): + crosswalk_vars = ["histid_a", "histid_b"] + + if include_round: + crosswalk_vars.append("round") + + if "histid" not in variables: + variables.append("histid") + + # We are accessing these records in order to attach extra contextual + # variables to the pairs of people in the predicted matches tables. + raw_df_a = spark.table("raw_df_a") + raw_df_b = spark.table("raw_df_b") + + # We are adding (unioning) together individual matches (predicted_matches) and + # household matches (hh_predicted_matches) + # + # The addition of 'round' is to identify which matches came from which matching round. + # We may or may not select and export it depending on the --include-round flag. + predicted_matches = spark.table("predicted_matches").withColumn( + "round", pyspark_funcs.lit(1) + ) + + hh_predicted_matches = spark.table("hh_predicted_matches").withColumn( + "round", pyspark_funcs.lit(2) + ) + + for variable in variables: + if variable not in [c.lower() for c in raw_df_a.columns]: + print(f"Error: variable '{variable}' does not exist in raw_df_a.") + return + if variable not in [c.lower() for c in raw_df_b.columns]: + print(f"Error: variable '{variable}' does not exist in raw_df_b.") + return + + all_matches = predicted_matches.select(crosswalk_vars).unionByName( + hh_predicted_matches.select(crosswalk_vars) + ) + + # Make distinct sets of variable names for the a and b datasets + columns_a = [f"{variable} as {variable}_a" for variable in variables] + columns_b = [f"{variable} as {variable}_b" for variable in variables] + + raw_df_a_selected = raw_df_a.selectExpr(columns_a) + raw_df_b_selected = raw_df_b.selectExpr(columns_b) + + all_matches_with_selections = all_matches.join(raw_df_a_selected, "histid_a").join( + raw_df_b_selected, "histid_b" + ) + + if "csv" in output_path.split("."): + export_csv(all_matches_with_selections, output_path) + else: + export_fixed_width(variables, all_matches_with_selections, output_path) + + +def export_csv(all_matches_with_selections, output_path): + output_tmp = output_path + ".tmp" + all_matches_with_selections.write.csv(output_tmp, header=False) + header = ( + '"' + '","'.join([col.name for col in all_matches_with_selections.schema]) + '"' + ) + + commands = [ + f"echo '{header}' > {output_path}", + f"cat {output_tmp}/* >> {output_path} ", + f"rm -rf {output_tmp}", + ] + + for command in commands: + subprocess.run(command, shell=True) + + +def export_fixed_width(variables, all_matches_with_selections, output_path): + output_tmp = output_path + ".tmp" + sizes = { + "histid": 36, + "serialp": 8, + "pernum": 4, + "age": 3, + "sex": 1, + "statefip_p": 2, + "bpl": 5, + } + fw_columns_a = [] + fw_columns_b = [] + for variable in variables: + size = sizes.get(variable, 15) + fw_columns_a.append( + [f"LPAD({variable}_a, {size}, ' ') as 
{variable}_a", size, f"{variable}_a"] + ) + fw_columns_b.append( + [f"LPAD({variable}_b, {size}, ' ') as {variable}_b", size, f"{variable}_b"] + ) + all_column_selects = [c[0] for c in (fw_columns_a + fw_columns_b)] + + [print(f"{c[2]} - {c[1]}") for c in (fw_columns_a + fw_columns_b)] + all_matches_fixed_width = all_matches_with_selections.selectExpr(all_column_selects) + all_matches_fixed_width.selectExpr("CONCAT_WS('', *)").write.text(output_tmp) + + commands = [f"cat {output_tmp}/* >> {output_path} ", f"rm -rf {output_tmp}"] + for command in commands: + subprocess.run(command, shell=True) diff --git a/hlink/scripts/lib/experimental/tfam.py b/hlink/scripts/lib/experimental/tfam.py new file mode 100644 index 0000000..a6a86d3 --- /dev/null +++ b/hlink/scripts/lib/experimental/tfam.py @@ -0,0 +1,295 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.linking.link_task import LinkTask + + +def tfam(link_run, id_col, id_a, id_b): + cols = [ + f"{id_col}", + "serialp", + "pernum", + "relate", + "namefrst", + "namelast", + "age", + "birthyr", + "sex", + "race", + "marst", + "durmarr", + "bpl", + "nativity", + "citizen", + "mbpl", + "fbpl", + "statefip_p", + "street_p", + "countyicp_p", + "region_p", + ] + + if ( + link_run.get_table("training_data").exists() + and link_run.get_table("training_features").exists() + ): + pass + else: + link_run.training.run_step(0) + link_run.training.run_step(1) + + table_a, table_b = _prep_tfam_tables( + link_run, "tfam_table", "training_data", cols, id_col + ) + + serialp_a = table_a.filter(f"{id_col} == '{id_a}'").take(1)[0]["serialp"] + serialp_b = table_b.filter(f"{id_col} == '{id_b}'").take(1)[0]["serialp"] + print("Family 1900:") + table_a.where(f"serialp = '{serialp_a}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1910:") + table_b.where(f"serialp = '{serialp_b}'").orderBy("pernum").select(cols).show( + truncate=False + ) + + true_match = ( + link_run.spark.table("training_data") + .where(f"histid_a = '{id_a}' AND histid_b != '{id_b}' AND match = 1") + .take(1) + ) + + if len(true_match) != 0: + print("Match family 1910:") + histid_b_tm = true_match[0][f"{id_col}_b"] + print(histid_b_tm) + serialp_matched = table_b.filter(f"{id_col} == '{histid_b_tm}'").take(1)[0][ + "serialp" + ] + table_b.where(f"serialp = '{serialp_matched}'").orderBy("pernum").select( + cols + ).show(truncate=False) + + print("All hits:") + + tfam_table = link_run.get_table("tfam_hits") + if tfam_table.exists(): + table_hits = tfam_table.df() + else: + sql = """ + select pa.histid as histid_a, pb.histid as histid_b, pa.namefrst as namefrst_a, pa.namelast as namelast_a, pb.namefrst as namefrst_b, pb.namelast as namelast_b, pa.mbpl as mbpl_a, pb.mbpl as mbpl_b, pa.fbpl as fbpl_a, pb.fbpl as fbpl_b, pa.statefip_p as state_a, pb.statefip_p as state_b, pa.countyicp_p as county_a, pb.countyicp_p as county_b, tf.namefrst_jw, tf.namelast_jw + from training_features tf + left join raw_df_a pa on pa.histid = tf.histid_a + join raw_df_b pb on pb.histid = tf.histid_b + """ + table_hits = LinkTask(link_run).run_register_sql( + name=tfam_table.name, sql=sql, persist=True + ) + + table_hits.where(f"{id_col}_a == '{id_a}'").orderBy( + ["namelast_jw", "namefrst_jw"], ascending=False + ).show(20, False) + + print("Features:") + link_run.spark.table("training_features").where( + f"histid_a = 
'{id_a}' AND histid_b = '{id_b}'" + ).show(100, False) + + +def tfam_raw(link_run, id_col, id_a, id_b): + cols = [ + f"{id_col}", + "serial_p", + "pernum", + "relate", + "namefrst", + "namelast", + "age", + "birthyr", + "sex", + "race", + "marst", + "durmarr", + "bpl", + "nativity", + "citizen", + "mbpl", + "fbpl", + "statefip_p", + "street_p", + "countyicp_p", + "region_p", + ] + + table_a = link_run.spark.table("raw_df_a") + table_b = link_run.spark.table("raw_df_b") + + serialp_a = table_a.filter(f"{id_col} == '{id_a}'").take(1)[0]["SERIAL_P"] + serialp_b = table_b.filter(f"{id_col} == '{id_b}'").take(1)[0]["SERIAL_P"] + print("Family 1900:") + table_a.where(f"SERIAL_P = '{serialp_a}'").orderBy("PERNUM").select(cols).show( + truncate=False + ) + print("Family 1910:") + table_b.where(f"SERIAL_P = '{serialp_b}'").orderBy("PERNUM").select(cols).show( + truncate=False + ) + + +def hh_tfam(link_run, id_col, id_a, id_b): + cols = [ + f"{id_col}", + "serialp", + "pernum", + "relate", + "namefrst", + "namelast", + "age", + "birthyr", + "sex", + "race", + "marst", + "durmarr", + "bpl", + "nativity", + "citizen", + "mbpl", + "fbpl", + "statefip_p", + "street_p", + "countyicp_p", + "region_p", + ] + + table_a, table_b = _prep_tfam_tables( + link_run, "hh_tfam_table", "hh_predicted_matches", cols, id_col + ) + + serialp_a = table_a.filter(f"{id_col} == '{id_a}'").take(1)[0]["serialp"] + serialp_b = table_b.filter(f"{id_col} == '{id_b}'").take(1)[0]["serialp"] + print("Family 1900:") + table_a.where(f"serialp = '{serialp_a}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1910:") + table_b.where(f"serialp = '{serialp_b}'").orderBy("pernum").select(cols).show( + truncate=False + ) + + +def hh_tfam_2a(link_run, id_col, id_a1, id_a2, id_b): + cols = [ + "serialp", + "pernum", + "relate", + "age", + "birthyr", + "sex", + "race", + "marst", + "durmarr", + "bpl", + "nativity", + "citizen", + "namelast", + "namefrst", + "mbpl", + "fbpl", + "statefip_p", + "street_p", + "countyicp_p", + "region_p", + "histid", + ] + + table_a, table_b = _prep_tfam_tables( + link_run, "hh_tfam_table", "hh_predicted_matches", cols, id_col + ) + + serialp_a1 = table_a.filter(f"{id_col} == '{id_a1}'").take(1)[0]["serialp"] + serialp_a2 = table_a.filter(f"{id_col} == '{id_a2}'").take(1)[0]["serialp"] + serialp_b = table_b.filter(f"{id_col} == '{id_b}'").take(1)[0]["serialp"] + print("Family 1900 option 1:") + table_a.where(f"serialp = '{serialp_a1}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1900 option 2:") + table_a.where(f"serialp = '{serialp_a2}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1910:") + table_b.where(f"serialp = '{serialp_b}'").orderBy("pernum").select(cols).show( + truncate=False + ) + + +def hh_tfam_2b(link_run, id_col, id_a, id_b1, id_b2): + cols = [ + f"{id_col}", + "serialp", + "pernum", + "relate", + "namefrst", + "namelast", + "age", + "birthyr", + "sex", + "race", + "marst", + "durmarr", + "bpl", + "nativity", + "citizen", + "mbpl", + "fbpl", + "statefip_p", + "street_p", + "countyicp_p", + "region_p", + ] + + table_a, table_b = _prep_tfam_tables( + link_run, "hh_tfam_table", "hh_predicted_matches", cols, id_col + ) + + serialp_a = table_a.filter(f"{id_col} == '{id_a}'").take(1)[0]["serialp"] + serialp_b1 = table_b.filter(f"{id_col} == '{id_b1}'").take(1)[0]["serialp"] + serialp_b2 = table_b.filter(f"{id_col} == '{id_b2}'").take(1)[0]["serialp"] + print("Family 1900:") + table_a.where(f"serialp = 
'{serialp_a}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1910 option 1:") + table_b.where(f"serialp = '{serialp_b1}'").orderBy("pernum").select(cols).show( + truncate=False + ) + print("Family 1910 option 2:") + table_b.where(f"serialp = '{serialp_b2}'").orderBy("pernum").select(cols).show( + truncate=False + ) + + +def _prep_tfam_tables(link_run, table_name, source_table, cols, id_col): + tables = [] + for a_or_b in ["a", "b"]: + table = link_run.get_table(f"{table_name}_{a_or_b}") + if table.exists(): + tables.append(table.df()) + else: + tables.append( + LinkTask(link_run).run_register_sql( + name=table.name, + template="tfam_tables", + t_ctx={ + "a_or_b": a_or_b, + "cols": cols, + "id": id_col, + "source_table": f"{source_table}", + }, + persist=True, + ) + ) + return tables[0], tables[1] diff --git a/hlink/scripts/lib/io.py b/hlink/scripts/lib/io.py new file mode 100644 index 0000000..b89cf11 --- /dev/null +++ b/hlink/scripts/lib/io.py @@ -0,0 +1,91 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import os +import subprocess + +from pyspark.sql.types import StringType + +from hlink.scripts.lib.util import report_and_log_error + + +def write_table_to_csv(spark, table_name, output_path, num_partitions=None): + """Write a spark table to csv. + + `num_partitions` can be used to partition the output, creating a directory + with multiple csv files. + + Args: + spark (SparkSession) + table_name (str): the name of the table to write out + output_path (str): the output path to write the csv to + num_partitions (int, optional): How many partitions to use when writing the csv. Defaults to None. 
+ """ + df = spark.table(table_name) + selects = [] + for col in df.schema: + if col.dataType.typeName() == "array": + selects.append(f"array_to_string({table_name}.{col.name})") + elif col.dataType.typeName() == "vectorudt": + selects.append(f"vector_to_string({table_name}.{col.name})") + elif col.dataType.typeName() == "map": + selects.append(f"CAST({col.name} as STRING) as {col.name}") + else: + selects.append(col.name) + sql_selects = ",\n ".join(f for f in selects) + + col_names = [col.name for col in df.schema] + + spark.udf.registerJavaFunction( + "array_to_string", "com.isrdi.udfs.ArrayToString", StringType() + ) + spark.udf.registerJavaFunction( + "vector_to_string", "com.isrdi.udfs.VectorToString", StringType() + ) + + df_selected = spark.sql(f"SELECT {sql_selects} FROM {table_name}") + if num_partitions is not None: + df_selected.repartition(num_partitions).write.csv( + output_path, sep=",", header=True, quoteAll=True + ) + else: + output_tmp = output_path + ".tmp" + df_selected.write.csv(output_tmp, sep=",", header=False, quoteAll=True) + + header = '"' + '","'.join(col_names) + '"' + commands = [ + f"echo '{header}' > {output_path}", + f"cat {output_tmp}/* >> {output_path} ", + f"rm -rf {output_tmp}", + ] + for command in commands: + subprocess.run(command, shell=True) + + +def read_csv_and_write_parquet(spark, csv_path, parquet_path): + """Read in csv and write it out to parquet.""" + spark.read.csv(csv_path, header=True, nullValue="").na.fill("").write.parquet( + parquet_path + ) + + +def load_external_table(spark, input_path, table_name): + """Load an external datasource into spark as a table.""" + spark.catalog.createExternalTable(table_name, path=input_path) + + +def borrow_spark_tables(spark, borrow_tables_from): + table_dirs = [f.path for f in os.scandir(borrow_tables_from) if f.is_dir()] + + for t in table_dirs: + try: + table_name = os.path.basename(t) + print(f"Borrowing:\t{table_name}...\t\t\t", end="") + + spark.catalog.createTable(table_name, path=t) + print("SUCCEEDED") + except Exception as err: + print("FAILED") + report_and_log_error("Error borrowing " + table_name, err) diff --git a/hlink/scripts/lib/linking_ops.py b/hlink/scripts/lib/linking_ops.py new file mode 100644 index 0000000..2338633 --- /dev/null +++ b/hlink/scripts/lib/linking_ops.py @@ -0,0 +1,74 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from colorama import init as colorama_init, deinit as colorama_deinit, Fore, Style + + +def show_step_info(link_task, link_run): + """Show step information for the given `link_task`.""" + colorama_init() + print(Fore.CYAN + f"Link task: {link_task}" + Style.RESET_ALL) + + steps = link_task.get_steps() + tables = link_run.known_tables + + for (i, step) in enumerate(steps): + print(Fore.GREEN + f"step {i}: {step}" + Style.RESET_ALL) + + print("\tTables used:") + for input_table_name in step.input_table_names: + print(f"\t\t{tables[input_table_name]}") + + if len(step.input_model_names) > 0: + print("\tModels loaded:") + for input_model_name in step.input_model_names: + print(Fore.MAGENTA + f"\t\t{input_model_name}" + Style.RESET_ALL) + + print("\tTables created:") + for output_table_name in step.output_table_names: + print(f"\t\t{tables[output_table_name]}") + + if len(step.output_model_names) > 0: + print("\tModels saved:") + for output_model_name in step.output_model_names: + print(Fore.MAGENTA + f"\t\t{output_model_name}" + Style.RESET_ALL) + + colorama_deinit() + + +def show_tasks(current_task, link_run, link_task_choices): + """Show information about the available link tasks for the link run. + + Args: + current_task (LinkTask): the currently active link task + link_run (LinkRun) + link_task_choices (Dict[str, LinkTask]): a dict mapping string names to link tasks + """ + colorama_init() + print(Fore.CYAN + f"Current link task: {current_task}") + + print("Linking task choices are: " + Style.RESET_ALL) + for link_task in link_task_choices: + task_inst = link_run.get_task(link_task) + print(Fore.GREEN + f"{link_task} :: {task_inst}" + Style.RESET_ALL) + + input_tables = set() + output_tables = set() + for step in task_inst.get_steps(): + input_tables.update(set(step.input_table_names)) + output_tables.update(set(step.output_table_names)) + + input_tables = input_tables - output_tables + + if len(input_tables) == 0: + print("\tRequires no preexisting tables.") + else: + print("\tRequires tables: " + str(input_tables)) + if len(output_tables) == 0: + print("\tProduces no persistent tables.") + else: + print("\tProduces tables: " + str(output_tables)) + + colorama_deinit() diff --git a/hlink/scripts/lib/table_ops.py b/hlink/scripts/lib/table_ops.py new file mode 100644 index 0000000..532fd25 --- /dev/null +++ b/hlink/scripts/lib/table_ops.py @@ -0,0 +1,143 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.sql.types import StructType, StructField, StringType +import pyspark.sql.functions as pyspark_funcs +import pandas as pd + + +def run_and_show_sql(spark, sql, limit=100, truncate=True): + """Run the given sql query and print the results. + + Args: + spark (SparkSession) + sql (str): the sql query to run + limit (int, optional): The maximum number of rows to show. Defaults to 100. + truncate (bool, optional): Whether to shorten long strings in the output. Defaults to True. 
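+ + Illustrative usage (the table name is only an example): + run_and_show_sql(spark, "SELECT COUNT(*) FROM prepped_df_a", truncate=False)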
+ """ + spark.sql(sql).show(limit, truncate=truncate) + + +def show_table_row_count(spark, table_name): + spark.sql(f"SELECT COUNT(*) FROM {table_name}").show() + + +def show_table_columns(spark, table_name): + spark.sql(f"DESCRIBE {table_name}").show(1000, truncate=False) + + +def show_column_summary(spark, table_name, col_name): + spark.table(table_name).select(col_name).summary().show() + + +def show_column_tab(spark, table_name, col_name): + """Print a tabulation of the given column in the given table.""" + spark.table(table_name).groupBy(col_name).count().orderBy(col_name).show( + 100, truncate=False + ) + + +def show_table(spark, table_name, limit=10, truncate=True): + """Print the first `limit` rows of the table with the given name. + + Args: + spark (SparkSession) + table_name (str): the name of the table to show + limit (int, optional): How many rows of the table to show. Defaults to 10. + truncate (bool, optional): Whether to truncate long strings in the output or not. Defaults to True. + """ + spark.sql(f"SELECT * FROM {table_name}").show(limit, truncate=truncate) + + +def list_tables(link_run, list_all=False): + """Print some information on the currently existing spark tables. + + Args: + link_run (LinkRun) + list_all (bool, optional): Whether to show all tables, or just those marked as important by the link run. Defaults to False. + """ + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("description", StringType(), True), + ] + ) + table_descs = {table.name: table.desc for table in link_run.known_tables.values()} + pd_df1 = pd.DataFrame.from_dict(table_descs, orient="index") + pd_df1.reset_index(inplace=True) + pd_df1.rename(columns={"index": "name", 0: "description"}, inplace=True) + df1 = link_run.spark.createDataFrame(pd_df1, schema) + df2 = link_run.spark.sql("SHOW tables") + if not list_all: + # Print only important tables + important_tables = [ + table + for table in link_run.known_tables + if not link_run.known_tables[table].hide + ] + df2 = df2.filter(df2["tableName"].isin(important_tables)) + df2.join(df1, df2.tableName == df1.name, "left").orderBy( + "description", "tableName" + ).drop("name").show(1000, truncate=False) + + +def drop_table(link_run, table_name): + table = link_run.get_table(table_name) + + if table.exists(): + print(f"Dropping {table.name}") + table.drop() + else: + print(f"Table {table.name} doesn't exist; no need to drop") + + +def drop_tables_satisfying(link_run, cond): + """Drop all spark tables satisfying the given condition. + + `cond` is passed spark table info objects as returned by `spark.catalog.listTables()`. + Tables for which `cond` evaluates to True will be dropped. 
+ + Args: + link_run (LinkRun) + cond (spark table -> bool): filtering function to determine which tables should be dropped + """ + all_tables = link_run.spark.catalog.listTables() + satis_tables = filter(cond, all_tables) + + for table in satis_tables: + print(f"Dropping {table.name}") + link_run.get_table(table.name).drop() + + +def drop_all_tables(link_run): + drop_tables_satisfying(link_run, (lambda _: True)) + + +def drop_prc_tables(link_run): + """Drop all precision_recall_curve-related tables.""" + drop_tables_satisfying(link_run, (lambda t: "precision_recall_curve" in t.name)) + + +def persist_table(spark, table_name): + """Make the given table permanent.""" + spark.table(table_name).write.mode("overwrite").saveAsTable(table_name) + + +def take_table_union(spark, table1_name, table2_name, output_table_name, mark_col_name): + """Create the union of two tables as a new temporary table. + + Args: + spark (SparkSession) + table1_name (str): the name of the first table + table2_name (str): the name of the second table + output_table_name (str): the name of the destination table + mark_col_name (str): the name of the column used to mark which table the row came from + """ + t1 = spark.table(table1_name).withColumn(mark_col_name, pyspark_funcs.lit(True)) + t2 = spark.table(table2_name).withColumn(mark_col_name, pyspark_funcs.lit(False)) + new_cols = list(set(t1.columns).intersection(t2.columns)) + t1.select(new_cols).unionByName(t2.select(new_cols)).createOrReplaceTempView( + output_table_name + ) diff --git a/hlink/scripts/lib/util.py b/hlink/scripts/lib/util.py new file mode 100644 index 0000000..7aa6e63 --- /dev/null +++ b/hlink/scripts/lib/util.py @@ -0,0 +1,29 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import sys +import logging +import traceback + + +def report_and_log_error(message: str, err: Exception): + print(f"An error occurred: {message}") + i = sys.exc_info() + print(f"ERROR type: {type(err)}") + print(f"ERROR message: {i[1]}") + print("See log for details.") + print("") + # Perhaps for a verbose mode: + # traceback.print_exception("",err,i[2]) + multi_line = "\n==========\n" + + logging.error( + str(i[0]) + + " : " + + str(i[1]) + + multi_line + + str.join("", traceback.format_exception(type(err), err, i[2])) + + multi_line + ) diff --git a/hlink/scripts/main.py b/hlink/scripts/main.py new file mode 100755 index 0000000..49de698 --- /dev/null +++ b/hlink/scripts/main.py @@ -0,0 +1,274 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import argparse +import getpass +from importlib import reload +import logging +import os +from pathlib import Path +import json +import pkg_resources +import readline +import sys +import traceback +import uuid + +from hlink.spark.session import SparkConnection +from hlink.configs.load_config import load_conf_file +from hlink.errors import SparkError, UsageError +from hlink.scripts.lib.util import report_and_log_error +from hlink.linking.link_run import LinkRun +from hlink.scripts.main_loop import Main +from hlink.scripts.lib.conf_validations import analyze_conf +from hlink.scripts.lib.table_ops import drop_all_tables + + +def load_conf(conf_name, user): + """Load and return the hlink config dictionary. + + Add the following attributes to the config dictionary: + "derby_dir", "warehouse_dir", "spark_tmp_dir", "log_file", "python", "conf_path" + """ + if "HLINK_CONF" not in os.environ: + global_conf = None + else: + global_conf_file = os.environ["HLINK_CONF"] + with open(global_conf_file) as f: + global_conf = json.load(f) + + run_name = Path(conf_name).stem + + if global_conf is None: + current_dir = Path.cwd() + hlink_dir = current_dir / "hlink_config" + base_derby_dir = hlink_dir / "derby" + base_warehouse_dir = hlink_dir / "warehouse" + base_spark_tmp_dir = hlink_dir / "spark_tmp_dir" + conf = load_conf_file(conf_name) + + conf["derby_dir"] = base_derby_dir / run_name + conf["warehouse_dir"] = base_warehouse_dir / run_name + conf["spark_tmp_dir"] = base_spark_tmp_dir / run_name + conf["log_file"] = hlink_dir / "run.log" + conf["python"] = sys.executable + else: + user_dir = Path(global_conf["users_dir"]) / user + user_dir_fast = Path(global_conf["users_dir_fast"]) / user + conf_dir = user_dir / "confs" + conf_path = conf_dir / conf_name + conf = load_conf_file(str(conf_path)) + + conf["derby_dir"] = user_dir / "derby" / run_name + conf["warehouse_dir"] = user_dir_fast / "warehouse" / run_name + conf["spark_tmp_dir"] = user_dir_fast / "tmp" / run_name + conf["log_file"] = user_dir / "hlink.log" + conf["python"] = global_conf["python"] + + print(f"*** Using config file {conf['conf_path']}") + return conf + + +def cli(): + """Called by the hlink script. Referenced in setup.py.""" + if "--version" in sys.argv: + version = pkg_resources.get_distribution("hlink").version + print(f"Hlink version: {version}") + return + args = _parse_args() + + try: + if args.conf: + run_conf = load_conf(args.conf, args.user) + else: + raise Exception( + "ERROR: You must specify a config file to use by including either the --run or --conf flag in your program call." 
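+ # --run is accepted as an alias of --conf (see _parse_args below)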
+ ) + except UsageError: + print("Exception setting up config") + i = sys.exc_info() + print(i[1]) + sys.exit(1) + except Exception as err: + i = sys.exc_info() + print(i[0]) + print(i[1]) + # traceback.print_tb(i[2]) + traceback.print_exception("", err, None) + sys.exit(1) + + run_conf["mesos"] = args.mesos + + spark = _get_spark(run_conf, args) + _setup_logging(run_conf) + history_file = os.path.expanduser("~/.history_hlink") + _read_history_file(history_file) + + try: + if args.execute_tasks: + main = Main(LinkRun(spark, run_conf, use_preexisting_tables=False)) + main.preloop() + + task_list = " ".join(args.execute_tasks) + + main.do_run_all_steps(task_list) + elif args.execute_command: + main = Main( + LinkRun(spark, run_conf, use_preexisting_tables=False), + start_task=args.task, + ) + main.preloop() + command = " ".join(args.execute_command) + print(f"Running Command: {command}") + main.onecmd(command) + else: + _cli_loop(spark, args, run_conf) + readline.write_history_file(history_file) + spark.stop() + except RuntimeError as err: + report_and_log_error("Runtime Error", err) + sys.exit(1) + + except SparkError as err: + report_and_log_error("Spark Error", err) + sys.exit(1) + + except Exception as err: + report_and_log_error("Unclassified Error", err) + sys.exit(1) + + +def _parse_args(): + parser = argparse.ArgumentParser(description="Historical linking program.") + parser.add_argument( + "--mesos", + help="run on mesos at isrdi. Must be on isrdi machines to work.", + action="store_true", + ) + parser.add_argument( + "--user", help="run as a specific user", default=getpass.getuser() + ) + parser.add_argument( + "--cores", help="the max number of cores to use on mesos", default=4, type=int + ) + parser.add_argument( + "--executor_memory", help="the memory per executor to use", default="10G" + ) + parser.add_argument( + "--task", help="The initial task to begin processing.", default="preprocessing" + ) + parser.add_argument( + "--execute_tasks", + help="Execute a series of tasks then exit the program.", + nargs="+", + ) + parser.add_argument( + "--execute_command", + help="Execute a single command then exit the program.", + nargs="+", + ) + parser.add_argument( + "--conf", + "--run", + help="Specify a filepath where your config file for the run is located.", + ) + parser.add_argument( + "--clean", + help="Drop any preexisting Spark tables when hlink starts up.", + action="store_true", + ) + + return parser.parse_args() + + +def _get_spark(run_conf, args): + spark_connection = SparkConnection( + run_conf["derby_dir"], + run_conf["warehouse_dir"], + run_conf["spark_tmp_dir"], + run_conf["python"], + "linking", + ) + if not (args.mesos): + spark = spark_connection.local( + cores=args.cores, executor_memory=args.executor_memory + ) + else: + spark = spark_connection.mesos( + "mesos://mpc-cluster.pop.umn.edu:5050", + cores=args.cores, + executor_memory=args.executor_memory, + ) + return spark + + +def _read_history_file(history_file): + if not (os.path.exists(history_file)): + with open(history_file, "a"): + os.utime(history_file, (1330712280, 1330712292)) + readline.read_history_file(history_file) + + +def _cli_loop(spark, args, run_conf): + if args.clean: + print("Dropping preexisting tables") + drop_all_tables(LinkRun(spark, run_conf)) + + try: + print("Analyzing config file") + analyze_conf(LinkRun(spark, run_conf)) + except ValueError as err: + report_and_log_error("", err) + + while True: + main = Main(LinkRun(spark, run_conf), start_task=args.task) + try: + main.cmdloop() + if 
main.lastcmd == "reload": + _reload_modules() + # Reload modules twice in order to fix import problem + # with the _*.py files in the linking modules + _reload_modules() + run_conf = load_conf(args.conf, args.user) + else: + break + except Exception as err: + report_and_log_error("", err) + + +def _reload_modules(): + no_reloads = [] + mods_to_reload_raw = [name for name, mod in sys.modules.items()] + # We need to order the modules to reload the _*.py files in the + # linking modules before loading the __init__.py files. + mods_to_reload_ordered = sorted(mods_to_reload_raw)[::-1] + for name in mods_to_reload_ordered: + if name.startswith("hlink") and name not in no_reloads: + reload(sys.modules[name]) + + # Here we should reset the classes in link_run.link_task_choices with + # the newly reloaded classes. + + +def _setup_logging(conf): + log_file = conf["log_file"] + user = getpass.getuser() + session_id = uuid.uuid4().hex + # format_string = f"%(levelname)s %(asctime)s {user} {session_id} %(message)s -- {conf['conf_path']}" + format_string = "%(levelname)s %(asctime)s -- %(message)s" + print(f"*** Hlink log: {log_file}") + + logging.basicConfig(filename=log_file, level=logging.INFO, format=format_string) + + logging.info("") + logging.info( + "-------------------------------------------------------------------------------------" + ) + logging.info(f" New Session {session_id} by user {user} ") + logging.info(f" Configured with {conf['conf_path']}") + logging.info( + "-------------------------------------------------------------------------------------" + ) + logging.info("") diff --git a/hlink/scripts/main_loop.py b/hlink/scripts/main_loop.py new file mode 100755 index 0000000..e734ee2 --- /dev/null +++ b/hlink/scripts/main_loop.py @@ -0,0 +1,553 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from cmd import Cmd +import logging +from typing import Optional +import functools +from timeit import default_timer as timer + +from hlink.errors import UsageError +from hlink.linking.link_run import link_task_choices + +import hlink.scripts.lib.experimental.tfam as x_tfam +import hlink.scripts.lib.experimental.reporting as x_reporting +import hlink.scripts.lib.table_ops as table_ops +import hlink.scripts.lib.io as io +import hlink.scripts.lib.linking_ops as linking_ops +import hlink.scripts.lib.conf_validations as conf_validations + + +def split_and_check_args(expected_count): + """A parametrized decorator to make handling arguments easier for `Main` methods. + + The decorator splits the string `args` and checks the count of split args against `expected_count`. + If the count is allowed, it passes the split args along to the wrapped function. Otherwise, + it immediately returns None. It uses `Main.check_arg_count()` to check the split args. + The docstring of the decorated method is used as the `command_docs` argument to `Main.check_arg_count()`. + + Decorated methods should take `self` and `split_args` as their arguments. + """ + + def decorator(f): + @functools.wraps(f) + def wrapper(self, args): + split_args = args.split() + if self.check_arg_count(split_args, expected_count, f.__doc__): + return + return f(self, split_args) + + return wrapper + + return decorator + + +class Main(Cmd): + """Main program which handles user input. 
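+ Each do_* method implements one interactive command, so typing "run_step 0" at the hlink $ prompt dispatches to do_run_step("0").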
See https://docs.python.org/3/library/cmd.html for more information.""" + + prompt = "hlink $ " + intro = "Welcome to hlink. Type ? to list commands and q to quit.\n" + + def __init__( + self, + link_run, + start_task: Optional[str] = None, + ): + self.link_run = link_run + self.spark = self.link_run.spark + + if start_task is None: + self.current_link_task = self.link_run.preprocessing + else: + self.current_link_task = self.link_run.get_task(start_task) + + super().__init__() + + def preloop(self): + self.reload_auto_complete_cache() + + def reload_auto_complete_cache(self): + self.table_names = [t.name for t in self.spark.catalog.listTables()] + + def precmd(self, line): + return line + + # These are meant to be flags / switches, not long options with arguments following them + def extract_flags_from_args(self, applicable_flags, split_args): + """Separates the flags from the regular arguments, checks that flags + passed in match the applicable flags for the command. + arg1: list of applicable flags + arg2: Pre-split list of arguments including flags given by user.""" + + flags = [a for a in split_args if "--" in a] + non_flag_args = [a for a in split_args if "--" not in a] + + unsupported_flags = set(flags) - set(applicable_flags) + if unsupported_flags: + raise UsageError( + f"The flags {unsupported_flags} aren't supported by this command. Supported flags are {applicable_flags}" + ) + + return flags, non_flag_args + + def check_arg_count(self, split_args, expected_count, command_docs): + """Checks the number of arguments submitted against the expected number(s). + + Args: + split_args (List[str]): the arguments submitted + expected_count (int | List[int]): the expected number of arguments, or a list of allowed numbers of arguments + command_docs (str): the help documentation for the calling command + + Returns: + bool: True if the argument count is incorrect, False if it is correct + """ + num_args = len(split_args) + # Special case: there are actually 0 args if the only provided arg is the empty string + if num_args == 1 and split_args[0] == "": + num_args = 0 + + expected_counts = ( + expected_count if type(expected_count) == list else [expected_count] + ) + expected_counts_str = " or ".join(map(str, expected_counts)) + + arg_form = "argument" if expected_counts == [1] else "arguments" + + if num_args not in expected_counts: + print("Argument error!") + print( + f"This command takes {expected_counts_str} {arg_form} and you gave {num_args}." + ) + print("See the command description below:") + print(command_docs) + return True + + return False + + def emptyline(self): + return False + + @split_and_check_args(0) + def do_q(self, split_args): + """Quits the program. + Usage: q""" + return True + + @split_and_check_args(0) + def do_reload(self, split_args): + """Hot reload modules. + Usage: reload""" + return "reload" + + @split_and_check_args(1) + def do_set_link_task(self, split_args): + """Set the linking task to run steps for. + Arg 1: task + To retrieve a list of valid linking tasks, use the command 'get_tasks'.""" + link_task = split_args[0] + if link_task in link_task_choices: + self.current_link_task = self.link_run.get_task(link_task) + print(f"Set to: {self.current_link_task}") + else: + choices = ", \n\t".join(link_task_choices.keys()) + print(f"Invalid choice. 
\nValid choices are: \n\t{choices}")
+
+    def complete_set_link_task(self, text, line, begidx, endidx):
+        return [t for t in link_task_choices if t.startswith(text)]
+
+    @split_and_check_args(0)
+    def do_set_preexisting_tables(self, split_args):
+        """Toggle the preexisting tables flag. When enabled, steps are skipped if their output tables already exist.
+        Default setting is True (uses pre-existing tables).
+        Usage: set_preexisting_tables"""
+        self.link_run.use_preexisting_tables = not self.link_run.use_preexisting_tables
+        print(f"Use preexisting tables: {self.link_run.use_preexisting_tables}")
+
+    @split_and_check_args(0)
+    def do_set_print_sql(self, split_args):
+        """Toggle the print sql flag.
+        Default setting is False.
+        Usage: set_print_sql"""
+        self.link_run.print_sql = not self.link_run.print_sql
+        print(f"Print sql: {self.link_run.print_sql}")
+
+    @split_and_check_args(0)
+    def do_get_settings(self, split_args):
+        """Show the current settings which can be toggled.
+        Current settings displayed include:
+        - use pre-existing tables
+        - print SQL
+        - current linking task.
+        Usage: get_settings"""
+        print(f"Use preexisting tables: {self.link_run.use_preexisting_tables}")
+        print(f"Print sql: {self.link_run.print_sql}")
+        print(f"Current link task: {self.current_link_task}")
+
+    @split_and_check_args(0)
+    def do_ipython(self, split_args):
+        """Open an ipython shell.
+        Usage: ipython"""
+        import IPython
+
+        IPython.embed()
+
+    @split_and_check_args(0)
+    def do_get_tasks(self, split_args):
+        """Get all of the available linking tasks.
+        Usage: get_tasks
+        Hint: Specify the current linking task using the 'set_link_task' command."""
+        linking_ops.show_tasks(self.current_link_task, self.link_run, link_task_choices)
+
+    @split_and_check_args(0)
+    def do_get_steps(self, split_args):
+        """Get all of the steps for the current linking task.
+        Usage: get_steps
+        Hint: Specify the current linking task using the 'set_link_task' command."""
+        linking_ops.show_step_info(self.current_link_task, self.link_run)
+
+    @split_and_check_args(1)
+    def do_run_step(self, split_args):
+        """Run the specified step of the current linking task.
+        Arg 1: step number (an integer)
+        Hint: Use the command 'get_steps' to fetch a list of all the steps for the current linking task."""
+        print(f"Link task: {self.current_link_task}")
+        step_num = int(split_args[0])
+        self.current_link_task.run_step(step_num)
+
+    def do_run_all_steps(self, args):
+        """Run all of the linking steps within the given tasks, in the order given. If no tasks are given, run all the steps for the current task.
+        ArgN (Optional): Link tasks to run all steps for."""
+        split_args = args.split()
+        if len(split_args) > 0:
+            for link_task in split_args:
+                if link_task not in link_task_choices:
+                    print(
+                        f"Argument error! \nThis function takes a list of link tasks as arguments. \nArgument {link_task} is not a link task. 
See method description below:" + ) + print(self.do_run_all_steps.__doc__) + return + for link_task in split_args: + task_inst = self.link_run.get_task(link_task) + print(f"Running task: {task_inst}") + task_inst.run_all_steps() + print() + else: + print(f"Running task: {self.current_link_task}") + self.current_link_task.run_all_steps() + print() + + def complete_run_all_steps(self, text, line, begidx, endidx): + return [t for t in link_task_choices if t.startswith(text)] + + @split_and_check_args(0) + def do_analyze(self, split_args): + """Print an analysis of dependencies in the config.""" + conf_validations.analyze_conf(self.link_run) + + @split_and_check_args(1) + def do_count(self, split_args): + """Prints the count of rows in a table. + Arg 1: table""" + table_name = split_args[0] + table_ops.show_table_row_count(self.spark, table_name) + + def complete_count(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(1) + def do_desc(self, split_args): + """Prints the columns of a table. + Arg 1: table""" + table_name = split_args[0] + table_ops.show_table_columns(self.spark, table_name) + + def complete_desc(self, text, line, begidx, endidx): + return self.check_table_names(text) + + def do_list(self, args): + """List tables that have been registered. + Usage: list""" + split_args = args.split() + list_all = any(x in ["a", "all"] for x in split_args) + table_ops.list_tables(self.link_run, list_all=list_all) + + @split_and_check_args(1) + def do_show(self, split_args): + """Prints the first 10 lines of a table. + Arg 1: table""" + table_name = split_args[0] + table_ops.show_table(self.spark, table_name, limit=10) + + def complete_show(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(1) + def do_showf(self, split_args): + """Prints the first 10 lines of a table, without truncating data. + Arg 1: table""" + table_name = split_args[0] + table_ops.show_table(self.spark, table_name, limit=10, truncate=False) + + def complete_showf(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(1) + def do_drop(self, split_args): + """Delete a table. + Arg 1: table""" + table_name = split_args[0] + table_ops.drop_table(self.link_run, table_name) + + def complete_drop(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(0) + def do_drop_all(self, split_args): + """Delete all tables. + Usage: drop_all""" + table_ops.drop_all_tables(self.link_run) + + @split_and_check_args(0) + def do_drop_all_temp(self, split_args): + """Delete all temporary tables. + Usage: drop_all_temp""" + self.link_run.drop_temp_tables() + + @split_and_check_args(0) + def do_drop_all_prc(self, split_args): + """Delete all precision recall curve tables. + Usage: drop_all_prc""" + table_ops.drop_prc_tables(self.link_run) + + @split_and_check_args(2) + def do_x_summary(self, split_args): + """Prints a summary of a variable in a table. + [!] This command is experimental. + Arg 1: table + Arg 2: variable""" + table_name, col_name = split_args + table_ops.show_column_summary(self.spark, table_name, col_name) + + def complete_x_summary(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(2) + def do_x_tab(self, split_args): + """Prints tabulation of a variable. + [!] This command is experimental. 
+ Arg 1: table + Arg 2: var_name""" + table_name, col_name = split_args + table_ops.show_column_tab(self.spark, table_name, col_name) + + def complete_x_tab(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(1) + def do_x_persist(self, split_args): + """Takes a temporary table and makes it permanent. + [!] This command is experimental. + Arg 1: table to persist""" + table_ops.persist_table(self.spark, split_args[0]) + + def complete_x_persist(self, text, line, begidx, endidx): + return self.check_table_names(text) + + def do_x_sql(self, args): + """Runs arbitrary sql. Drop to ipython for multiline queries. + [!] This command is experimental. + Args: SQL query""" + split_args = args.split() + if len(split_args) == 0: + print( + "Argument error! \nThis function takes a SQL query as an argument. \nSee method description below:" + ) + print(self.do_x_sql.__doc__) + return + table_ops.run_and_show_sql(self.spark, args) + + def do_x_sqlf(self, args): + """Runs arbitrary sql without truncating. Drop to ipython for multiline queries. + [!] This command is experimental. + Args: SQL query""" + split_args = args.split() + if len(split_args) == 0: + print( + "Argument error! \nThis function takes a SQL query as an argument. \nSee method description below:" + ) + print(self.do_x_sqlf.__doc__) + return + table_ops.run_and_show_sql(self.spark, args, truncate=False) + + @split_and_check_args(4) + def do_x_union(self, split_args): + """Creates a new table from the union of two previous tables. + [!] This command is experimental. + Arg 1: first table + Arg 2: second table + Arg 3: output name + Arg 4: mark column""" + table1_name, table2_name, output_table_name, mark_col_name = split_args + table_ops.take_table_union( + self.spark, table1_name, table2_name, output_table_name, mark_col_name + ) + + def complete_x_union(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args([2, 3]) + def do_csv(self, split_args): + """Writes a table out to csv. + Arg 1: table name + Arg 2: path + Arg 3 (optional): # of partitions""" + table_name = split_args[0] + output_path = split_args[1] + num_args = len(split_args) + num_partitions = int(split_args[2]) if num_args == 3 else None + + io.write_table_to_csv( + self.link_run.spark, table_name, output_path, num_partitions + ) + + def complete_csv(self, text, line, begidx, endidx): + return self.check_table_names(text) + + @split_and_check_args(1) + def do_borrow_tables(self, split_args): + """Register tables from another hlink installation. Takes an absolute path to a Spark warehouse directory and a job name e.g. + borrow_tables /mnt/nas/spark/linking/ccd/warehouse/full_count_1900_1910""" + borrow_tables_from = split_args[0] + "/linking.db" + print(f"Trying to borrow tables in {borrow_tables_from}") + print("") + + io.borrow_spark_tables(self.spark, borrow_tables_from) + + print("") + print("Type 'list' to show all the available tables.") + print("") + + def check_table_names(self, check): + return [t for t in self.table_names if t.startswith(check)] + + @split_and_check_args(2) + def do_x_load(self, split_args): + """Loads in an external datasource to the database as a table. + [!] This command is experimental. + Arg 1: input_path + Arg 2: table_name""" + input_path, table_name = split_args + io.load_external_table(self.spark, input_path, table_name) + + @split_and_check_args(2) + def do_x_parquet_from_csv(self, split_args): + """Reads a csv and creates a parquet file. + [!] 
This command is experimental.
+        Arg 1: input_path
+        Arg 2: output_path"""
+        csv_path, parquet_path = split_args
+        io.read_csv_and_write_parquet(self.spark, csv_path, parquet_path)
+
+    def do_x_crosswalk(self, args):
+        """Export a crosswalk of all predicted matches for round 1 and round 2 linking.
+        [!] This command is experimental.
+        Arg 1: output path
+        Arg 2: comma separated list of variables to export
+        Usage: x_crosswalk [output_path] [list_of_variables] [--include-rounds]
+        Example: 'x_crosswalk /mypath histid,serial,pernum,sex,age,bpl'"""
+        flags, split_args = self.extract_flags_from_args(
+            ["--include-rounds"], args.split()
+        )
+        include_round = "--include-rounds" in flags
+
+        # Flags have been removed from the split_args already, so that
+        # the correct number gets checked.
+        if self.check_arg_count(split_args, 2, self.do_x_crosswalk.__doc__):
+            return
+
+        output_path, variables_string = split_args
+        variables = [v.lower() for v in variables_string.split(",")]
+
+        if include_round:
+            print("Including round numbers in exported data")
+        else:
+            print(
+                "Not including rounds in export; to include them, use the --include-rounds flag."
+            )
+
+        x_reporting.export_crosswalk(self.spark, output_path, variables, include_round)
+
+    @split_and_check_args(2)
+    def do_x_tfam(self, split_args):
+        """Show the family of a training match.
+        [!] This command is experimental.
+        Arg 1: id_a
+        Arg 2: id_b"""
+        id_col = self.link_run.config["id_column"]
+        id_a, id_b = split_args
+
+        x_tfam.tfam(self.link_run, id_col, id_a, id_b)
+
+    @split_and_check_args(2)
+    def do_x_tfam_raw(self, split_args):
+        """Show the family of a potential match.
+        [!] This command is experimental.
+        Arg 1: id_a
+        Arg 2: id_b"""
+        start = timer()
+
+        id_col = self.link_run.config["id_column"]
+        id_a, id_b = split_args
+
+        x_tfam.tfam_raw(self.link_run, id_col, id_a, id_b)
+
+        end = timer()
+        elapsed_time = round(end - start, 2)
+
+        print(f"Time: {elapsed_time}s")
+        logging.info(f"Finished: tfam_raw display - {elapsed_time}")
+
+    @split_and_check_args(2)
+    def do_x_hh_tfam(self, split_args):
+        """Show the family of a training match.
+        [!] This command is experimental.
+        Arg 1: id_a
+        Arg 2: id_b"""
+        start = timer()
+
+        id_col = self.link_run.config["id_column"]
+        id_a, id_b = split_args
+
+        x_tfam.hh_tfam(self.link_run, id_col, id_a, id_b)
+
+        end = timer()
+        elapsed_time = round(end - start, 2)
+
+        print(f"Time: {elapsed_time}s")
+        logging.info(f"Finished: hh_tfam display - {elapsed_time}")
+
+    @split_and_check_args(3)
+    def do_x_hh_tfam_2a(self, split_args):
+        """Show the family of a training match.
+        [!] This command is experimental.
+        Arg 1: id_a option 1
+        Arg 2: id_a option 2
+        Arg 3: id_b"""
+        id_col = self.link_run.config["id_column"]
+        id_a1, id_a2, id_b = split_args
+
+        x_tfam.hh_tfam_2a(self.link_run, id_col, id_a1, id_a2, id_b)
+
+    @split_and_check_args(3)
+    def do_x_hh_tfam_2b(self, split_args):
+        """Show the family of a training match.
+        [!] This command is experimental.
+        Arg 1: id_a
+        Arg 2: id_b option 1
+        Arg 3: id_b option 2"""
+        id_col = self.link_run.config["id_column"]
+        id_a, id_b1, id_b2 = split_args
+
+        x_tfam.hh_tfam_2b(self.link_run, id_col, id_a, id_b1, id_b2)
diff --git a/hlink/spark/__init__.py b/hlink/spark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hlink/spark/factory.py b/hlink/spark/factory.py
new file mode 100644
index 0000000..02b28e2
--- /dev/null
+++ b/hlink/spark/factory.py
@@ -0,0 +1,101 @@
+# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import sys +from pathlib import Path + +from hlink.spark.session import SparkConnection + + +class SparkFactory: + """This class allows convenient creation of a spark session. + + It defines defaults for many settings that can be overwritten with the + `set_*` functions. Each `set_*` function returns the SparkFactory for easy + chaining of settings. + + Note that some settings values are paths. These paths should be absolute. + This applies to derby_dir, warehouse_dir, and tmp_dir. + """ + + def __init__(self): + spark_dir = Path("spark").resolve() + self.derby_dir = spark_dir / "derby" + self.warehouse_dir = spark_dir / "warehouse" + self.tmp_dir = spark_dir / "tmp" + self.python = sys.executable + self.db_name = "linking" + self.is_local = True + self.mesos_url = None + self.num_cores = 4 + self.executor_memory = "10G" + self.executor_cores = 16 + + def set_derby_dir(self, derby_dir): + self.derby_dir = derby_dir + return self + + def set_warehouse_dir(self, warehouse_dir): + self.warehouse_dir = warehouse_dir + return self + + def set_tmp_dir(self, tmp_dir): + self.tmp_dir = tmp_dir + return self + + def set_python(self, python): + """Set the python executable. + + Useful when you want to guarantee that remote machines are running the + same version of python. + """ + self.python = python + return self + + def set_db_name(self, db_name): + self.db_name = db_name + return self + + def set_local(self): + """Make a local spark connection.""" + self.is_local = True + self.mesos_url = None + return self + + def set_mesos(self, url): + """Make a spark connection to the mesos cluster at the given URL.""" + self.is_local = False + self.mesos_url = url + return self + + def set_num_cores(self, num_cores): + self.num_cores = num_cores + return self + + def set_executor_memory(self, executor_memory): + self.executor_memory = executor_memory + return self + + def set_executor_cores(self, executor_cores): + self.executor_cores = executor_cores + return self + + def create(self): + spark_conn = SparkConnection( + str(self.derby_dir), + str(self.warehouse_dir), + str(self.tmp_dir), + self.python, + self.db_name, + ) + if self.is_local: + return spark_conn.local(self.num_cores, self.executor_memory) + else: + return spark_conn.mesos( + self.mesos_url, + self.num_cores, + self.executor_memory, + self.executor_cores, + ) diff --git a/hlink/spark/jars/.keepme b/hlink/spark/jars/.keepme new file mode 100644 index 0000000..e69de29 diff --git a/hlink/spark/jars/hlink_lib-assembly-1.0.jar b/hlink/spark/jars/hlink_lib-assembly-1.0.jar new file mode 100644 index 0000000..3d8dc0a Binary files /dev/null and b/hlink/spark/jars/hlink_lib-assembly-1.0.jar differ diff --git a/hlink/spark/session.py b/hlink/spark/session.py new file mode 100644 index 0000000..17f4978 --- /dev/null +++ b/hlink/spark/session.py @@ -0,0 +1,119 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import os.path +from pyspark import SparkConf +from pyspark.sql import SparkSession +import hlink.spark +from pyspark.sql.types import * + + +class SparkConnection(object): + """Handles initialization of spark session and connection to local / mesos cluster.""" + + def __init__(self, derby_dir, warehouse_dir, tmp_dir, python, db_name): + self.derby_dir = derby_dir + self.warehouse_dir = warehouse_dir + self.db_name = db_name + self.tmp_dir = tmp_dir + self.python = python + + def spark_conf(self, executor_cores, executor_memory, cores): + spark_package_path = os.path.dirname(hlink.spark.__file__) + jar_path = os.path.join( + spark_package_path, "jars", "hlink_lib-assembly-1.0.jar" + ) + os.environ["PYSPARK_PYTHON"] = self.python + conf = ( + SparkConf() + .set("spark.pyspark.python", self.python) + .set("spark.local.dir", self.tmp_dir) + .set("spark.sql.warehouse.dir", self.warehouse_dir) + .set( + "spark.driver.extraJavaOptions", f"-Dderby.system.home={self.derby_dir}" + ) + .set("spark.executorEnv.SPARK_LOCAL_DIRS", self.tmp_dir) + .setAppName("linking") + # .set("spark.executor.cores", executor_cores) \ + ) + if executor_memory: + conf.set("spark.executor.memory", executor_memory) + # conf.set("spark.driver.memory", executor_memory) + if cores: + conf.set("spark.cores.max", cores) + + if os.path.isfile(jar_path): + conf = conf.set("spark.jars", jar_path) + return conf + + def local(self, cores=1, executor_memory="10G"): + """Create a local 'cluster'.""" + return self.connect(f"local[{cores}]", cores, executor_memory, cores) + + def mesos(self, url, cores, executor_memory, executor_cores=16): + """Connect to a mesos cluster at the given URL to get cluster resources.""" + return self.connect( + url, + executor_cores, + executor_memory, + cores, + ) + + def connect( + self, connection_string, executor_cores=None, executor_memory=None, cores=None + ): + conf = self.spark_conf( + executor_cores=executor_cores, executor_memory=executor_memory, cores=cores + ) + session = ( + SparkSession.builder.config(conf=conf) + .enableHiveSupport() + .master(connection_string) + .getOrCreate() + ) + session.sparkContext.setLogLevel("ERROR") + + if self.db_name not in [d.name for d in session.catalog.listDatabases()]: + session.sql(f"CREATE DATABASE IF NOT EXISTS {self.db_name}") + session.catalog.setCurrentDatabase(self.db_name) + session.sparkContext.setCheckpointDir(str(self.tmp_dir)) + self.__register_udfs(session) + return session + + def __register_udfs(self, session): + session.udf.registerJavaFunction("jw", "com.isrdi.udfs.JWCompare", DoubleType()) + session.udf.registerJavaFunction( + "jw_max", "com.isrdi.udfs.MaxJWCompare", DoubleType() + ) + session.udf.registerJavaFunction( + "jw_rate", "com.isrdi.udfs.JWRate", DoubleType() + ) + session.udf.registerJavaFunction( + "rel_jw", "com.isrdi.udfs.JWRelatedRows", DoubleType() + ) + session.udf.registerJavaFunction( + "extra_children", "com.isrdi.udfs.ExtraChildren", DoubleType() + ) + session.udf.registerJavaFunction( + "hh_compare_rate", "com.isrdi.udfs.HHCompare", DoubleType() + ) + session.udf.registerJavaFunction( + "has_matching_element", "com.isrdi.udfs.HasMatchingElement", BooleanType() + ) + session.udf.registerJavaFunction( + "parseProbVector", "com.isrdi.udfs.ParseProbabilityVector", DoubleType() + ) + session.udf.registerJavaFunction( + "hh_rows_get_first_value", + 
"com.isrdi.udfs.HHRowsGetFirstValue", + StructType( + [StructField("serial", LongType()), StructField("input", StringType())] + ), + ) + session.udf.registerJavaFunction( + "extract_neighbors", + "com.isrdi.udfs.ExtractNeighbors", + ArrayType(StringType()), + ) diff --git a/hlink/tests/__init__.py b/hlink/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/tests/conf/integration.toml b/hlink/tests/conf/integration.toml new file mode 100644 index 0000000..27672e4 --- /dev/null +++ b/hlink/tests/conf/integration.toml @@ -0,0 +1,71 @@ +id_column = "id" +feature_selections = [] +generate_potential_matches_with_full_data = true + +[datasource_a] +alias = "us1900" +file = "integration_a.csv" + +[datasource_b] +alias = "us1910" +file = "integration_b.csv" + +[[column_mappings]] +column_name = "serialp" +[[column_mappings]] +column_name = "sex" +[[column_mappings]] +column_name = "namelast" +[[column_mappings]] +column_name = "bpl" +[[column_mappings]] +column_name = "region" +[[column_mappings]] +column_name = "age" + +[[comparison_features]] +alias = "regionf" +column_name = "region" +comparison_type = "fetch_a" +categorical = true + +[[comparison_features]] +alias = "namelast_jw" +column_name = "namelast" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "sex_equals" +column_name = "sex" +comparison_type = "equals" + +[[blocking]] +column_name = "sex" +[[blocking]] +column_name = "age_3" +dataset = "a" +derived_from = "age" +expand_length = 3 +explode = true + +[comparisons] +comparison_type = "threshold" +feature_name = "namelast_jw" +threshold = 0.8 + +[[pipeline_features]] +input_columns = ["sex_equals", "regionf"] +output_column = "sex_regionf_interaction" +transformer_type = "interaction" + +[training] +dataset = "training_data.csv" +dependent_var = "match" +decision = "drop_duplicate_with_threshold_ratio" +independent_vars = [ "namelast_jw", "regionf", "hits", "sex_regionf_interaction"] +model_parameters = [ { type = "random_forest", maxDepth = 7, numTrees = 100, featureSubsetStrategy = "sqrt", threshold_ratio = 1.3 } ] +chosen_model = { type = "random_forest", maxDepth = 7, numTrees = 100, featureSubsetStrategy = "sqrt" } +threshold_ratio = 1.3 +score_with_model = true +n_training_iterations = 2 + diff --git a/hlink/tests/conf/test.json b/hlink/tests/conf/test.json new file mode 100644 index 0000000..56899ba --- /dev/null +++ b/hlink/tests/conf/test.json @@ -0,0 +1,7 @@ +{ + "column_mappings": [], + "id_column": "id", + "substitution_columns": [], + "filter": [], + "feature_selections": [] +} diff --git a/hlink/tests/conf/test1.toml b/hlink/tests/conf/test1.toml new file mode 100644 index 0000000..0fe427e --- /dev/null +++ b/hlink/tests/conf/test1.toml @@ -0,0 +1,5 @@ +column_mappings = [] +feature_selections = [] +filter = [] +id_column = "id-toml" +substitution_columns = [] diff --git a/hlink/tests/config_loader_test.py b/hlink/tests/config_loader_test.py new file mode 100644 index 0000000..aedea7b --- /dev/null +++ b/hlink/tests/config_loader_test.py @@ -0,0 +1,33 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.configs.load_config import load_conf_file +import hlink.scripts.main +import hlink.tests.conftest +from hlink.errors import UsageError +import os.path +import pytest + + +def test_load_conf_file_json(package_path): + conf_path = os.path.join(package_path, "conf") + conf_file = os.path.join(conf_path, "test") + conf = load_conf_file(conf_file) + assert conf["id_column"] == "id" + + +def test_load_conf_file_toml(package_path): + conf_path = os.path.join(package_path, "conf") + conf_file = os.path.join(conf_path, "test1") + conf = load_conf_file(conf_file) + assert conf["id_column"] == "id-toml" + + +def test_load_conf_file_nested(package_path): + running_path = package_path.rpartition("hlink/tests")[0] + conf_name = "hlink_config/config/test_conf_flag_run" + conf_file = os.path.join(running_path, conf_name) + conf = load_conf_file(conf_file) + assert conf["id_column"] == "id_conf_flag" diff --git a/hlink/tests/conftest.py b/hlink/tests/conftest.py new file mode 100755 index 0000000..73e98b7 --- /dev/null +++ b/hlink/tests/conftest.py @@ -0,0 +1,1559 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.spark.session import SparkConnection +import hlink.scripts.main_loop +import hlink.tests +from hlink.configs.load_config import load_conf_file +import hlink.linking.matching as match +import hlink.linking.training as train +import hlink.linking.hh_training as hh_train +import hlink.linking.hh_matching as hh_match +import hlink.linking.preprocessing as pre +import hlink.linking.reporting as rep +import hlink.linking.model_exploration as me +import hlink.linking.hh_model_exploration as hh_me +from hlink.linking.link_run import LinkRun +import json +import logging +import os +import pytest +from unittest.mock import patch +import sys +from types import SimpleNamespace + + +pytest_plugins = ( + "hlink.tests.plugins.datasources", + "hlink.tests.plugins.external_data_paths", +) + + +def load_table_from_csv(link_task, path, table_name): + link_task.spark.read.csv(path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable(table_name) + + +# A work-around for Redmine bug 21151 (exceptions that are printed when tests +# are run). They occur because of an issue within the version of pyspark +# and py4j that we use. They all are raised in this method in the standard +# logging library: +real_isEnabledFor = logging.Logger.isEnabledFor + + +# The issue is that during garbage-collection part of py4j, the method is +# called with a non-numeric level (i.e., None). So, we patch the library +# method with our own: +def our_isEnabledFor(self, level): + if level is None: + return False + return real_isEnabledFor(self, level) + + +# The patch is done in the pytest fixture below. 
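+# A minimal sketch (hypothetical, not part of the test suite) of the failure
+# mode being patched: the standard library compares `level` numerically, so a
+# None level raises a TypeError, while the patched method just returns False.
+#
+#     import logging
+#     logger = logging.getLogger("py4j")
+#     logger.isEnabledFor(None)       # TypeError on an unpatched logger
+#     our_isEnabledFor(logger, None)  # returns False instead of raising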
+ + +@pytest.fixture(scope="session") +def spark(tmpdir_factory): + # See comment above + patcher = patch("logging.Logger.isEnabledFor", our_isEnabledFor) + patcher.start() + + os.environ["PYSPARK_PYTHON"] = sys.executable + spark_connection = SparkConnection( + tmpdir_factory.mktemp("derby"), + tmpdir_factory.mktemp("warehouse"), + tmpdir_factory.mktemp("spark_tmp_dir"), + sys.executable, + "linking", + ) + return spark_connection.local() + + +@pytest.fixture(scope="function", autouse=True) +def set_shuffle(spark): + spark.conf.set("spark.sql.shuffle.partitions", "1") + spark.conf.set("spark.default.parallelism", "1") + + +@pytest.fixture(scope="function", autouse=True) +def drop_all_tables(main): + main.do_drop_all("") + + +@pytest.fixture() +def package_path(): + """The path to the tests package.""" + return os.path.dirname(hlink.tests.conftest.__file__) + + +@pytest.fixture() +def input_data_dir_path(package_path): + """The path to the directory containing test input data.""" + return os.path.join(package_path, "input_data") + + +@pytest.fixture() +def conf_dir_path(package_path): + """The path to the directory containing test config files.""" + return os.path.join(package_path, "conf") + + +@pytest.fixture() +def spark_test_tmp_dir_path(spark, package_path): + """The path to the test potential_matches csv file.""" + path = f"output_data/spark_test_tmp_dir{os.getenv('PYTEST_XDIST_WORKER', '')}" + full_path = os.path.join(package_path, path) + return full_path + + +@pytest.fixture() +def link_run(spark, conf): + return LinkRun(spark, conf) + + +@pytest.fixture() +def preprocessing(link_run): + return pre.Preprocessing(link_run) + + +@pytest.fixture() +def main(link_run): + main = hlink.scripts.main_loop.Main(link_run) + main.preloop() + return main + + +@pytest.fixture() +def matching(link_run): + return match.Matching(link_run) + + +@pytest.fixture() +def hh_matching(link_run): + return hh_match.HHMatching(link_run) + + +@pytest.fixture() +def training(link_run): + return train.Training(link_run) + + +@pytest.fixture() +def hh_training(link_run): + return hh_train.HHTraining(link_run) + + +@pytest.fixture() +def reporting(link_run): + return rep.Reporting(link_run) + + +@pytest.fixture() +def model_exploration(link_run): + return me.ModelExploration(link_run) + + +@pytest.fixture() +def hh_model_exploration(link_run): + return hh_me.HHModelExploration(link_run) + + +@pytest.fixture(scope="module") +def fake_self(spark): + d = {"training_conf": "training"} + n = SimpleNamespace(**d) + return n + + +# Because of the way pytest fixtures work, `conf` is evaluated only once per test +# function call. The result is cached, and any subsequent requests for the fixture +# return the *same object*. Since this fixture returns a mutable dictionary, changes +# to the returned object will affect other fixtures that request and use `conf`. +# +# We use this with the `link_run` fixture and other fixtures like `preprocessing_conf`. +# Tests will request `preprocessing_conf`, which modifies `conf`. This modifies the +# `LinkRun.config` dictionary, since that is also a pointer to `conf`. Any additional +# modifications to `preprocessing_conf` in the test are also applied to `LinkRun.config`. +# +# TODO: Maybe think of a different way to do this. This way is convenient, but it can +# be hard to understand. 
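+# As an illustration of the aliasing described above, a hypothetical test
+# (not part of this suite) could observe the shared object directly:
+#
+#     def test_conf_is_shared_with_link_run(conf, link_run):
+#         conf["id_column"] = "histid"
+#         # `link_run.config` points at the same dict object as `conf`
+#         assert link_run.config["id_column"] == "histid"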
+@pytest.fixture(scope="function")
+def conf(conf_dir_path):
+    return get_conf(conf_dir_path, "test.json")
+
+
+@pytest.fixture(scope="function")
+def integration_conf(input_data_dir_path, conf_dir_path):
+    conf_file = os.path.join(conf_dir_path, "integration")
+    conf = load_conf_file(conf_file)
+
+    datasource_a = conf["datasource_a"]
+    datasource_b = conf["datasource_b"]
+    training = conf["training"]
+    datasource_a["file"] = os.path.join(input_data_dir_path, datasource_a["file"])
+    datasource_b["file"] = os.path.join(input_data_dir_path, datasource_b["file"])
+    training["dataset"] = os.path.join(input_data_dir_path, training["dataset"])
+    return conf
+
+
+def get_conf(conf_dir_path, name):
+    path_to_file = os.path.join(conf_dir_path, name)
+    with open(path_to_file) as f:
+        conf = json.load(f)
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf(spark, conf, base_datasources):
+    """Create a fixture to set the conf datasource_(a/b) values to the test data"""
+    pathname_a, pathname_b = base_datasources
+    conf["datasource_a"] = {"parquet_file": pathname_a}
+    conf["datasource_b"] = {"parquet_file": pathname_b}
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_all_space_columns(
+    spark, conf, datasource_unrestricted_blank_columns
+):
+    """Create a fixture to set the conf datasource_(a/b) values to the test data"""
+    pathname_a, pathname_b = datasource_unrestricted_blank_columns
+    conf["datasource_a"] = {"parquet_file": pathname_a}
+    conf["datasource_b"] = {"parquet_file": pathname_b}
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_simple_names(spark, conf, datasource_preprocessing_simple_names):
+    """Create a fixture for testing name substitution and bigrams"""
+    pathname_a, pathname_b = datasource_preprocessing_simple_names
+    conf["datasource_a"] = {"parquet_file": pathname_a}
+    conf["datasource_b"] = {"parquet_file": pathname_b}
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_popularity(spark, conf, ext_path_preprocessing_popularity):
+    """Create a fixture configured with the popularity test data"""
+    pathname = ext_path_preprocessing_popularity
+    conf["datasource_a"] = {"file": pathname}
+    conf["datasource_b"] = {"file": pathname}
+
+    conf["column_mappings"] = [
+        {"column_name": "sex"},
+        {"column_name": "namefrst"},
+        {"column_name": "namelast"},
+        {"column_name": "birthyr"},
+        {"column_name": "bpl"},
+    ]
+
+    conf["blocking"] = []
+    conf["comparisons"] = {}
+
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_street_names(spark, conf, test_street_names_data_path):
+    """Create a fixture for testing street name abbreviation substitutions"""
+    conf["datasource_a"] = {"file": test_street_names_data_path}
+    conf["datasource_b"] = {"file": test_street_names_data_path}
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_birthyr(spark, conf, birthyr_replace_path):
+    """Create a fixture for testing birthyr replacement"""
+    conf["datasource_a"] = {"file": birthyr_replace_path}
+    conf["datasource_b"] = {"file": birthyr_replace_path}
+    return conf
+
+
+@pytest.fixture(scope="function")
+def preprocessing_conf_synthetic_household_data(
+    spark, conf, datasource_synthetic_households
+):
+    """Create a fixture conf for testing union transform of household/neighborhood data"""
+    pathname_a, pathname_b = datasource_synthetic_households
+    conf["datasource_a"] = {"parquet_file": pathname_a}
+    conf["datasource_b"] = 
{"parquet_file": pathname_b} + return conf + + +@pytest.fixture(scope="function") +def preprocessing_conf_19thc_nativity_conf( + spark, conf, datasource_19thc_nativity_households_data +): + """ Create a fixture conf for testing nativity calculation """ + full_path_a, full_path_b = datasource_19thc_nativity_households_data + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": full_path_b} + + conf["column_mappings"] = [ + {"column_name": "serial"}, + {"column_name": "pernum"}, + {"column_name": "relate"}, + {"column_name": "bpl"}, + {"column_name": "momloc"}, + {"column_name": "poploc"}, + {"column_name": "namefrst"}, + {"column_name": "namelast"}, + {"column_name": "key_nativity_calc"}, + { + "column_name": "nativity", + "set_value_column_b": 0, + }, + { + "alias": "test_nativity", + "column_name": "key_nativity_calc", + "set_value_column_a": 0, + }, + {"column_name": "key_mbpl"}, + {"column_name": "key_fbpl"}, + { + "column_name": "mbpl", + "set_value_column_a": 999, + }, + { + "column_name": "key_mbpl_range", + "set_value_column_b": 0, + }, + { + "column_name": "key_mbpl_range_b", + "set_value_column_a": 0, + }, + { + "column_name": "key_mother_nativity", + "set_value_column_b": 0, + }, + { + "column_name": "key_mbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_fbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_mfbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_1870_1880", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_1850_1860", + "set_value_column_b": 0, + }, + ] + + conf["blocking"] = [{"column_name": "id"}] + conf["comparisons"] = {} + + conf["feature_selections"] = [ + { + "output_column": "mbpl_range_b", + "input_col": "mbpl", + "transform": "sql_condition", + "condition": "case when mbpl >= 997 then 0 when mbpl < 100 then 1 when (mbpl > 99 and mbpl < 997) then 2 else 0 end", + "set_value_column_a": 0, + }, + { + "family_id": "serial", + "other_col": "nativity", + "output_col": "mother_nativity", + "person_id": "pernum", + "person_pointer": "momloc", + "transform": "attach_family_col", + "set_value_column_b": 0, + }, + { + "output_col": "mbpl_calc", + "transform": "attach_family_col", + "other_col": "bpl", + "person_pointer": "momloc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "fbpl_calc", + "transform": "attach_family_col", + "other_col": "bpl", + "person_pointer": "poploc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "post_agg_feature": True, + "output_column": "mbpl_range", + "input_column": "mother_nativity", + "transform": "sql_condition", + "condition": "case when mother_nativity == 0 then 0 when mother_nativity > 0 and mother_nativity < 5 then 1 when mother_nativity == 5 then 2 else 0 end", + "override_column_b": "mbpl_range_b", + }, + { + "post_agg_feature": True, + "output_column": "nativity_calc", + "transform": "sql_condition", + "input_columns": ["bpl", "mbpl_calc", "fbpl_calc"], + "condition": """case + when bpl >= 997 or mbpl_calc >= 997 or fbpl_calc >= 997 + then 0 + when bpl < 100 and fbpl_calc < 100 and mbpl_calc < 100 + then 1 + when bpl < 100 and mbpl_calc > 99 and fbpl_calc > 99 + then 4 + when bpl < 100 and mbpl_calc > 99 and fbpl_calc < 100 + then 3 + when bpl < 100 and fbpl_calc > 99 and mbpl_calc < 100 + then 2 + when bpl > 99 and bpl < 997 + then 5 + else 0 + end""", + "override_column_b": "test_nativity", + }, + ] + + conf["comparison_features"] = [ + { + "alias": "mbpl_match", + 
"column_name": "mbpl_calc", + "comparison_type": "present_and_matching_categorical", + }, + { + "alias": "fbpl_match", + "column_name": "fbpl_calc", + "comparison_type": "present_and_matching_categorical", + }, + { + "alias": "mfbpl_match", + "column_name": "nativity_calc", + "comparison_type": "present_and_equal_categorical_in_universe", + "NIU": "0", + "categorical": True, + }, + { + "alias": "m_caution_1870_1880", + "categorical": True, + "column_name": "mbpl_range", + "comparison_type": "not_zero_and_not_equals", + }, + { + "alias": "m_caution_1850_1860", + "categorical": True, + "column_name": "mbpl_calc", + "comparison_type": "present_and_not_equal", + }, + { + "column_name": "key_mbpl_match", + "alias": "key_mbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_fbpl_match", + "alias": "key_fbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_mfbpl_match", + "alias": "key_mfbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_m_caution_1870_1880", + "alias": "key_m_caution_1870_1880", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_m_caution_1850_1860", + "alias": "key_m_caution_1850_1860", + "comparison_type": "fetch_a", + }, + ] + + conf["training"] = { + "dependent_var": "match", + "independent_vars": [ + "mbpl_match", + "fbpl_match", + "mfbpl_match", + "m_caution_1870_1880", + "m_caution_1850_1860", + ], + } + + return conf + + +@pytest.fixture(scope="function") +def preprocessing_conf_19thc_caution_conf( + spark, conf, datasource_19thc_nativity_households_data +): + """ Create a fixture conf for testing nativity calculation """ + full_path_a, full_path_b = datasource_19thc_nativity_households_data + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": full_path_b} + + conf["column_mappings"] = [ + {"column_name": "serial"}, + {"column_name": "pernum"}, + {"column_name": "relate"}, + {"column_name": "bpl"}, + {"column_name": "momloc"}, + {"column_name": "stepmom"}, + {"column_name": "poploc"}, + {"column_name": "namefrst"}, + {"column_name": "namelast"}, + {"column_name": "birthyr"}, + {"column_name": "key_nativity_calc"}, + { + "column_name": "nativity", + "set_value_column_b": 0, + }, + { + "alias": "test_nativity", + "column_name": "key_nativity_calc", + "set_value_column_a": 0, + }, + {"column_name": "key_mbpl"}, + {"column_name": "key_fbpl"}, + { + "column_name": "mbpl", + "set_value_column_a": 999, + }, + { + "column_name": "key_mbpl_range", + "set_value_column_b": 0, + }, + { + "column_name": "key_mbpl_range_b", + "set_value_column_a": 0, + }, + { + "column_name": "key_mother_nativity", + "set_value_column_b": 0, + }, + { + "column_name": "key_mbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_fbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_mfbpl_match", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_1870_1880", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_1850_1860", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_cc3_012", + "set_value_column_b": 0, + }, + { + "column_name": "key_m_caution_cc4_012", + "set_value_column_b": 0, + }, + { + "column_name": "key_intermediate_mbpl_range_not_equals", + "set_value_column_b": 0, + }, + { + "column_name": "key_intermediate_mbpl_range_not_zero_and_not_equals", + "set_value_column_b": 0, + }, + { + "column_name": "key_intermediate_mother_birthyr_abs_diff_5", + "set_value_column_b": 0, + }, + { + "column_name": 
"key_intermediate_stepmom_parent_step_change", + "set_value_column_b": 0, + }, + { + "column_name": "key_intermediate_momloc_present_both_years", + "set_value_column_b": 0, + }, + ] + + conf["blocking"] = [{"column_name": "id"}] + conf["comparisons"] = {} + + conf["feature_selections"] = [ + { + "output_column": "mbpl_range_b", + "input_col": "mbpl", + "transform": "sql_condition", + "condition": "case when mbpl >= 997 then 0 when mbpl < 100 then 1 when (mbpl > 99 and mbpl < 997) then 2 else 0 end", + "set_value_column_a": 0, + }, + { + "family_id": "serial", + "other_col": "nativity", + "output_col": "mother_nativity", + "person_id": "pernum", + "person_pointer": "momloc", + "transform": "attach_family_col", + "set_value_column_b": 0, + }, + { + "output_col": "mbpl_calc", + "transform": "attach_family_col", + "other_col": "bpl", + "person_pointer": "momloc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "fbpl_calc", + "transform": "attach_family_col", + "other_col": "bpl", + "person_pointer": "poploc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "mother_birthyr", + "transform": "attach_family_col", + "other_col": "birthyr", + "person_pointer": "momloc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "post_agg_feature": True, + "output_column": "mbpl_range", + "input_column": "mother_nativity", + "transform": "sql_condition", + "condition": "case when mother_nativity == 0 then 0 when mother_nativity > 0 and mother_nativity < 5 then 1 when mother_nativity == 5 then 2 else 0 end", + "override_column_b": "mbpl_range_b", + }, + { + "post_agg_feature": True, + "output_column": "nativity_calc", + "transform": "sql_condition", + "input_columns": ["bpl", "mbpl_calc", "fbpl_calc"], + "condition": """case + when bpl >= 997 or mbpl_calc >= 997 or fbpl_calc >= 997 + then 0 + when bpl < 100 and fbpl_calc < 100 and mbpl_calc < 100 + then 1 + when bpl < 100 and mbpl_calc > 99 and fbpl_calc > 99 + then 4 + when bpl < 100 and mbpl_calc > 99 and fbpl_calc < 100 + then 3 + when bpl < 100 and fbpl_calc > 99 and mbpl_calc < 100 + then 2 + when bpl > 99 and bpl < 997 + then 5 + else 0 + end""", + "override_column_b": "test_nativity", + }, + ] + + conf["comparison_features"] = [ + { + "alias": "mbpl_match", + "column_name": "mbpl_calc", + "comparison_type": "present_and_matching_categorical", + }, + { + "alias": "fbpl_match", + "column_name": "fbpl_calc", + "comparison_type": "present_and_matching_categorical", + }, + { + "alias": "mfbpl_match", + "column_name": "nativity_calc", + "comparison_type": "present_and_equal_categorical_in_universe", + "NIU": "0", + "categorical": True, + }, + { + "alias": "m_caution_1870_1880", + "categorical": True, + "column_name": "mbpl_range", + "comparison_type": "not_zero_and_not_equals", + }, + { + "alias": "m_caution_1850_1860", + "categorical": True, + "column_name": "mbpl_calc", + "comparison_type": "present_and_not_equal", + }, + { + "column_name": "key_mbpl_match", + "alias": "key_mbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_fbpl_match", + "alias": "key_fbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_mfbpl_match", + "alias": "key_mfbpl_match", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_m_caution_1870_1880", + "alias": "key_m_caution_1870_1880", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_m_caution_1850_1860", + "alias": "key_m_caution_1850_1860", + "comparison_type": "fetch_a", + }, + { + "column_name": 
"key_m_caution_cc3_012", + "alias": "key_m_caution_cc3_012", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_m_caution_cc4_012", + "alias": "key_m_caution_cc4_012", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_intermediate_mbpl_range_not_equals", + "alias": "key_intermediate_mbpl_range_not_equals", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_intermediate_mbpl_range_not_zero_and_not_equals", + "alias": "key_intermediate_mbpl_range_not_zero_and_not_equals", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_intermediate_mother_birthyr_abs_diff_5", + "alias": "key_intermediate_mother_birthyr_abs_diff_5", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_intermediate_stepmom_parent_step_change", + "alias": "key_intermediate_stepmom_parent_step_change", + "comparison_type": "fetch_a", + }, + { + "column_name": "key_intermediate_momloc_present_both_years", + "alias": "key_intermediate_momloc_present_both_years", + "comparison_type": "fetch_a", + }, + { + "alias": "m_caution_cc3_012", + "column_names": ["mbpl_range", "mother_birthyr", "momloc"], + "comparison_type": "caution_comp_3_012", + "categorical": True, + "comp_a": { + "column_name": "mbpl_range", + "comparison_type": "not_equals", + }, + "comp_b": { + "column_name": "mother_birthyr", + "comparison_type": "abs_diff", + "gt_threshold": 5, + }, + "comp_c": { + "column_name": "momloc", + "comparison_type": "present_both_years", + }, + }, + { + "alias": "m_caution_cc4_012", + "column_names": ["mbpl_range", "mother_birthyr", "stepmom", "momloc"], + "comparison_type": "caution_comp_4_012", + "categorical": True, + "comp_a": { + "column_name": "mbpl_range", + "comparison_type": "not_zero_and_not_equals", + }, + "comp_b": { + "column_name": "mother_birthyr", + "comparison_type": "abs_diff", + "gt_threshold": 5, + }, + "comp_c": { + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + "comp_d": { + "column_name": "momloc", + "comparison_type": "present_both_years", + }, + }, + { + "alias": "intermediate_mbpl_range_not_equals", + "column_name": "mbpl_range", + "comparison_type": "not_equals", + }, + { + "alias": "intermediate_mbpl_range_not_zero_and_not_equals", + "column_name": "mbpl_range", + "comparison_type": "not_zero_and_not_equals", + }, + { + "alias": "intermediate_mother_birthyr_abs_diff_5", + "column_name": "mother_birthyr", + "comparison_type": "abs_diff", + "gt_threshold": 5, + }, + { + "alias": "intermediate_stepmom_parent_step_change", + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + { + "alias": "intermediate_momloc_present_both_years", + "column_name": "momloc", + "comparison_type": "present_both_years", + }, + ] + + conf["training"] = { + "dependent_var": "match", + "independent_vars": [ + "mbpl_match", + "fbpl_match", + "mfbpl_match", + "m_caution_1870_1880", + "m_caution_1850_1860", + "m_caution_cc3_012", + "m_caution_cc4_012", + "intermediate_mbpl_range_not_equals", + "intermediate_mbpl_range_not_zero_and_not_equals", + "intermediate_mother_birthyr_abs_diff_5", + "intermediate_stepmom_parent_step_change", + "intermediate_momloc_present_both_years", + ], + } + + return conf + + +@pytest.fixture(scope="function") +def preprocessing_conf_household_data(spark, conf, datasource_real_households): + """ Create a fixture conf for testing family/neighborhood transforms """ + full_path_a, full_path_b = datasource_real_households + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": 
full_path_b} + + conf["column_mappings"] = [ + {"column_name": "namefrst", "alias": "namefrst_orig"}, + {"column_name": "namelast", "alias": "namelast_orig"}, + {"column_name": "bpl"}, + {"column_name": "sex"}, + {"column_name": "enumdist"}, + {"column_name": "pernum"}, + {"column_name": "serial"}, + {"column_name": "sploc"}, + {"column_name": "poploc"}, + {"column_name": "momloc"}, + {"column_name": "relate"}, + { + "column_name": "namefrst", + "alias": "namefrst_clean", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + }, + { + "column_name": "namelast", + "alias": "namelast_clean", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_prefixes", "values": ["ah"]}, + { + "type": "condense_prefixes", + "values": ["mc", "mac", "o", "de", "van", "di"], + }, + {"type": "remove_one_letter_names"}, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + }, + ] + + conf["blocking"] = [] + conf["comparisons"] = {} + + return conf + + +@pytest.fixture(scope="function") +def preprocessing_conf_rel_rows(spark, conf, test_data_rel_rows_age): + """ Create a fixture conf for testing family/neighborhood transforms """ + full_path_a, full_path_b = test_data_rel_rows_age + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": full_path_b} + conf["id_column"] = "histid" + + conf["column_mappings"] = [ + {"column_name": "namefrst", "alias": "namefrst_orig"}, + {"column_name": "namelast", "alias": "namelast_orig"}, + {"column_name": "birthyr"}, + {"column_name": "sex"}, + {"column_name": "pernum"}, + {"column_name": "serialp"}, + {"column_name": "relate"}, + {"column_name": "age"}, + {"column_name": "yearp", "alias": "year"}, + { + "column_name": "namefrst", + "alias": "namefrst_clean", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + {"type": "remove_suffixes", "values": ["jr", "sr", "ii", "iii"]}, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + }, + { + "column_name": "namefrst_clean", + "alias": "namefrst_split", + "transforms": [{"type": "split"}], + }, + { + "column_name": "namefrst_split", + "alias": "namefrst_unstd", + "transforms": [{"type": "array_index", "value": 0}], + }, + { + "column_name": "birthyr", + "alias": "clean_birthyr", + "transforms": [ + { + "type": "mapping", + "mappings": {9999: "", 1999: ""}, + "output_type": "int", + } + ], + }, + ] + + conf["feature_selections"] = [ + { + "input_column": "clean_birthyr", + "output_column": "replaced_birthyr", + "condition": "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end", + "transform": "sql_condition", + }, + { + "family_id": "serialp", + "input_cols": [ + "histid", + "namefrst_unstd", + "replaced_birthyr", + "sex", + "relate", + ], + "output_col": "namefrst_related_rows", + "transform": "related_individual_rows", + "filters": [ + {"column": "relate", "min": 300, "max": 1099}, + 
{"column": "age", "min": 0, "max": 999}, + ], + }, + { + "family_id": "serialp", + "input_cols": [ + "histid", + "namefrst_unstd", + "replaced_birthyr", + "sex", + "relate", + ], + "output_col": "namefrst_related_rows_age_min_5", + "transform": "related_individual_rows", + "filters": [ + {"column": "relate", "min": 300, "max": 1099}, + {"column": "age", "min": 5, "max": 999}, + ], + }, + { + "family_id": "serialp", + "input_cols": [ + "histid", + "namefrst_unstd", + "replaced_birthyr", + "sex", + "relate", + ], + "output_col": "namefrst_related_rows_age_b_min_5", + "transform": "related_individual_rows", + "filters": [ + {"column": "relate", "min": 300, "max": 1099}, + {"column": "age", "min": 5, "max": 999, "dataset": "b"}, + ], + }, + ] + + conf["blocking"] = [] + conf["comparisons"] = {} + + return conf + + +@pytest.fixture(scope="function") +def matching_conf(spark, conf, datasource_matching, matching): + """Create conf fixture for testing matching steps using the prepped_df_(a/b) dataframes and populate basic config values""" + matching.link_run.print_sql = True + conf["column_mappings"] = [ + {"column_name": "serialp"}, + {"column_name": "namefrst"}, + {"column_name": "namelast"}, + {"column_name": "bpl"}, + {"column_name": "sex"}, + {"column_name": "street"}, + {"column_name": "enum_dist"}, + ] + conf["blocking"] = [{"column_name": "sex"}] + conf["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + } + conf["training"] = {} + + conf["training"]["dependent_var"] = "match" + conf["training"]["independent_vars"] = ["namefrst_jw", "namelast_jw", "ssex"] + return conf + + +@pytest.fixture(scope="function") +def matching_conf_namefrst_std_and_unstd( + spark, conf, matching, test_data_blocking_double_comparison +): + """Create conf fixture for testing matching steps using the prepped_df_(a/b) dataframes and populate basic config values""" + + conf["id_column"] = "histid" + full_path_a, full_path_b = test_data_blocking_double_comparison + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": full_path_b} + + conf["column_mappings"] = [ + {"column_name": "namefrst_unstd"}, + {"column_name": "namefrst_std"}, + {"column_name": "namelast_clean"}, + {"column_name": "bpl_clean"}, + {"column_name": "sex"}, + {"column_name": "birthyr"}, + ] + conf["blocking"] = [ + {"column_name": "sex"}, + {"column_name": "bpl_clean"}, + {"column_name": "birthyr"}, + ] + + conf["comparisons"] = { + "operator": "AND", + "comp_a": { + "operator": "OR", + "comp_a": { + "feature_name": "namefrst_unstd_jw", + "threshold": 0.70, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namefrst_std_jw", + "threshold": 0.70, + "comparison_type": "threshold", + }, + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.70, + "comparison_type": "threshold", + }, + } + + conf["comparison_features"] = [ + { + "alias": "namefrst_unstd_jw", + "column_name": "namefrst_unstd", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namefrst_std_jw", + "column_name": "namefrst_std", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namelast_jw", + "column_name": "namelast_clean", + "comparison_type": "jaro_winkler", + }, + ] + + conf["training"] = {} + return conf + + +@pytest.fixture(scope="function") +def blocking_explode_conf(spark, conf): + """Create conf fixture for 
testing matching steps using the family/neighborhood transforms and populate basic config values""" + + conf["column_mappings"] = [ + {"column_name": "namefrst"}, + {"column_name": "namelast"}, + {"column_name": "birthyr"}, + {"column_name": "sex"}, + ] + + conf["blocking"] = [ + { + "column_name": "birthyr_3", + "dataset": "a", + "derived_from": "birthyr", + "expand_length": 3, + "explode": True, + }, + {"column_name": "sex"}, + ] + + conf["comparison_features"] = [ + { + "alias": "namefrst_jw", + "column_name": "namefrst", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "ssex", + "column_name": "sex", + "comparison_type": "equals", + "categorical": True, + }, + ] + conf["comparisons"] = { + "comp_a": { + "feature_name": "namefrst_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + } + + conf["training"] = {} + conf["training"]["dependent_var"] = "match" + conf["training"]["independent_vars"] = ["namefrst_jw", "namelast_jw", "ssex"] + return conf + + +@pytest.fixture(scope="function") +def matching_household_conf( + spark, conf, datasource_real_households, preprocessing, matching +): + """Create conf fixture for testing matching steps using the family/neighborhood transforms and populate basic config values""" + + full_path_a, full_path_b = datasource_real_households + conf["datasource_a"] = {"file": full_path_a} + conf["datasource_b"] = {"file": full_path_b} + + conf["column_mappings"] = [ + {"column_name": "namefrst", "alias": "namefrst_orig"}, + {"column_name": "namelast", "alias": "namelast_orig"}, + {"column_name": "bpl"}, + {"column_name": "birthyr"}, + {"column_name": "age"}, + {"column_name": "sex"}, + {"column_name": "enumdist"}, + {"column_name": "pernum"}, + {"column_name": "serial", "alias": "serialp"}, + {"column_name": "sploc"}, + {"column_name": "poploc"}, + {"column_name": "momloc"}, + {"column_name": "relate"}, + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + }, + { + "column_name": "namelast", + "alias": "namelast_clean", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_prefixes", "values": ["ah"]}, + { + "type": "condense_prefixes", + "values": ["mc", "mac", "o", "de", "van", "di"], + }, + {"type": "remove_one_letter_names"}, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + }, + ] + + conf["blocking"] = [{"column_name": "sex"}] + + conf["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + } + + conf["training"] = {} + conf["training"]["dependent_var"] = "match" + + return conf + + 
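+# The "comparisons" dicts assembled by these fixtures form a small boolean
+# tree: leaf nodes are thresholded feature comparisons, and interior nodes
+# hold an "operator" plus "comp_a"/"comp_b" children. The helper below is a
+# minimal illustrative sketch (it is not part of hlink's API, and it assumes
+# ">=" semantics for comparison_type "threshold") of how such a tree can be
+# rendered as a single SQL predicate:
+def _sketch_comparisons_to_sql(comp):
+    # Leaf: a single thresholded feature comparison.
+    if comp.get("comparison_type") == "threshold":
+        return f"{comp['feature_name']} >= {comp['threshold']}"
+    # Interior node: recursively combine both children with the operator.
+    left = _sketch_comparisons_to_sql(comp["comp_a"])
+    right = _sketch_comparisons_to_sql(comp["comp_b"])
+    return f"({left} {comp['operator']} {right})"
+
+
+# For example, matching_conf's comparisons reduce to
+# "(namelast_jw >= 0.8 AND namelast_jw >= 0.8)", while the nested config in
+# matching_conf_namefrst_std_and_unstd reduces to
+# "((namefrst_unstd_jw >= 0.7 OR namefrst_std_jw >= 0.7) AND namelast_jw >= 0.7)".
+
+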
+@pytest.fixture(scope="function")
+def matching_comparison_conf(spark, conf, datasource_matching_comparisons, matching):
+    """Create conf fixture for testing matching steps using the prepped_df_(a/b) dataframes and populate basic config values"""
+    conf["column_mappings"] = [
+        {"column_name": "id"},
+        {"column_name": "namelast"},
+        {"column_name": "sex"},
+        {"column_name": "mpbl"},
+        {"column_name": "mother_birthyr"},
+        {"column_name": "stepmom"},
+        {"column_name": "spouse_bpl"},
+        {"column_name": "spouse_birthyr"},
+        {"column_name": "durmarr"},
+        {"column_name": "mother_namefrst"},
+        {"column_name": "spouse_namefrst"},
+        {"column_name": "momloc"},
+        {"column_name": "sploc"},
+    ]
+    conf["blocking"] = [{"column_name": "sex"}]
+    conf["comparisons"] = {
+        "comp_a": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "comp_b": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "operator": "AND",
+    }
+
+    return conf
+
+
+@pytest.fixture(scope="function")
+def matching_conf_counties(spark, conf, county_dist_datasources):
+    """Create a conf fixture for testing county distance comparison features"""
+    pathname_a, pathname_b = county_dist_datasources
+    conf["datasource_a"] = {"parquet_file": pathname_a}
+    conf["datasource_b"] = {"parquet_file": pathname_b}
+
+    return conf
+
+
+@pytest.fixture(scope="function")
+def matching_conf_nativity(spark, conf, nativity_datasources):
+    """Create a conf fixture for testing nativity comparison features"""
+    pathname_a, pathname_b = nativity_datasources
+    conf["datasource_a"] = {"file": pathname_a}
+    conf["datasource_b"] = {"file": pathname_b}
+    conf["training"] = {}
+
+    conf["training"]["dependent_var"] = "match"
+
+    return conf
+
+
+@pytest.fixture(scope="function")
+def training_conf(spark, conf, training_data_path, datasource_training):
+    """Create the prepped_df_(a/b) dataframes and populate basic config values"""
+    conf["training"] = {
+        "dataset": training_data_path,
+        "dependent_var": "match",
+        "n_training_iterations": 10,
+    }
+    conf["column_mappings"] = [
+        {"column_name": "serialp"},
+        {"column_name": "namelast"},
+        {"column_name": "bpl"},
+        {"column_name": "sex"},
+        {"column_name": "region"},
+    ]
+    conf["blocking"] = [{"column_name": "sex"}]
+    conf["comparisons"] = {
+        "comp_a": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "comp_b": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "operator": "AND",
+    }
+
+    return conf
+
+
+@pytest.fixture(scope="function")
+def hh_training_conf(spark, conf, hh_training_data_path):
+    """Create conf fixture for testing hh_training steps and populate basic config values"""
+    conf["id_column"] = "histid"
+    conf["drop_data_from_scored_matches"] = False
+    conf["hh_training"] = {
+        "dataset": hh_training_data_path,
+        "dependent_var": "match",
+        "prediction_col": "match",
+        "n_training_iterations": 4,
+        "seed": 120,
+        "independent_vars": [
+            "namelast_jw",
+            "namefrst_jw",
+            "byrdiff",
+            "ssex",
+            "srelate",
+        ],
+        "score_with_model": True,
+        "use_training_data_features": False,
+        "decision": "drop_duplicate_with_threshold_ratio",
+        "get_precision_recall_curve": True,
+        "chosen_model": {
+            "type": "logistic_regression",
+            "threshold": 0.5,
+            "threshold_ratio": 1.2,
+        },
+        "model_parameters": [
+            {"type": "logistic_regression", "threshold": 0.5, "threshold_ratio": 1.2},
+            {
+                "type": "random_forest",
+                "maxDepth": 5.0,
+                "numTrees": 75.0,
+                "threshold": 0.5,
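+                # As we read this config, "threshold" is the alpha cut on the
+                # match probability (it surfaces as alpha_threshold in the model
+                # eval results), while "threshold_ratio" feeds the
+                # "drop_duplicate_with_threshold_ratio" decision above: the best
+                # candidate's probability must beat the runner-up's by this ratio.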
"threshold_ratio": 1.2, + }, + ], + } + conf["column_mappings"] = [ + {"column_name": "serialp"}, + {"column_name": "relate"}, + {"column_name": "namelast_clean"}, + {"column_name": "namefrst_unstd"}, + {"column_name": "clean_birthyr"}, + {"column_name": "sex"}, + ] + conf["blocking"] = [] + conf["comparison_features"] = [ + { + "alias": "byrdiff", + "column_name": "clean_birthyr", + "comparison_type": "abs_diff", + }, + { + "alias": "ssex", + "column_name": "sex", + "comparison_type": "equals", + "categorical": True, + }, + { + "alias": "srelate", + "column_name": "relate", + "comparison_type": "equals", + "categorical": True, + }, + { + "alias": "namefrst_jw", + "column_name": "namefrst_unstd", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namelast_jw", + "column_name": "namelast_clean", + "comparison_type": "jaro_winkler", + }, + ] + + return conf + + +@pytest.fixture(scope="function") +def hh_agg_feat_conf(spark, conf, hh_training_data_path): + """Create the prepped_df_(a/b) dataframes and populate basic config values""" + conf["id_column"] = "histid" + conf["drop_data_from_scored_matches"] = False + conf["hh_training"] = { + "dataset": hh_training_data_path, + "dependent_var": "match", + "independent_vars": [ + "jw_max_a", + "jw_max_b", + "f1_match", + "f2_match", + "byrdiff", + "sexmatch", + "mardurmatch", + ], + "score_with_model": True, + "chosen_model": { + "type": "logistic_regression", + "threshold": 0.5, + "threshold_ratio": 1.2, + }, + } + conf["column_mappings"] = [ + {"column_name": "serialp"}, + {"column_name": "relate"}, + {"column_name": "pernum"}, + {"column_name": "namelast_clean"}, + {"column_name": "namefrst_unstd"}, + {"column_name": "clean_birthyr"}, + {"column_name": "sex"}, + {"column_name": "namefrst_mid_init"}, + {"column_name": "namefrst_init"}, + {"column_name": "namefrst_mid_init_2"}, + ] + conf["blocking"] = [] + conf["comparison_features"] = [ + { + "alias": "byrdiff", + "column_name": "clean_birthyr", + "comparison_type": "abs_diff", + }, + { + "alias": "sexmatch", + "column_name": "sex", + "comparison_type": "equals", + "categorical": True, + }, + { + "alias": "namefrst_jw", + "column_name": "namefrst_unstd", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namelast_jw", + "column_name": "namelast_clean", + "comparison_type": "jaro_winkler", + }, + {"alias": "durmarr_a", "column_name": "durmarr", "comparison_type": "fetch_a"}, + {"alias": "durmarr_b", "column_name": "durmarr", "comparison_type": "fetch_b"}, + { + "alias": "mardurmatch", + "column_name": "durmarr", + "not_equals": 99, + "comparison_type": "abs_diff", + "btwn_threshold": [9, 14], + "categorical": True, + }, + { + "alias": "f1_match", + "first_init_col": "namefrst_init", + "mid_init_cols": ["namefrst_mid_init", "namefrst_mid_init_2"], + "comparison_type": "f1_match", + "categorical": True, + }, + { + "alias": "f2_match", + "first_init_col": "namefrst_init", + "mid_init_cols": ["namefrst_mid_init", "namefrst_mid_init_2"], + "comparison_type": "f2_match", + "categorical": True, + }, + { + "alias": "fn_a", + "column_name": "namefrst_unstd", + "comparison_type": "fetch_a", + }, + {"alias": "fi_a", "column_name": "namefrst_init", "comparison_type": "fetch_a"}, + { + "alias": "fn_b", + "column_name": "namefrst_unstd", + "comparison_type": "fetch_b", + }, + {"alias": "fi_b", "column_name": "namefrst_init", "comparison_type": "fetch_b"}, + { + "alias": "mi_a", + "column_name": "namefrst_mid_init", + "comparison_type": "fetch_a", + }, + { + "alias": "mi_b", + "column_name": 
"namefrst_mid_init", + "comparison_type": "fetch_b", + }, + ] + + return conf diff --git a/hlink/tests/core/__init__.py b/hlink/tests/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/tests/core/comparison_core_test.py b/hlink/tests/core/comparison_core_test.py new file mode 100644 index 0000000..f2c90da --- /dev/null +++ b/hlink/tests/core/comparison_core_test.py @@ -0,0 +1,267 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.linking.core.comparison_feature as comparison_feature_core +import hlink.linking.core.pipeline as pipeline_core +from pyspark.ml import Pipeline + + +def test_rel_jaro_winkler_comparison(spark, conf, datasource_rel_jw_input): + """ Test the comparison feature data output """ + + table_a, table_b = datasource_rel_jw_input + features = [ + { + "alias": "rel_birthyr", + "column_name": "namefrst_related_rows", + "birthyr_col": "replaced_birthyr", + "comparison_type": "rel_jaro_winkler", + "jw_threshold": 0.9, + "age_threshold": 5, + "lower_threshold": 1, + }, + { + "alias": "rel_replaced_birthyr", + "column_name": "namefrst_related_rows_birthyr", + "comparison_type": "rel_jaro_winkler", + "jw_threshold": 0.9, + "age_threshold": 5, + "lower_threshold": 1, + }, + ] + sql_expr_0 = comparison_feature_core.generate_comparison_feature( + features[0], conf["id_column"], include_as=True + ) + sql_expr_1 = comparison_feature_core.generate_comparison_feature( + features[1], conf["id_column"], include_as=True + ) + + table_a.createOrReplaceTempView("table_a") + table_b.createOrReplaceTempView("table_b") + + df = spark.sql( + f"select a.id as id_a, b.id as id_b, {sql_expr_0}, {sql_expr_1} from table_a a cross join table_b b" + ).toPandas() + + assert df.query("id_a == '0' and id_b == '0'")["rel_birthyr"].iloc[0] + assert not df.query("id_a == '0' and id_b == '1'")["rel_birthyr"].iloc[0] + assert df.query("id_a == '0' and id_b == '0'")["rel_replaced_birthyr"].iloc[0] + assert not df.query("id_a == '0' and id_b == '1'")["rel_replaced_birthyr"].iloc[0] + + +def test_extra_children_comparison(spark, conf, datasource_extra_children_input): + """ Test the comparison feature data output """ + + table_a, table_b = datasource_extra_children_input + conf["id_column"] = "histid" + features = [ + { + "alias": "extra_children", + "year_b": 1910, + "column_name": "namefrst_related_rows", + "relate_col": "relate", + "histid_col": "histid", + "birthyr_col": "birthyr", + "name_col": "namefrst", + "comparison_type": "extra_children", + "jw_threshold": 0.8, + "age_threshold": 2, + } + ] + sql_expr_0 = comparison_feature_core.generate_comparison_feature( + features[0], conf["id_column"], include_as=True + ) + + table_a.createOrReplaceTempView("table_a") + table_b.createOrReplaceTempView("table_b") + + df = spark.sql( + f"select a.histid as histid_a, b.histid as histid_b, {sql_expr_0} from table_a a cross join table_b b" + ).toPandas() + + assert df.query("histid_a == 0 and histid_b == 8")["extra_children"].iloc[0] == 0 + assert df.query("histid_a == 0 and histid_b == 11")["extra_children"].iloc[0] == 2 + assert df.query("histid_a == 0 and histid_b == 15")["extra_children"].iloc[0] == 0 + + assert df.query("histid_a == 4 and histid_b == 8")["extra_children"].iloc[0] == 0 + assert df.query("histid_a == 4 and histid_b == 11")["extra_children"].iloc[0] == 2 + assert df.query("histid_a == 4 and 
histid_b == 15")["extra_children"].iloc[0] == 0 + + assert df.query("histid_a == 7 and histid_b == 8")["extra_children"].iloc[0] == 0 + assert df.query("histid_a == 7 and histid_b == 11")["extra_children"].iloc[0] == 0 + assert df.query("histid_a == 7 and histid_b == 15")["extra_children"].iloc[0] == 0 + + assert df.query("histid_a == 17 and histid_b == 8")["extra_children"].iloc[0] == 1 + assert df.query("histid_a == 17 and histid_b == 11")["extra_children"].iloc[0] == 2 + assert df.query("histid_a == 17 and histid_b == 15")["extra_children"].iloc[0] == 0 + + +def test_comparison_and_mi(spark, conf, datasource_mi_comparison): + """ Test the comparison feature data output """ + + table_a, table_b = datasource_mi_comparison + features = [ + { + "alias": "mi_old", + "column_name": "namefrst_mid_init", + "comparison_type": "and", + "comp_a": {"column_name": "namefrst_mid_init", "comparison_type": "equals"}, + "comp_b": { + "column_name": "namefrst_mid_init", + "comparison_type": "neither_are_null", + }, + }, + { + "alias": "mi", + "column_name": "namefrst_mid_init", + "comparison_type": "present_and_matching_categorical", + }, + ] + + sql_expr_0 = comparison_feature_core.generate_comparison_feature( + features[0], conf["id_column"], include_as=True + ) + + sql_expr_1 = comparison_feature_core.generate_comparison_feature( + features[1], conf["id_column"], include_as=True + ) + + table_a.createOrReplaceTempView("table_a") + table_b.createOrReplaceTempView("table_b") + + df = spark.sql( + f"select a.id as id_a, b.id as id_b, a.namefrst_mid_init as namefrst_mid_init_a, b.namefrst_mid_init as namefrst_mid_init_b, {sql_expr_0}, {sql_expr_1} from table_a a cross join table_b b" + ).toPandas() + + assert df.query("id_a == 10 and id_b == 40")["mi_old"].iloc[0] + assert not df.query("id_a == 20 and id_b == 40")["mi_old"].iloc[0] + assert not df.query("id_a == 20 and id_b == 50")["mi_old"].iloc[0] + assert not df.query("id_a == 20 and id_b == 60")["mi_old"].iloc[0] + assert not df.query("id_a == 30 and id_b == 50")["mi_old"].iloc[0] + assert not df.query("id_a == 30 and id_b == 60")["mi_old"].iloc[0] + + assert df.query("id_a == 10 and id_b == 40")["mi"].iloc[0] == 0 + assert df.query("id_a == 10 and id_b == 50")["mi"].iloc[0] == 2 + assert df.query("id_a == 10 and id_b == 60")["mi"].iloc[0] == 2 + assert df.query("id_a == 20 and id_b == 40")["mi"].iloc[0] == 1 + assert df.query("id_a == 20 and id_b == 50")["mi"].iloc[0] == 2 + assert df.query("id_a == 20 and id_b == 60")["mi"].iloc[0] == 2 + assert df.query("id_a == 30 and id_b == 40")["mi"].iloc[0] == 2 + assert df.query("id_a == 30 and id_b == 50")["mi"].iloc[0] == 2 + assert df.query("id_a == 30 and id_b == 60")["mi"].iloc[0] == 2 + + +def test_immyr_diff_w_imm_caution(spark, conf): + """ Test the comparison feature data output """ + + data_a = [ + (0, 5, 1900), + (1, 5, 1900), + (2, 5, 1900), + (3, 5, 1900), + (4, 5, 1900), + (5, 5, 1900), + (6, 5, 1900), + (7, 1, 0000), + (8, 1, 0000), + ] + data_b = [ + (0, 5, 1900), + (1, 5, 1901), + (2, 2, 1905), + (3, 3, 1906), + (4, 5, 1910), + (5, 5, 1911), + (6, 5, 1912), + (7, 1, 0000), + (8, 0, 0000), + ] + + table_a = spark.createDataFrame(data_a, ["id", "nativity", "yrimmig"]) + table_b = spark.createDataFrame(data_b, ["id", "nativity", "yrimmig"]) + + features = [ + { + "alias": "imm", + "column_name": "nativity", + "comparison_type": "fetch_a", + "threshold": 5, + "categorical": True, + }, + { + "alias": "immyear_diff", + "column_name": "yrimmig", + "comparison_type": "abs_diff", + "look_at_addl_var": 
True, + "addl_var": "nativity", + "datasource": "a", + "check_val_expr": "= 5", + "else_val": -1, + }, + ] + + expr0 = comparison_feature_core.generate_comparison_feature( + features[0], conf["id_column"], include_as=True + ) + expr1 = comparison_feature_core.generate_comparison_feature( + features[1], conf["id_column"], include_as=True + ) + + table_a.createOrReplaceTempView("table_a") + table_b.createOrReplaceTempView("table_b") + + df0 = spark.sql( + f"select a.id as id_a, b.id as id_b, {expr0}, {expr1} from table_a a join table_b b on a.id == b.id" + ) + df = df0.toPandas() + + assert df.query("id_a == 0")["imm"].iloc[0] + assert df.query("id_a == 1")["imm"].iloc[0] + assert df.query("id_a == 2")["imm"].iloc[0] + assert df.query("id_a == 3")["imm"].iloc[0] + assert not df.query("id_a == 7")["imm"].iloc[0] + assert not df.query("id_a == 8")["imm"].iloc[0] + + assert df.query("id_a == 0")["immyear_diff"].iloc[0] == 0 + assert df.query("id_a == 4")["immyear_diff"].iloc[0] == 10 + assert df.query("id_a == 7")["immyear_diff"].iloc[0] == -1 + assert df.query("id_a == 8")["immyear_diff"].iloc[0] == -1 + + df0.createOrReplaceTempView("training_features") + + conf["pipeline_features"] = [ + { + "input_column": "immyear_diff", + "output_column": "immyear_caution", + "transformer_type": "bucketizer", + "categorical": True, + "splits": [-1, 0, 6, 11, 9999], + } + ] + conf["training"] = { + "dependent_var": "match", + "independent_vars": ["immyear_diff", "immyear_caution"], + } + conf["comparison_features"] = [] + + ind_vars = conf["training"]["independent_vars"] + tf = spark.table("training_features") + pipeline_stages = pipeline_core.generate_pipeline_stages( + conf, ind_vars, tf, "training" + ) + prep_pipeline = Pipeline(stages=pipeline_stages) + prep_model = prep_pipeline.fit(tf) + prepped_data = prep_model.transform(tf) + prepped_data = prepped_data.toPandas() + + assert prepped_data.query("id_a == 0")["immyear_caution"].iloc[0] == 1 + assert prepped_data.query("id_a == 1")["immyear_caution"].iloc[0] == 1 + assert prepped_data.query("id_a == 2")["immyear_caution"].iloc[0] == 1 + assert prepped_data.query("id_a == 3")["immyear_caution"].iloc[0] == 2 + assert prepped_data.query("id_a == 4")["immyear_caution"].iloc[0] == 2 + assert prepped_data.query("id_a == 5")["immyear_caution"].iloc[0] == 3 + assert prepped_data.query("id_a == 6")["immyear_caution"].iloc[0] == 3 + assert prepped_data.query("id_a == 7")["immyear_caution"].iloc[0] == 0 + assert prepped_data.query("id_a == 8")["immyear_caution"].iloc[0] == 0 diff --git a/hlink/tests/hh_matching_test.py b/hlink/tests/hh_matching_test.py new file mode 100644 index 0000000..9748a86 --- /dev/null +++ b/hlink/tests/hh_matching_test.py @@ -0,0 +1,300 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from hlink.linking.matching.link_step_score import LinkStepScore +from hlink.tests.conftest import load_table_from_csv + + +def test_step_0_filter_and_pair(spark, hh_matching_stubs, hh_matching, conf): + """ Test hh_matching step 0 to make sure hh_blocked_matches is created correctly """ + + conf["id_column"] = "histid" + conf["hh_training"] = {"prediction_col": "prediction"} + path_a, path_b, path_matches, path_pred_matches = hh_matching_stubs + + load_table_from_csv(hh_matching, path_a, "prepped_df_a") + load_table_from_csv(hh_matching, path_b, "prepped_df_b") + load_table_from_csv(hh_matching, path_matches, "scored_potential_matches") + load_table_from_csv(hh_matching, path_pred_matches, "predicted_matches") + + hh_matching.run_step(0) + + # Create pandas DFs of the step_0 blocked matches table + blocked_matches_hh_df = spark.table("hh_blocked_matches").toPandas() + + # Make assertions on the data + assert blocked_matches_hh_df.shape[0] == 9 + assert blocked_matches_hh_df.query("serialp_a == '1'").shape == (9, 4) + assert blocked_matches_hh_df.query("serialp_b == '8'").shape == (6, 4) + assert blocked_matches_hh_df.query("serialp_b == '7'").shape == (3, 4) + + +def test_household_matching_training_integration( + spark, hh_training, hh_matching, hh_training_conf, hh_integration_test_data +): + """ Test all hh_training and hh_matching steps to ensure they work as a pipeline """ + path_a, path_b, path_pms = hh_integration_test_data + + load_table_from_csv(hh_matching, path_a, "prepped_df_a") + load_table_from_csv(hh_matching, path_b, "prepped_df_b") + load_table_from_csv(hh_matching, path_pms, "predicted_matches") + + hh_training.run_step(0) + hh_training.run_step(1) + + assert spark.table("hh_training_data").toPandas().shape == (4349, 60) + hhtf = spark.table("hh_training_features").toPandas() + assert hhtf.shape == (4349, 10) + assert all( + elem in list(hhtf.columns) + for elem in [ + "histid_a", + "histid_b", + "serialp_a", + "serialp_b", + "match", + "byrdiff", + "ssex", + "srelate", + "namefrst_jw", + "namelast_jw", + ] + ) + + hh_training.run_step(2) + + hh_matching.run_step(0) + + assert spark.table("indiv_matches").count() == 526 + assert spark.table("unmatched_a").count() == 1787 + assert spark.table("unmatched_b").count() == 1716 + assert spark.table("hh_blocked_matches").count() == 7681 + + hh_matching.run_step(1) + hh_matching.run_step(2) + + assert spark.table("hh_potential_matches_prepped").toPandas().shape == (7681, 9) + hhspms = spark.table("hh_scored_potential_matches").toPandas() + assert all( + elem in list(hhspms.columns) + for elem in [ + "serialp_a", + "serialp_b", + "histid_a", + "histid_b", + "byrdiff", + "srelate", + "namelast_jw", + "namefrst_jw", + "ssex", + "byrdiff_imp", + "namefrst_jw_imp", + "namelast_jw_imp", + "ssex_onehotencoded", + "srelate_onehotencoded", + "features_vector", + "rawPrediction", + "probability_array", + "probability", + "prediction", + ] + ) + assert ( + hhspms.query( + "histid_a == '95004C6D-041B-4F18-8140-07B3296A16E0' and histid_b == '65FA5A10-2E13-4CC0-BDAE-F0D395950DD7'" + )["prediction"].iloc[0] + == 0 + ) + assert ( + hhspms.query( + "histid_a == '95004C6D-041B-4F18-8140-07B3296A16E0' and histid_b == '6BA3BFF0-AAF5-4C11-AE97-84517DE40234'" + )["prediction"].iloc[0] + == 1 + ) + + assert ( + hhspms.query( + "histid_a == '01301C1A-28FE-41CF-8DAB-83E80DBAC4D6' 
and histid_b == '18F9B112-3971-4222-B124-593E88F8FF5B'" + )["prediction"].iloc[0] + == 0 + ) + assert ( + hhspms.query( + "histid_a == '24E352C2-9CB3-458E-98F9-70EE2AD82D89' and histid_b == '18F9B112-3971-4222-B124-593E88F8FF5B'" + )["prediction"].iloc[0] + == 1 + ) + + +def test_hh_agg_features( + spark, hh_agg_features_test_data, hh_matching, hh_agg_feat_conf +): + """ Ensure proper creation of aggregate features on hh potential matches """ + + path_a, path_b, path_pms = hh_agg_features_test_data + + load_table_from_csv(hh_matching, path_a, "prepped_df_a") + load_table_from_csv(hh_matching, path_b, "prepped_df_b") + load_table_from_csv(hh_matching, path_pms, "hh_potential_matches") + + hh_matching.training_conf = "hh_training" + hh_matching.table_prefix = "hh_" + + LinkStepScore(hh_matching)._create_features(hh_agg_feat_conf) + + pm = spark.table("hh_potential_matches_prepped").toPandas() + + # Make assertions on the data + assert pm.query( + "histid_a == 'B4DFC0CB-205F-4087-B95D-81992AFBBF0E' and histid_b == '0A9BDD32-CF94-4E60-ACE1-D2745C305795'" + )["mardurmatch"].iloc[0] + assert ( + round( + pm.query( + "histid_a == 'B4DFC0CB-205F-4087-B95D-81992AFBBF0E' and histid_b == '0A9BDD32-CF94-4E60-ACE1-D2745C305795'" + )["jw_max_a"].iloc[0], + 2, + ) + == 0.46 + ) + assert ( + pm.query( + "histid_a == 'B4DFC0CB-205F-4087-B95D-81992AFBBF0E' and histid_b == '0A9BDD32-CF94-4E60-ACE1-D2745C305795'" + )["jw_max_b"].iloc[0] + == 0 + ) + assert ( + pm.query( + "histid_a == 'B4DFC0CB-205F-4087-B95D-81992AFBBF0E' and histid_b == '0A9BDD32-CF94-4E60-ACE1-D2745C305795'" + )["f1_match"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == 'B4DFC0CB-205F-4087-B95D-81992AFBBF0E' and histid_b == '0A9BDD32-CF94-4E60-ACE1-D2745C305795'" + )["f2_match"].iloc[0] + == 1 + ) + + assert ( + pm.query( + "histid_a == '6244C5B1-DCB6-47F2-992E-A408225C2AE2' and histid_b == '625DA33B-0623-4060-87E7-2F542C9B5524'" + )["jw_max_a"].iloc[0] + == 0 + ) + assert ( + pm.query( + "histid_a == '6244C5B1-DCB6-47F2-992E-A408225C2AE2' and histid_b == '625DA33B-0623-4060-87E7-2F542C9B5524'" + )["jw_max_b"].iloc[0] + == 0 + ) + assert ( + pm.query( + "histid_a == '6244C5B1-DCB6-47F2-992E-A408225C2AE2' and histid_b == '625DA33B-0623-4060-87E7-2F542C9B5524'" + )["f1_match"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == '6244C5B1-DCB6-47F2-992E-A408225C2AE2' and histid_b == '625DA33B-0623-4060-87E7-2F542C9B5524'" + )["f2_match"].iloc[0] + == 2 + ) + + assert ( + pm.query( + "histid_a == '709916FD-D95D-4D22-B5C0-0C3ADBF88EEC' and histid_b == '51342DAE-AC53-4605-8DD9-FC5E94C235F8'" + )["jw_max_a"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == '709916FD-D95D-4D22-B5C0-0C3ADBF88EEC' and histid_b == '51342DAE-AC53-4605-8DD9-FC5E94C235F8'" + )["jw_max_b"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == '709916FD-D95D-4D22-B5C0-0C3ADBF88EEC' and histid_b == '51342DAE-AC53-4605-8DD9-FC5E94C235F8'" + )["f1_match"].iloc[0] + == 2 + ) + assert ( + pm.query( + "histid_a == '709916FD-D95D-4D22-B5C0-0C3ADBF88EEC' and histid_b == '51342DAE-AC53-4605-8DD9-FC5E94C235F8'" + )["f2_match"].iloc[0] + == 0 + ) + assert not pm.query( + "histid_a == '709916FD-D95D-4D22-B5C0-0C3ADBF88EEC' and histid_b == '51342DAE-AC53-4605-8DD9-FC5E94C235F8'" + )["mardurmatch"].iloc[0] + + assert ( + pm.query( + "histid_a == '43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4' and histid_b == '99CF8208-1F3D-4B62-80A4-C95FDD5D41F2'" + )["jw_max_a"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == '43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4' and 
histid_b == '99CF8208-1F3D-4B62-80A4-C95FDD5D41F2'" + )["jw_max_b"].iloc[0] + == 1 + ) + assert ( + pm.query( + "histid_a == '43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4' and histid_b == '99CF8208-1F3D-4B62-80A4-C95FDD5D41F2'" + )["f1_match"].iloc[0] + == 2 + ) + assert ( + pm.query( + "histid_a == '43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4' and histid_b == '99CF8208-1F3D-4B62-80A4-C95FDD5D41F2'" + )["f2_match"].iloc[0] + == 1 + ) + + +def test_step_0_1_hh_blocking_and_filtering( + spark, hh_matching_stubs, hh_matching, conf +): + """ Test hh post-blocking filter works on hh_blocked_matches using a comparison feature """ + + conf["id_column"] = "histid" + conf["hh_training"] = {"prediction_col": "prediction"} + conf["hh_comparisons"] = { + "comparison_type": "threshold", + "feature_name": "agediff", + "threshold_expr": "<= 10", + } + conf["comparison_features"] = [ + {"alias": "agediff", "column_name": "birthyr", "comparison_type": "abs_diff"} + ] + + path_a, path_b, path_matches, path_pred_matches = hh_matching_stubs + + load_table_from_csv(hh_matching, path_a, "prepped_df_a") + load_table_from_csv(hh_matching, path_b, "prepped_df_b") + load_table_from_csv(hh_matching, path_matches, "scored_potential_matches") + load_table_from_csv(hh_matching, path_pred_matches, "predicted_matches") + + hh_matching.run_step(0) + hh_matching.run_step(1) + + # Create pandas DFs of the step_2 potential matches table + blocked_matches_hh_df = spark.table("hh_blocked_matches").toPandas() + potential_matches_hh_df = spark.table("hh_potential_matches").toPandas() + + # Make assertions on the data + assert blocked_matches_hh_df.shape[0] == 9 + assert blocked_matches_hh_df.query("serialp_a == '1'").shape == (9, 4) + assert blocked_matches_hh_df.query("serialp_b == '8'").shape == (6, 4) + assert blocked_matches_hh_df.query("serialp_b == '7'").shape == (3, 4) + + assert potential_matches_hh_df.shape[0] == 3 + assert potential_matches_hh_df.query("histid_a == '1004A'").shape[0] == 2 + assert potential_matches_hh_df.query("histid_a == '1005A'").shape[0] == 1 + + assert all(elem <= 10 for elem in list(potential_matches_hh_df["agediff"])) diff --git a/hlink/tests/hh_model_exploration_test.py b/hlink/tests/hh_model_exploration_test.py new file mode 100644 index 0000000..ec1f03c --- /dev/null +++ b/hlink/tests/hh_model_exploration_test.py @@ -0,0 +1,126 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pandas as pd + + +def test_all_hh_mod_ev( + spark, + main, + hh_training_conf, + hh_integration_test_data, + hh_model_exploration, + hh_training_data_path, +): + """ Integration test for hh model eval steps 0, 1, and 2 with two models """ + path_a, path_b, path_pms = hh_integration_test_data + hh_model_exploration.spark.read.csv( + path_a, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("prepped_df_a") + hh_model_exploration.spark.read.csv( + path_b, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("prepped_df_b") + + hh_model_exploration.run_step(0) + hh_model_exploration.run_step(1) + hh_model_exploration.run_step(2) + + prc = spark.table( + "hh_model_eval_precision_recall_curve_logistic_regression__" + ).toPandas() + assert all( + elem in list(prc.columns) + for elem in ["params", "precision", "recall", "threshold_gt_eq"] + ) + prc_rf = spark.table( + "hh_model_eval_precision_recall_curve_random_forest__maxdepth___5_0___numtrees___75_0_" + ).toPandas() + assert all( + elem in list(prc_rf.columns) + for elem in ["params", "precision", "recall", "threshold_gt_eq"] + ) + + tr = spark.table("hh_model_eval_training_results").toPandas() + assert all( + elem in list(tr.columns) + for elem in [ + "model", + "parameters", + "alpha_threshold", + "threshold_ratio", + "precision_test_mean", + "precision_test_sd", + "recall_test_mean", + "recall_test_sd", + "mcc_test_sd", + "mcc_test_mean", + "precision_train_mean", + "precision_train_sd", + "recall_train_mean", + "recall_train_sd", + "pr_auc_mean", + "pr_auc_sd", + "mcc_train_mean", + "mcc_train_sd", + "maxDepth", + "numTrees", + ] + ) + assert tr.__len__() == 2 + assert ( + tr.query("model == 'logistic_regression'")["precision_test_mean"].iloc[0] > 0.9 + ) + assert tr.query("model == 'logistic_regression'")["alpha_threshold"].iloc[0] == 0.5 + assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 5 + assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 0.9 + assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.9 + assert tr.query("model == 'logistic_regression'")["recall_test_mean"].iloc[0] > 0.9 + + preds = spark.table("hh_model_eval_predictions").toPandas() + assert all( + elem in list(preds.columns) + for elem in [ + "histid_a", + "histid_b", + "probability_array", + "probability", + "second_best_prob", + "ratio", + "prediction", + "match", + ] + ) + pm0 = preds.query( + "histid_a == '5DD3EBA2-D1E4-4E22-921A-0FFB59115932' and histid_b == '4B38B1CF-6DCE-4D92-BCEE-A591208D3D68'" + ) + assert pm0["second_best_prob"].round(2).iloc[0] >= 0.90 + assert pm0["ratio"].round(2).iloc[0] >= 1.01 + assert pm0["prediction"].iloc[0] == 0 + assert pm0["probability"].round(2).iloc[0] >= 0.93 + + pred_train = spark.table("hh_model_eval_predict_train").toPandas() + assert all( + elem in list(pred_train.columns) + for elem in [ + "histid_a", + "histid_b", + "probability_array", + "probability", + "second_best_prob", + "ratio", + "prediction", + "match", + ] + ) + pm1 = pred_train.query( + "histid_a == '014470B1-63E5-4A64-BB9A-70F5A9340130' and histid_b == '4E713690-5206-41FD-ABE1-5F545E55A5BB'" + ) + assert pm1["match"].iloc[0] == 1 + assert pm1["probability"].iloc[0] > 0.9 + assert pm1["second_best_prob"].iloc[0] < 0.2 + assert pd.isnull(pm1["ratio"].iloc[0]) + assert 
pm1["prediction"].iloc[0] == 1 + + main.do_drop_all("") diff --git a/hlink/tests/input_data/19thc_nativity_test_hhs.csv b/hlink/tests/input_data/19thc_nativity_test_hhs.csv new file mode 100644 index 0000000..1fcb34a --- /dev/null +++ b/hlink/tests/input_data/19thc_nativity_test_hhs.csv @@ -0,0 +1,16 @@ +ID,SERIAL,PERNUM,RELATE,MOMLOC,POPLOC,BPL,NAMEFRST,NAMELAST,KEY_NATIVITY,KEY_MBPL,KEY_FBPL +1001,1001,1,1,0,4,27,PAUL,BUNYAN,0,,27 +1002,1001,2,2,0,0,27,PAULETTE,BUNYAN,0,, +1003,1001,3,3,2,1,27,BABE,BUNYAN,1,27,27 +1004,1001,4,5,0,0,27,PAUL SR,BUNYAN,0,, +1005,1002,1,1,0,0,410,GEORGE,BURNS,5,, +1006,1002,2,2,0,0,27,LOUIE,BURNS,0,, +1007,1002,3,3,2,1,27,KEN,BURNS,2,27,410 +1008,1003,1,1,0,0,27,CYRANO,DE LA SOURCE,0,, +1009,1003,2,2,0,0,421,MANON,DE LA SOURCE,5,, +1010,1003,3,3,2,1,27,BEBE,DE LA SOURCE,3,421,27 +1011,1004,1,1,5,4,500,ZHOU,HUA,5,500,500 +1012,1004,2,2,0,0,500,LI,HUA,5,, +1013,1004,3,3,2,1,27,MULAN,HUA,4,500,500 +1014,1004,4,5,0,0,500,ZHOU,HUA,5,, +1015,1004,5,5,0,0,500,LI,HUA,5,, \ No newline at end of file diff --git a/hlink/tests/input_data/19thc_nativity_test_hhs_a.csv b/hlink/tests/input_data/19thc_nativity_test_hhs_a.csv new file mode 100644 index 0000000..9511edd --- /dev/null +++ b/hlink/tests/input_data/19thc_nativity_test_hhs_a.csv @@ -0,0 +1,18 @@ +ID,SERIAL,PERNUM,RELATE,MOMLOC,STEPMOM,POPLOC,BPL,NAMEFRST,NAMELAST,BIRTHYR,NATIVITY,KEY_NATIVITY_CALC,KEY_MBPL,KEY_FBPL,KEY_MOTHER_NATIVITY,KEY_MBPL_RANGE,KEY_MBPL_MATCH,KEY_FBPL_MATCH,KEY_MFBPL_MATCH,KEY_M_CAUTION_1870_1880,KEY_M_CAUTION_1850_1860,KEY_INTERMEDIATE_MBPL_RANGE_NOT_EQUALS,KEY_INTERMEDIATE_MBPL_RANGE_NOT_ZERO_AND_NOT_EQUALS,KEY_INTERMEDIATE_MOTHER_BIRTHYR_ABS_DIFF_5,KEY_INTERMEDIATE_STEPMOM_PARENT_STEP_CHANGE,KEY_INTERMEDIATE_MOMLOC_PRESENT_BOTH_YEARS,KEY_M_CAUTION_CC3_012,KEY_M_CAUTION_CC4_012 +1001,1001,1,1,0,0,4,27,PAUL,BUNYAN,1850,1,0,,27,,0,2,0,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1002,1001,2,2,0,0,0,27,PAULETTE,BUNYAN,1849,1,0,,,,0,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1003,1001,3,3,2,0,1,27,BABE,BUNYAN,1910,1,1,27,27,1,1,0,0,1,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,1,1 +1004,1001,4,5,0,0,0,27,PAUL SR,BUNYAN,1830,4,0,,,,0,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1005,1002,1,1,0,0,0,410,GEORGE,BURNS,1840,5,5,,,,0,2,2,1,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,2,2 +1006,1002,2,2,0,0,0,27,LOUIE,BURNS,1845,1,0,,,,0,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1007,1002,3,3,2,3,1,27,KEN,BURNS,1905,2,2,27,410,1,1,0,0,1,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,TRUE,0,1 +1008,1003,1,1,0,0,0,27,CYRANO,DE LA SOURCE,1841,1,0,,,,0,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1009,1003,2,2,0,0,0,421,MANON,DE LA SOURCE,1841,5,5,,,,0,2,2,1,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1010,1003,3,3,2,0,1,27,BEBE,DE LA SOURCE,1870,3,3,421,27,5,2,0,0,1,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,0,0 +1011,1004,1,1,5,0,4,500,ZHOU,HUA,1845,5,5,500,500,5,2,0,0,1,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,0,0 +1012,1004,2,2,0,0,0,36,LI,HUA,1844,1,0,,,,0,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 +1013,1004,3,3,2,0,1,27,MULAN,HUA,1907,1,2,36,500,1,1,1,0,0,TRUE,TRUE,TRUE,TRUE,FALSE,TRUE,TRUE,1,1 +1014,1004,4,5,0,0,0,500,ZHOU,HUA,1825,5,5,,,,0,2,2,1,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,2,2 +1015,1004,5,5,0,0,0,500,LI,HUA,1823,5,5,,,,0,2,2,1,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,2,2 +1016,1005,1,1,0,0,0,27,MOMMA,BEAR,1843,1,0,,,,0,2,2,0,FALSE,FALSE,,,,,,, +1017,1005,2,3,1,0,0,27,BABY,BEAR,1860,1,0,27,,1,1,2,2,0,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,2,2 \ No newline 
at end of file diff --git a/hlink/tests/input_data/19thc_nativity_test_hhs_b.csv b/hlink/tests/input_data/19thc_nativity_test_hhs_b.csv new file mode 100644 index 0000000..155f4ea --- /dev/null +++ b/hlink/tests/input_data/19thc_nativity_test_hhs_b.csv @@ -0,0 +1,17 @@ +ID,SERIAL,PERNUM,RELATE,MOMLOC,STEPMOM,POPLOC,BPL,NAMEFRST,NAMELAST,BIRTHYR,KEY_NATIVITY_CALC,KEY_MBPL,KEY_FBPL,MBPL,KEY_MBPL_RANGE_B +1001,1001,1,1,0,0,4,27,PAUL,BUNYAN,1850,0,,27,27,1 +1002,1001,2,2,0,0,0,27,PAULETTE,BUNYAN,1855,0,,,27,1 +1003,1001,3,3,2,0,1,27,BABE,BUNYAN,1913,1,27,27,27,1 +1004,1001,4,5,0,0,0,27,PAUL SR,BUNYAN,1830,0,,,404,2 +1005,1002,1,1,0,0,0,410,GEORGE,BURNS,1840,5,,,997,0 +1006,1002,2,2,0,0,0,27,LOUIE,BURNS,1840,0,,,410,2 +1007,1002,3,3,2,0,1,27,KEN,BURNS,1905,2,27,410,27,1 +1008,1003,1,1,0,0,0,27,CYRANO,DE LA SOURCE,1841,0,,,421,2 +1009,1003,2,2,0,0,0,421,MANON,DE LA SOURCE,1841,5,,,421,2 +1010,1003,3,3,2,0,1,27,BEBE,DE LA SOURCE,1870,3,421,27,421,2 +1011,1004,1,1,5,0,4,500,ZHOU,HUA,1845,5,500,500,500,2 +1012,1004,2,2,0,0,0,500,LI,HUA,1844,5,,,500,2 +1013,1004,3,3,2,2,1,27,MULAN,HUA,1907,4,500,500,500,2 +1014,1004,4,5,0,0,0,500,ZHOU,HUA,1825,5,,,999,0 +1015,1004,5,5,0,0,0,500,LI,HUA,1823,5,,,999,0 +1017,1005,1,1,0,0,0,27,BABY,BEAR,1860,1,,,997,0 \ No newline at end of file diff --git a/hlink/tests/input_data/birthyr_replace.csv b/hlink/tests/input_data/birthyr_replace.csv new file mode 100644 index 0000000..b632164 --- /dev/null +++ b/hlink/tests/input_data/birthyr_replace.csv @@ -0,0 +1,6 @@ +histid,birthyr,yearp,age,bpl,state1,state2 +a01,1999,1900,36,3300,washington,washington +b02,,1900,42,3043,washington, +b00,,1900,,4799,washington,washington dc +c03,1901,1900,,40000,,washington +d04,1850,1900,50,100,, \ No newline at end of file diff --git a/hlink/tests/input_data/calc_mfbpl_a.csv b/hlink/tests/input_data/calc_mfbpl_a.csv new file mode 100644 index 0000000..ea8feb6 --- /dev/null +++ b/hlink/tests/input_data/calc_mfbpl_a.csv @@ -0,0 +1,16 @@ +id,mbpl_calc,fbpl_calc,mbpl_range,nativity_calc,key_mbpl_match,key_fbpl_match,key_mfbpl_match,key_m_caution_1870_1880,key_m_caution_1850_1860 +1001,,27,0,0,2,0,0,FALSE,FALSE +1002,,11,0,0,2,2,0,FALSE,FALSE +1003,27,27,1,1,0,0,1,TRUE,FALSE +1004,11,423,0,2,2,1,0,FALSE,FALSE +1005,20,,0,5,1,2,1,FALSE,TRUE +1006,,,0,0,2,2,0,FALSE,FALSE +1007,27,410,1,2,0,0,1,FALSE,FALSE +1008,,,0,0,2,2,0,FALSE,FALSE +1009,23,,0,5,1,2,1,FALSE,TRUE +1010,421,27,2,3,0,0,1,FALSE,FALSE +1011,500,500,2,5,0,0,1,FALSE,FALSE +1012,,,1,5,2,2,1,TRUE,FALSE +1013,500,500,2,4,0,0,1,FALSE,FALSE +1014,,26,0,5,2,1,1,FALSE,FALSE +1015,997,,0,5,1,2,0,FALSE,TRUE \ No newline at end of file diff --git a/hlink/tests/input_data/calc_mfbpl_b.csv b/hlink/tests/input_data/calc_mfbpl_b.csv new file mode 100644 index 0000000..62a479a --- /dev/null +++ b/hlink/tests/input_data/calc_mfbpl_b.csv @@ -0,0 +1,16 @@ +id,mbpl_calc,fbpl_calc,mbpl_range,nativity_calc,key_mbpl_match,key_fbpl_match,key_mfbpl_match,key_m_caution_1870_1880,key_m_caution_1850_1860, +1001,,27,1,0,0,0,0,0,0,0 +1002,,,1,0,0,0,0,0,0,0 +1003,27,27,2,1,0,0,0,0,0,0 +1004,,455,2,0,0,0,0,0,0,0 +1005,21,,0,5,0,0,0,0,0,0 +1006,,,2,0,0,0,0,0,0,0 +1007,27,410,1,2,0,0,0,0,0,0 +1008,,,2,0,0,0,0,0,0,0 +1009,230,,2,5,0,0,0,0,0,0 +1010,421,27,2,3,0,0,0,0,0,0 +1011,500,500,2,5,0,0,0,0,0,0 +1012,15,,2,5,0,0,0,0,0,0 +1013,500,500,2,4,0,0,0,0,0,0 +1014,999,23,0,5,0,0,0,0,0,0 +1015,999,15,0,0,0,0,0,0,0,0 \ No newline at end of file diff --git a/hlink/tests/input_data/county_distances.csv b/hlink/tests/input_data/county_distances.csv new file mode 100644 
index 0000000..16cc36b --- /dev/null +++ b/hlink/tests/input_data/county_distances.csv @@ -0,0 +1,5 @@ +county0,county1,state0,state1,distance +170,10,3400,100,1441272.5 +170,570,3400,1200,1613771.4 +1210,10,5500,100,1378066.3 +1210,570,5500,1200,1996158.8 \ No newline at end of file diff --git a/hlink/tests/input_data/crosswalk/hh_predicted_matches.csv b/hlink/tests/input_data/crosswalk/hh_predicted_matches.csv new file mode 100644 index 0000000..f789bb2 --- /dev/null +++ b/hlink/tests/input_data/crosswalk/hh_predicted_matches.csv @@ -0,0 +1,2 @@ +histid_a,histid_b +12,61 diff --git a/hlink/tests/input_data/crosswalk/predicted_matches.csv b/hlink/tests/input_data/crosswalk/predicted_matches.csv new file mode 100644 index 0000000..c3bcc30 --- /dev/null +++ b/hlink/tests/input_data/crosswalk/predicted_matches.csv @@ -0,0 +1,6 @@ +histid_a,histid_b +0,50 +3,51 +4,52 +8,53 +11,59 diff --git a/hlink/tests/input_data/crosswalk/raw_df_a.csv b/hlink/tests/input_data/crosswalk/raw_df_a.csv new file mode 100644 index 0000000..7cb7d39 --- /dev/null +++ b/hlink/tests/input_data/crosswalk/raw_df_a.csv @@ -0,0 +1,28 @@ +histid,serial,pernum,sex,age,state,bpl +0,0,1,1,54,1,20 +1,0,2,2,55,1,20 +2,0,3,1,15,6,99 +3,0,4,1,80,5,99 +4,1,1,1,25,6,99 +5,1,2,1,26,6,99 +6,2,1,2,72,6,99 +7,3,1,2,42,4,99 +8,3,2,1,16,6,99 +9,3,3,1,13,6,99 +10,3,4,2,20,6,99 +11,4,1,1,45,1,25 +12,4,2,2,44,1,25 +13,4,3,1,24,6,99 +14,4,4,1,22,6,99 +15,4,5,1,20,6,99 +16,4,6,2,14,6,99 +17,4,7,2,12,6,99 +18,4,8,2,63,5,99 +19,5,1,2,30,2,10 +20,5,2,2,12,6,99 +21,5,3,2,10,6,99 +22,5,4,2,26,3,99 +23,5,5,2,24,2,3 +24,6,6,1,28,5,99 +25,6,7,2,6,6,99 +26,6,8,2,2,6,99 \ No newline at end of file diff --git a/hlink/tests/input_data/crosswalk/raw_df_b.csv b/hlink/tests/input_data/crosswalk/raw_df_b.csv new file mode 100644 index 0000000..4dbea56 --- /dev/null +++ b/hlink/tests/input_data/crosswalk/raw_df_b.csv @@ -0,0 +1,28 @@ +histid,serial,pernum,sex,age,state,bpl +50,0,1,1,64,1,20 +51,0,2,2,65,1,20 +52,0,3,1,25,6,99 +53,0,4,1,90,5,99 +54,1,1,1,35,6,99 +55,1,2,1,36,6,99 +56,2,1,2,82,6,99 +57,3,1,2,52,4,99 +58,3,2,1,26,6,99 +59,3,3,1,23,6,99 +60,3,4,2,30,6,99 +61,4,1,1,55,1,25 +62,4,2,2,54,1,25 +63,4,3,1,34,6,99 +64,4,4,1,32,6,99 +65,4,5,1,30,6,99 +66,4,6,2,24,6,99 +67,4,7,2,22,6,99 +68,4,8,2,73,5,99 +69,5,1,2,40,2,10 +70,5,2,2,22,6,99 +71,5,3,2,20,6,99 +72,5,4,2,36,3,99 +73,5,5,2,34,2,3 +74,6,6,1,38,5,99 +75,6,7,2,16,6,99 +76,6,8,2,12,6,99 \ No newline at end of file diff --git a/hlink/tests/input_data/female.csv b/hlink/tests/input_data/female.csv new file mode 100644 index 0000000..d9f724d --- /dev/null +++ b/hlink/tests/input_data/female.csv @@ -0,0 +1,4 @@ +catherine,katie +catherine,kitty +catherine,cathy +catherine,kat \ No newline at end of file diff --git a/hlink/tests/input_data/handle_null.csv b/hlink/tests/input_data/handle_null.csv new file mode 100644 index 0000000..2b304b9 --- /dev/null +++ b/hlink/tests/input_data/handle_null.csv @@ -0,0 +1,10 @@ +test_null,id_a,serialp_a,namelast_a,bpl_a,sex_a,region_a,id_b,serialp_b,namelast_b,bpl_b,sex_b,region_b,regionf,namelast_jw,state_distance,match +,10,A,Name,100,1,1,50,E,List,700,2,2,1,0.0,,0 +,10,A,Name,100,1,1,30,D,Last,500,2,2,1,0.5,354.29999,0 +,10,A,Name,100,1,1,10,C,Nameish,400,1,1,1,0.9142857142857143,1427.1,1 +,20,B,Last,200,2,2,50,E,List,700,2,2,2,0.8500000000000001,,0 +,20,B,Last,200,2,2,30,D,Last,500,2,2,2,1.0,3187.2,1 +,20,B,Last,200,2,2,10,C,Nameish,400,1,1,2,0.46428571428571425,2667.5,0 +,30,B,Lest,300,2,2,50,E,List,700,2,2,2,0.8500000000000001,,1 
+,30,B,Lest,300,2,2,30,D,Last,500,2,2,2,0.8500000000000001,,0 +,30,B,Lest,300,2,2,10,C,Nameish,400,1,1,2,0.46428571428571425,,0 diff --git a/hlink/tests/input_data/hh_predicted_matches_reporting.csv b/hlink/tests/input_data/hh_predicted_matches_reporting.csv new file mode 100644 index 0000000..81a6984 --- /dev/null +++ b/hlink/tests/input_data/hh_predicted_matches_reporting.csv @@ -0,0 +1,11 @@ +histid_a,histid_b +1,1 +2,2 +7,7 +9,9 +13,13 +15,15 +16,16 +21,21 +25,25 +26,26 \ No newline at end of file diff --git a/hlink/tests/input_data/hh_year_a.csv b/hlink/tests/input_data/hh_year_a.csv new file mode 100644 index 0000000..2fe14a2 --- /dev/null +++ b/hlink/tests/input_data/hh_year_a.csv @@ -0,0 +1,7 @@ +histid,namefrst,namelast,pernum,serialp,birthyr +1000A,Albert,Johnson,1,1,1840 +1001A,Mary,Johnson,2,1,1842 +1002A,Steve,Johnson,3,1,1868 +1003A,Robert,Johnson,4,1,1874 +1004A,Joanna,Johnson,5,1,1872 +1005A,Anna,Johnson,6,1,1892 \ No newline at end of file diff --git a/hlink/tests/input_data/hh_year_b.csv b/hlink/tests/input_data/hh_year_b.csv new file mode 100644 index 0000000..aebe50f --- /dev/null +++ b/hlink/tests/input_data/hh_year_b.csv @@ -0,0 +1,9 @@ +histid,namefrst,namelast,pernum,serialp,birthyr +1000B,Albert,Johnson,1,7,1840 +1001B,Mary,Johnson,2,7,1843 +1002B,John,Johnson,3,7,1870 +1003B,Steve,Johnson,4,7,1869 +1004B,Robert,Johnson,1,8,1875 +1005B,Joanna,Johnson,2,8,1872 +1006B,Jake,Johnson,3,8,1896 +1007B,Steve,Johnson,4,8,1899 \ No newline at end of file diff --git a/hlink/tests/input_data/hhpm_agg_test.csv b/hlink/tests/input_data/hhpm_agg_test.csv new file mode 100644 index 0000000..7255cb3 --- /dev/null +++ b/hlink/tests/input_data/hhpm_agg_test.csv @@ -0,0 +1,381 @@ +histid_a,serialp_a,histid_b,serialp_b +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +709916FD-D95D-4D22-B5C0-0C3ADBF88EEC,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +46624410-0DCC-45F5-8E08-E4BA17692CD4,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 
+4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +4DF9253C-6516-4053-8318-E0C5C872FE80,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +776C8A4D-CE68-404B-B52B-C6AD9A285F9D,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +6244C5B1-DCB6-47F2-992E-A408225C2AE2,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,7DFCB62B-23B5-4A25-8267-5CDB2337B2FD,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,5FE4DD78-CCAC-4C66-B209-5A0A8675D2F9,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,94E3236D-7E2C-4E28-9CB6-15106B47BF74,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,625DA33B-0623-4060-87E7-2F542C9B5524,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,AF835DF6-C33A-4F3C-B94B-B78637C4AE9B,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,B955DC88-1E56-45E7-BE38-8786B333C62B,1891516 +997F0AB8-E4C5-4382-9531-67361303CFFF,1329110,51342DAE-AC53-4605-8DD9-FC5E94C235F8,1891516 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,0A9BDD32-CF94-4E60-ACE1-D2745C305795,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,36236D33-B795-4BE5-9516-3D1FB0700BC2,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,90460A1A-ED76-4AD7-9FA6-AF3B5F630F85,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,9F24B15F-C21F-4EC7-9292-737D593DE3DA,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,11E99637-65C7-4944-BAB2-FD8224A7DB0B,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,99CF8208-1F3D-4B62-80A4-C95FDD5D41F2,4908719 +E4C9D4CF-1F52-4067-8C8F-2740A4A1B8F7,3818316,2124B5F6-E5C8-472E-83F2-B3B3AB95803D,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,0A9BDD32-CF94-4E60-ACE1-D2745C305795,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,36236D33-B795-4BE5-9516-3D1FB0700BC2,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,90460A1A-ED76-4AD7-9FA6-AF3B5F630F85,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,9F24B15F-C21F-4EC7-9292-737D593DE3DA,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,11E99637-65C7-4944-BAB2-FD8224A7DB0B,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,99CF8208-1F3D-4B62-80A4-C95FDD5D41F2,4908719 +2387B227-C124-4935-BBA7-D0024F088940,3818316,2124B5F6-E5C8-472E-83F2-B3B3AB95803D,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,0A9BDD32-CF94-4E60-ACE1-D2745C305795,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,36236D33-B795-4BE5-9516-3D1FB0700BC2,4908719 
+43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,90460A1A-ED76-4AD7-9FA6-AF3B5F630F85,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,9F24B15F-C21F-4EC7-9292-737D593DE3DA,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,11E99637-65C7-4944-BAB2-FD8224A7DB0B,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,99CF8208-1F3D-4B62-80A4-C95FDD5D41F2,4908719 +43C3C7F5-39E2-461D-B4F1-A0C5EA1750A4,3818316,2124B5F6-E5C8-472E-83F2-B3B3AB95803D,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,0A9BDD32-CF94-4E60-ACE1-D2745C305795,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,36236D33-B795-4BE5-9516-3D1FB0700BC2,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,90460A1A-ED76-4AD7-9FA6-AF3B5F630F85,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,9F24B15F-C21F-4EC7-9292-737D593DE3DA,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,11E99637-65C7-4944-BAB2-FD8224A7DB0B,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,99CF8208-1F3D-4B62-80A4-C95FDD5D41F2,4908719 +B655F2BB-2890-4D8A-9618-942419568DD5,3818316,2124B5F6-E5C8-472E-83F2-B3B3AB95803D,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,0A9BDD32-CF94-4E60-ACE1-D2745C305795,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,36236D33-B795-4BE5-9516-3D1FB0700BC2,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,90460A1A-ED76-4AD7-9FA6-AF3B5F630F85,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,9F24B15F-C21F-4EC7-9292-737D593DE3DA,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,11E99637-65C7-4944-BAB2-FD8224A7DB0B,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,99CF8208-1F3D-4B62-80A4-C95FDD5D41F2,4908719 +B4DFC0CB-205F-4087-B95D-81992AFBBF0E,3818316,2124B5F6-E5C8-472E-83F2-B3B3AB95803D,4908719 +456BF5A7-F211-4931-BF69-83456A5A9844,6371418,0DBAA064-3089-49DA-B9ED-CD2D0127DECD,7888369 +456BF5A7-F211-4931-BF69-83456A5A9844,6371418,DC04E38A-6CD4-4206-9C03-9D3440FAFDBB,7888369 +456BF5A7-F211-4931-BF69-83456A5A9844,6371418,390FC7AC-D98B-449C-8CDF-0340C1EE5D7E,7888369 +456BF5A7-F211-4931-BF69-83456A5A9844,6371418,31C46B49-5CF4-4603-B8E0-B65B7E1B16E8,7888369 +BD2A28F0-5461-4013-A963-955B180CE66C,6371418,0DBAA064-3089-49DA-B9ED-CD2D0127DECD,7888369 +BD2A28F0-5461-4013-A963-955B180CE66C,6371418,DC04E38A-6CD4-4206-9C03-9D3440FAFDBB,7888369 +BD2A28F0-5461-4013-A963-955B180CE66C,6371418,390FC7AC-D98B-449C-8CDF-0340C1EE5D7E,7888369 +BD2A28F0-5461-4013-A963-955B180CE66C,6371418,31C46B49-5CF4-4603-B8E0-B65B7E1B16E8,7888369 +4CC30AEC-171E-49E0-92E4-AB2A946CE939,6371418,0DBAA064-3089-49DA-B9ED-CD2D0127DECD,7888369 +4CC30AEC-171E-49E0-92E4-AB2A946CE939,6371418,DC04E38A-6CD4-4206-9C03-9D3440FAFDBB,7888369 +4CC30AEC-171E-49E0-92E4-AB2A946CE939,6371418,390FC7AC-D98B-449C-8CDF-0340C1EE5D7E,7888369 +4CC30AEC-171E-49E0-92E4-AB2A946CE939,6371418,31C46B49-5CF4-4603-B8E0-B65B7E1B16E8,7888369 +60324C2C-C972-492F-8C3D-7F5D0153CF50,6371418,0DBAA064-3089-49DA-B9ED-CD2D0127DECD,7888369 +60324C2C-C972-492F-8C3D-7F5D0153CF50,6371418,DC04E38A-6CD4-4206-9C03-9D3440FAFDBB,7888369 +60324C2C-C972-492F-8C3D-7F5D0153CF50,6371418,390FC7AC-D98B-449C-8CDF-0340C1EE5D7E,7888369 +60324C2C-C972-492F-8C3D-7F5D0153CF50,6371418,31C46B49-5CF4-4603-B8E0-B65B7E1B16E8,7888369 +A664E829-0E9E-446F-AE8A-9C43D49F1664,6371418,0DBAA064-3089-49DA-B9ED-CD2D0127DECD,7888369 +A664E829-0E9E-446F-AE8A-9C43D49F1664,6371418,DC04E38A-6CD4-4206-9C03-9D3440FAFDBB,7888369 +A664E829-0E9E-446F-AE8A-9C43D49F1664,6371418,390FC7AC-D98B-449C-8CDF-0340C1EE5D7E,7888369 +A664E829-0E9E-446F-AE8A-9C43D49F1664,6371418,31C46B49-5CF4-4603-B8E0-B65B7E1B16E8,7888369 
+E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +E4783732-D08E-4FDC-A1CB-4AF6CC938447,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +0EF37C23-9D24-4DA1-9D52-F67DA283F014,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +7D6E4B65-C17A-48DB-BD7D-5C7AC32C15C6,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 
+8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +8946CB92-5BCC-4517-8D91-A78750BE84C9,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +DDA00EE5-2D43-4513-9C93-27ADBA800351,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +3D83EEC4-7A0A-49C4-B374-258E5E037AFD,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +6265C5EE-FC87-42E5-99AD-BB7A4982E5A7,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 
+33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +33A693BD-93D0-43E5-8EA7-49E17A5597E6,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,4974137D-7399-40E7-83A0-6AE21222AB35,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,7E36A7A8-3FDD-4374-912D-F10EEBBB32FC,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,04008AA6-FBC2-4457-B118-A16602EF369B,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,9AAD9E9F-D28D-45BA-BCB0-B01E15426C8D,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,AFE6A522-F698-4D15-8DA7-806E1C360305,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,27224C64-9FAD-455E-8F81-44B0F3166E56,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,9229C375-599E-428D-9C8A-813785CC1C6E,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,C777105C-972A-4849-B2B4-7D0C20CBB258,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,AA2700CD-A620-499D-B038-C07116B67B00,10883910 +391F22C9-E098-436A-B06E-F61D90880910,8944451,8FE04DAE-A33F-4100-A262-784BF23A0445,10883910 +4A9A80D0-D316-4E55-AC2B-B3CE1612A432,12050449,D685434C-F9CB-43B2-930B-00204D110035,14395052 +4A9A80D0-D316-4E55-AC2B-B3CE1612A432,12050449,519F9FE3-A239-4CD1-81BB-76B7807BA01B,14395052 +4A9A80D0-D316-4E55-AC2B-B3CE1612A432,12050449,CC3F9605-45B5-4C71-B6E0-0931F7B45B33,14395052 +05529857-578B-4EE1-8F40-D7174FE0E286,12050449,D685434C-F9CB-43B2-930B-00204D110035,14395052 +05529857-578B-4EE1-8F40-D7174FE0E286,12050449,519F9FE3-A239-4CD1-81BB-76B7807BA01B,14395052 +05529857-578B-4EE1-8F40-D7174FE0E286,12050449,CC3F9605-45B5-4C71-B6E0-0931F7B45B33,14395052 +72778E4E-A67E-4093-9F2F-EBAE1DFC9D37,12050449,D685434C-F9CB-43B2-930B-00204D110035,14395052 +72778E4E-A67E-4093-9F2F-EBAE1DFC9D37,12050449,519F9FE3-A239-4CD1-81BB-76B7807BA01B,14395052 +72778E4E-A67E-4093-9F2F-EBAE1DFC9D37,12050449,CC3F9605-45B5-4C71-B6E0-0931F7B45B33,14395052 +7D163636-2ECA-40F6-ACF5-AF91EEE34073,12050449,D685434C-F9CB-43B2-930B-00204D110035,14395052 +7D163636-2ECA-40F6-ACF5-AF91EEE34073,12050449,519F9FE3-A239-4CD1-81BB-76B7807BA01B,14395052 +7D163636-2ECA-40F6-ACF5-AF91EEE34073,12050449,CC3F9605-45B5-4C71-B6E0-0931F7B45B33,14395052 +10B4D999-DE88-45BA-B8C2-51A9857DDA20,12050449,D685434C-F9CB-43B2-930B-00204D110035,14395052 +10B4D999-DE88-45BA-B8C2-51A9857DDA20,12050449,519F9FE3-A239-4CD1-81BB-76B7807BA01B,14395052 +10B4D999-DE88-45BA-B8C2-51A9857DDA20,12050449,CC3F9605-45B5-4C71-B6E0-0931F7B45B33,14395052 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 
+B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +B66D127D-332E-4908-8B4D-719A3B282ECA,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +8276410E-3FE0-426A-A401-947B9EED64BA,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +E5FCBF18-FFC6-4C9B-9461-EBCEBE6DC87E,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +A1A11BC1-E8E0-4DEA-B328-902C0241D90D,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 
+68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +68DABA7D-6290-4B1E-8016-726E8D99B128,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +731A122F-5788-48B9-9864-BDE3937D9154,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +30CB09AA-C469-449C-ADD4-68DB95C8BA3C,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 
+1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +1CF479F3-49B2-4F5D-AF49-59CE733401BC,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +BD432715-9B47-4C8D-A867-042FDDBA3D3D,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +B0761D16-614F-444F-A2FE-4E3702DBA4B7,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,2E967F05-C9CE-4846-A59C-E47868106EEE,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,2FB83D06-C191-473B-81DF-6CF29C0BF5F7,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,E9F0C2AD-4637-465D-85E8-8172FD587B84,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,D103646E-47B2-4403-A023-73303461C4B7,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,A7F9E130-2DE9-483D-9587-86999F8DF0CD,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,217569AD-FA10-4B4F-B13F-03E0B41A9566,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,E1EF25D3-4AD8-49C8-95C9-C8FFF6050917,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,2F69B71C-1932-474A-89D7-B1B90C33147C,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,D4295B6B-8A32-4FD2-A4AC-ED52D284BCA6,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,83095C7A-1B0F-4A14-8FDB-2066B8789E96,15592992 +46541714-DE54-4B67-BFF2-C72E21A6DC44,12602449,493EC863-5908-44C6-A686-45D7404A5E07,15592992 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,80E9A115-66C5-4AC0-885B-3B988577A6A0,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,0FC881C9-44E5-4FF9-8AF7-77205E4AABE5,16906690 
+DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,750C230A-A798-493B-B82E-7C1BF40FF126,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,C24D946A-3D0C-412C-8DA1-2045F7EA6DB5,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,8ED8EFEE-7AFF-4DA7-9882-3ACF68D06945,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,C1DC057D-51B8-4DB9-808F-7CC51751A9E9,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,360EAA32-3065-412B-A7B7-48E589E52A2B,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,67866F72-8F04-460D-AC8A-A55DB9B5605E,16906690 +DFE61C5E-C15A-47F7-B81A-94BD259EF0C1,13968023,80AFC921-A87C-4796-BDDF-DDE87E50A7F0,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,80E9A115-66C5-4AC0-885B-3B988577A6A0,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,0FC881C9-44E5-4FF9-8AF7-77205E4AABE5,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,750C230A-A798-493B-B82E-7C1BF40FF126,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,C24D946A-3D0C-412C-8DA1-2045F7EA6DB5,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,8ED8EFEE-7AFF-4DA7-9882-3ACF68D06945,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,C1DC057D-51B8-4DB9-808F-7CC51751A9E9,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,360EAA32-3065-412B-A7B7-48E589E52A2B,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,67866F72-8F04-460D-AC8A-A55DB9B5605E,16906690 +90A11827-6671-4FCB-99F9-DB61731C18C8,13968023,80AFC921-A87C-4796-BDDF-DDE87E50A7F0,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,80E9A115-66C5-4AC0-885B-3B988577A6A0,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,0FC881C9-44E5-4FF9-8AF7-77205E4AABE5,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,750C230A-A798-493B-B82E-7C1BF40FF126,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,C24D946A-3D0C-412C-8DA1-2045F7EA6DB5,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,8ED8EFEE-7AFF-4DA7-9882-3ACF68D06945,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,C1DC057D-51B8-4DB9-808F-7CC51751A9E9,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,360EAA32-3065-412B-A7B7-48E589E52A2B,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,67866F72-8F04-460D-AC8A-A55DB9B5605E,16906690 +28B3FE03-0BB3-4D37-A086-30C213294473,13968023,80AFC921-A87C-4796-BDDF-DDE87E50A7F0,16906690 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,49771B49-FEB6-4AFD-8ABC-8C4DB2760764,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,8B57F185-20C9-4452-B730-870EDBA9BBF2,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,F596FB6A-4F8F-403D-BD32-B04DD50CB64F,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,B4DC346B-7420-408F-BF16-F588A37C6BC1,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,D3507516-3790-407D-865C-02C5F1961804,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,BB1DD92E-FAF1-4E39-840B-C3D88419940F,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,3D40E21A-0B1C-4A5B-A1C9-3AAF4D25CB52,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,DD0D86D7-94D5-4F0E-BD69-EFB255E8B111,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,9653D53C-AB04-4D41-9537-97B280F6DBD2,3915643 +D52FFBD9-F049-456D-B0A0-EA5EEFD4AE7F,2965482,E03223B9-DC84-4105-9B04-412E3BE2B7A3,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,49771B49-FEB6-4AFD-8ABC-8C4DB2760764,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,8B57F185-20C9-4452-B730-870EDBA9BBF2,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,F596FB6A-4F8F-403D-BD32-B04DD50CB64F,3915643 
+762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,B4DC346B-7420-408F-BF16-F588A37C6BC1,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,D3507516-3790-407D-865C-02C5F1961804,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,BB1DD92E-FAF1-4E39-840B-C3D88419940F,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,3D40E21A-0B1C-4A5B-A1C9-3AAF4D25CB52,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,DD0D86D7-94D5-4F0E-BD69-EFB255E8B111,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,9653D53C-AB04-4D41-9537-97B280F6DBD2,3915643 +762F7DB1-FF3B-4A36-BFAA-DD5315289EA5,2965482,E03223B9-DC84-4105-9B04-412E3BE2B7A3,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,49771B49-FEB6-4AFD-8ABC-8C4DB2760764,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,8B57F185-20C9-4452-B730-870EDBA9BBF2,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,F596FB6A-4F8F-403D-BD32-B04DD50CB64F,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,B4DC346B-7420-408F-BF16-F588A37C6BC1,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,D3507516-3790-407D-865C-02C5F1961804,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,BB1DD92E-FAF1-4E39-840B-C3D88419940F,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,3D40E21A-0B1C-4A5B-A1C9-3AAF4D25CB52,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,DD0D86D7-94D5-4F0E-BD69-EFB255E8B111,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,9653D53C-AB04-4D41-9537-97B280F6DBD2,3915643 +15F20CE4-172A-4808-AE59-AEB580064FA4,2965482,E03223B9-DC84-4105-9B04-412E3BE2B7A3,3915643 diff --git a/hlink/tests/input_data/households_a.csv b/hlink/tests/input_data/households_a.csv new file mode 100644 index 0000000..367083f --- /dev/null +++ b/hlink/tests/input_data/households_a.csv @@ -0,0 +1,31 @@ +id,SERIAL,NAMEFRST,NAMELAST,AGE,SEX,BPL,ENUMDIST,PERNUM,SPLOC,POPLOC,MOMLOC,RELATE +b5689d06-edd3-498e-8b5b-e04f2fa2f2a9,1062582,Catherine,Beebe,44,2,10,2345,01,00,00,00,01 +a7118f06-949d-4d02-be0a-db33a6f8f3a8,1061605,Frances E,Bird,40,2,10,2345,01,00,00,00,01 +85d089c0-b907-4d9c-95ab-c5fa4a3dd2bb,1029271,J S,Luff,49,1,10,2345,01,00,00,00,01 +cddd9455-48e0-4b48-89a5-9ee315e00087,1237122,John,Smith,26,1,10,2345,01,00,00,00,01 +8cb74256-6dfa-4d17-913a-59fa646c388a,1022156,Saml H,Russell,77,1,10,2345,01,00,00,00,01 +1f8e1a74-d486-44ad-8d5c-51aedf86208e,1025771,Charles,Robertson,26,1,10,2345,01,00,00,00,01 +61a1590f-1d3a-4666-8406-3d4aaf0770b4,1027559,John,Dickinson,42,1,10,2345,01,00,00,00,01 +92277f0b-1476-41f5-9dc8-bf83672616d0,1028383,Joseph,Shissler,36,1,10,2345,01,00,00,00,01 +322291a1-de91-439d-bba0-45fc2f47a2eb,1029335,David,Hall,71,1,10,2345,01,00,00,00,01 +136f7105-ff59-4eac-9d95-44b002cbb448,1031229,John,Decame,52,1,10,2345,01,02,00,00,01 +1138ab41-e234-4c72-b812-eaaf0fc5f76c,1031229,Nancy,Decame,53,2,10,2345,02,01,00,00,02 +066ea4e1-f340-4231-b505-ec7bb9a07103,1031229,Peter N,Decame,15,1,10,2345,03,00,01,02,03 +b7d96336-404e-490c-8c45-61f2287b52ff,1031229,Annam,Decame,13,2,10,2345,04,00,01,02,03 +24bdff6a-5590-4494-8e8a-ac4a549c8890,1031229,Sarah,Decame,10,2,10,2345,05,00,01,02,03 +c1fedaab-f026-4aa4-9320-e10f2432d539,1031230,James,Carney,22,1,10,2345,01,00,00,00,01 +43a6ebe5-752b-4054-818d-6f6f75cc89e7,1031235,Alfred,Dell,27,1,10,2345,01,00,00,00,01 +0d693015-2349-4363-9667-45036af7d0db,1031760,Chas,Syaex,40,1,10,2345,01,00,00,00,01 +1d586e26-aac1-49df-a2ad-fe0a385a26bf,1031767,Sarah,Russell,13,2,10,2345,01,00,00,00,01 +93b7ac89-f9db-49b2-a1f2-c189fecc14ae,1034579,Wm H,Hazard,29,1,10,2345,01,02,00,00,01 
+e51c36c9-570c-466d-aac1-bf380c9c20f1,1034579,Martha,Hazard,30,2,10,2345,02,01,00,00,02 +9250341a-8336-494a-bc84-2b803efe64c6,1034579,Willie May,Hazard,8,2,10,2345,03,00,01,02,03 +a70679f0-9313-4ef3-bf87-5dfe81beed5d,1034579,Samuel,Hazard,4,2,10,2345,04,00,01,02,03 +4715bbf6-d3e2-4260-9ddd-6aece147e5c1,1034579,Samuel,Morgan,32,1,10,2345,05,00,00,00,12 +77378570-5214-4ac5-8258-c5156e8b99b3,1034648,J Clauson,Mcfarland,20,1,10,2345,01,00,00,00,01 +6542b541-6e10-411f-9b2a-7c0b93b0aa68,1034648,Eugene,Mcfarland,18,1,10,2345,02,00,00,00,07 +396c4077-6a70-4a17-97fb-f8a0c06fdafe,1037015,Anna,Preston,39,2,10,2345,01,00,00,00,01 +7e9dde5e-3fad-4b2e-b367-643c0dc8cabb,1038208,Rebecca N,Alexander,49,2,10,2345,01,00,00,00,01 +f7d9e25f-c390-4222-ac24-4e93d72daa05,1038222,Martha,Ellis,37,2,10,2345,01,00,00,00,01 +24b7afa1-8c49-4833-8292-c545c85d3b89,1039117,Otillia,Zeider,34,2,10,2345,01,00,00,00,01 +4b416874-0c5c-4233-81ec-39223bc66f4f,1048673,Mary,Doyle,64,2,10,2345,01,00,00,00,01 \ No newline at end of file diff --git a/hlink/tests/input_data/households_b.csv b/hlink/tests/input_data/households_b.csv new file mode 100644 index 0000000..906259f --- /dev/null +++ b/hlink/tests/input_data/households_b.csv @@ -0,0 +1,27 @@ +ID,SERIAL,NAMEFRST,NAMELAST,AGE_ORIG,AGE,BIRTHYR,SEX,BPL,ENUMDIST,PERNUM,SPLOC,POPLOC,MOMLOC,RELATE +a499b0dc-7ac0-4d61-b493-91a3036c712e ,2484121,ANNIE ,FAUBLE ,26,16,1884,2,10,2222,01,00,00,00,01 +ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,2485245,MARY ,REESE ,35,25,1875,2,10,2222,01,00,00,00,01 +ad6442b5-42bc-4c2e-a517-5a951d989a92 ,2485245,MARY ,REESE ,11,1,1902,2,10,2222,02,00,00,01,03 +9e807937-de09-414c-bfb2-ac821e112929 ,2485411,JOHN ,SHIELDS ,21,11,1889,1,10,2222,01,00,00,00,01 +426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,2485411,ANNE ,FAUBLE ,26,16,1884,2,10,2222,02,00,00,00,11 +a76697d9-b0c8-4774-bc3e-12a7e403c7e6 ,2485601,JOHN ,COLLINS ,17,2,1893,1,10,2222,03,00,00,00,10 +3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,2485601,MAGGIE ,COLLINS ,16,5,1894,2,10,2222,04,00,00,00,10 +49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,2485601,MARY ,COLLINS ,12,2,1898,2,10,2222,05,00,00,00,10 +50b33ef6-259d-43af-8cdc-56a61f881169 ,2486481,WILLIAM H. ,SEWARD ,54,44,1866,1,10,2222,01,00,00,00,01 +952754a5-48b4-462a-ac57-e4a059a9ef98 ,2486951,ESTHER ,BIERHAHN ,40,33,1870,2,10,2222,01,00,00,00,01 +ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,2488461,CHARLES ,CLEVELAND ,45,35,1865,1,10,2222,01,00,00,00,01 +60a5052e-6d67-455a-a3aa-bb79560c7d8d ,2489211,SUSAN ,WILSON ,60,50,1850,2,10,2222,01,00,00,00,01 +0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,2489301,ARCHER ,HARVEY ,20,11,1893,1,10,2222,01,00,00,00,01 +65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,2489831,ELIZABETH ,MC LEAN ,42,32,1868,2,10,2222,01,00,00,00,01 +72cbe5fa-f558-4393-8423-1842fadf7f11 ,2490611,MARY A. 
,FLEMMING ,73,63,1842,2,10,2222,01,00,00,00,01 +bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,2491501,THOMAS ,GRAHAM ,64,64,1846,1,10,2222,01,00,00,00,01 +a7b10530-b7c9-44d5-9125-c603f392d6d3 ,2491801,EDWARD ,DEKAY ,35,25,1875,1,10,2222,01,00,00,00,01 +1e635c1c-7faa-4270-acf3-a22635884b90 ,2492069,NATHEN ,THORPE ,74,64,1836,1,10,2222,01,00,00,00,01 +d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,2492741,JOB ,FOSTER ,26,18,1884,1,10,2222,01,02,00,00,01 +2a35bae5-3120-4e2c-87da-694d4419c9ce ,2492741,JEZEBEL ,FOSTER ,22,12,1888,2,10,2222,02,01,00,03,02 +94460fc2-954b-469d-9726-f7126c30e5e2 ,2492741,ELIZA ,GOODWIN ,39,31,1871,2,10,2222,03,00,00,00,06 +620b6ebb-82e6-42db-8aae-300ca2be0c00 ,2492741,MARY ,GOODWIN ,17,7,1893,2,10,2222,04,00,00,03,08 +bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,2492741,JO ,GOODWIN ,15,6,1890,1,10,2222,05,00,00,03,08 +7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,2493841,PHINEAS ,TAYLOR ,39,29,1871,1,10,2222,01,00,00,00,01 +a0f33b36-cef7-4949-a031-22b90f1055d4 ,2494397,MARY A. ,LORD ,54,44,1856,2,10,2222,01,00,00,00,01 +1a76745c-acf8-48a0-9992-7fb10c11710b ,2494421,E.B. ,ALLEN ,21,11,1889,1,10,2222,01,00,00,00,01 diff --git a/hlink/tests/input_data/households_b.parquet b/hlink/tests/input_data/households_b.parquet new file mode 100644 index 0000000..63c15f3 Binary files /dev/null and b/hlink/tests/input_data/households_b.parquet differ diff --git a/hlink/tests/input_data/integration_a.csv b/hlink/tests/input_data/integration_a.csv new file mode 100644 index 0000000..e330eb0 --- /dev/null +++ b/hlink/tests/input_data/integration_a.csv @@ -0,0 +1,4 @@ +id,serialp,namelast,bpl,sex,region,age +10,A,Name,100,1,1,1 +20,B,Last,200,2,2,2 +30,B,Lest,300,2,2,3 diff --git a/hlink/tests/input_data/integration_b.csv b/hlink/tests/input_data/integration_b.csv new file mode 100644 index 0000000..c256330 --- /dev/null +++ b/hlink/tests/input_data/integration_b.csv @@ -0,0 +1,4 @@ +id,serialp,namelast,bpl,sex,region,age +10,C,Nameish,400,1,1,1 +30,D,Last,500,2,2,2 +50,E,List,700,2,2,3 diff --git a/hlink/tests/input_data/jw_blocking_test_a.csv b/hlink/tests/input_data/jw_blocking_test_a.csv new file mode 100644 index 0000000..b0b98ae --- /dev/null +++ b/hlink/tests/input_data/jw_blocking_test_a.csv @@ -0,0 +1,7 @@ +histid,namefrst_unstd,namefrst_std,namelast_clean,sex,bpl_clean,birthyr +0a,maggie,margaret,jones,2,27,1923 +1a,marge,margaret,jones,2,27,1923 +2a,margaret,margaret,jones,2,27,1923 +5a,margaret,margaret,jupiter,2,27,1923 +3a,ginny,ginny,jones,2,27,1923 +4a,m,megan,jones,2,27,1923 \ No newline at end of file diff --git a/hlink/tests/input_data/jw_blocking_test_b.csv b/hlink/tests/input_data/jw_blocking_test_b.csv new file mode 100644 index 0000000..b492197 --- /dev/null +++ b/hlink/tests/input_data/jw_blocking_test_b.csv @@ -0,0 +1,2 @@ +histid,namefrst_unstd,namefrst_std,namelast_clean,sex,bpl_clean,birthyr +0b,margaret,margaret,jones,2,27,1923 \ No newline at end of file diff --git a/hlink/tests/input_data/male.csv b/hlink/tests/input_data/male.csv new file mode 100644 index 0000000..3c4c4bc --- /dev/null +++ b/hlink/tests/input_data/male.csv @@ -0,0 +1,2 @@ +bernard,barney +bernard,barry \ No newline at end of file diff --git a/hlink/tests/input_data/matched_men.csv b/hlink/tests/input_data/matched_men.csv new file mode 100644 index 0000000..4d7db3d --- /dev/null +++ b/hlink/tests/input_data/matched_men.csv @@ -0,0 +1,527 @@ +serialp_a,serialp_b,histid_a,histid_b,match +5831865,7332795,D04F610D-D729-4ED5-93F5-969893BD91F8,FF92566E-A5FB-42DD-9CB1-74491A47A738,1 
+9769888,12034779,EFF9C201-295D-4431-A0C7-E558957FCFE2,6314FA0C-ADCC-4364-AA46-B6E9FF3EE86F,1 +8850502,10865485,2925BE8A-A560-4C16-BF33-E63BE7A01038,BC4B0412-A3CE-4717-ABAB-A331F3B12D0B,1 +1415078,2063748,6601DBC9-2EC2-4D1D-A707-C5AC84DFD058,962C4844-C250-429F-8B59-1AFA69CCC666,1 +4618056,5827784,FB9B47DC-403C-486B-90E1-B10C9C39E739,58E4DDC8-F7EA-45CF-9548-81DABC80D179,1 +4771041,6008777,AA467116-33FB-4C12-98FB-4DA21E3C11E0,F0A2CE5D-EB65-42A2-A37A-94B7DF9E3676,1 +1530717,2132013,6C651C14-7502-4BDA-A9AC-C029EC476DE4,87A4F72E-99E9-4434-A599-2A6C4BB10211,1 +8484167,10415056,DCF89BEC-4507-4344-88B2-BA5AAFF18D33,99461DA3-3363-4C7B-B3F2-A472FC15C074,1 +16404564,20934766,9EA6ECDA-B3F8-419D-8913-52624C8ACB3C,FA171C32-C263-4BC4-A9CD-DEE46E766A2E,1 +1503157,2106208,44E35B89-A479-4B21-99FB-CD83D1744EB3,1B3D0819-5A8B-43D3-B8ED-529C0971C734,1 +642062,796398,D18561A3-4AB8-449C-9E45-B88AD455A74D,E8B532F0-8D6A-4EE3-A081-52A9443B50BD,1 +1112870,1593260,66C469BD-07ED-49FA-99D1-ECAF7098690E,D391F0C4-4BE1-4D53-9A9C-38AC5726B68F,1 +14351793,17909772,D39A6D02-4399-4B7C-81F6-4A6EB0CF8E4B,623F31B5-1486-48EE-BD36-904C4AF7CCEF,1 +7853554,9612188,22E576EC-0514-4448-8FC4-E91FAF0D89AA,BB3C1A1F-1318-48FB-BD45-FA6A7C5FFBD8,1 +8897924,10873579,0AFA2C96-2A76-4E84-864B-25A86326BBC9,3CAEBDC7-915A-484C-BE21-2E56B2321085,1 +9339632,11795787,6A59725B-A73B-40AE-A81A-77308E3217AD,14295EDF-59AC-46D1-90A5-6039C2A4E66C,1 +8939944,10935120,AC4B590A-FD02-490B-8316-281AB249D529,6FCF78DA-FD38-46F5-9DE2-D14EB4639F4F,1 +12580842,15529691,145F6B34-35B9-49B6-A2D1-64A9A674D2D3,244655A5-646E-41A5-9E3B-DBE10DF91E79,1 +3525748,4545460,DC805DC7-0462-4822-BE4D-093F2C522D15,2765BDE0-0F4E-47FF-9B61-65A041080CDF,1 +216802,205980,BB7CB31F-E775-486B-930D-A907EFC9763E,A23110DD-E630-464B-A057-E7B2D0540E3D,1 +2383764,3215321,990F399E-4A7B-434F-BBDD-C45EA6B4E232,AE0E1678-6487-45AE-AC95-E1237752F987,1 +806439,1487881,D9BBCC7A-DC28-4B3A-A712-2872F98B648C,C7F82244-BA1E-40DD-A6EA-E75166B26505,1 +8675295,10445388,78B67EDF-F6BD-4ABA-B2A9-DB4D76517B39,F414FDB0-4409-41CF-B8C0-53C0F7CCDD64,1 +7914559,9656906,F1BBF770-E4D1-480A-B96A-DE3C4948DC9F,9831D948-76A4-4D5E-9D5F-479B136DA6FB,1 +6984203,8407305,5998556E-8B2F-41DB-B019-FBE1B26E11DF,1635E0BC-A2DD-410E-8AFF-2EE49E4B11C0,1 +8506478,10396161,1D3CAC1B-D197-4A3A-B2D0-B80899AA05EF,0957006F-27D5-400B-9902-5E4377212C33,1 +2654810,20359170,EB30982B-12D9-4661-B5F8-B2190E357AF9,4727719A-AED1-45D3-90B3-BB19A710607B,1 +16783239,21229946,E31B93B2-CF1C-42A7-A7E9-5C912AFD33E1,34011ACA-358F-4902-8BD8-DB0CEE31E3D7,1 +2653164,3459878,192E4127-C5CD-4AE7-9C6E-1BC18E43DB05,DCEDF7E6-DD81-484F-A05A-B61BA795A21E,1 +7114480,8769255,AE9B1643-43F4-4158-A570-B758E316B781,57CB695D-A6EF-4745-881C-923069033354,1 +555677,542825,ABA799B3-F976-4F41-B5F1-451609843E32,999C48D1-1F10-461B-86DC-029D6434CE3C,1 +1580946,2219186,EAB642F9-57D6-4393-A7D2-8E95E9380DAE,925630DC-5BBA-4F1D-82FE-1180B95B8653,1 +3024066,3994203,D331DFAB-F6EC-41A8-B0D7-D5A69E09143B,8289BD6C-258B-43BF-A83F-DD67B043836C,1 +6353572,7850535,A0DCCD3C-795E-40E7-A488-982E14BFEBFC,AFD35B21-F7EA-4DAA-BE25-28DB8CF57357,1 +15646077,19581762,14233F2F-1620-470B-A41A-D560ACCF1E7E,4680DD95-2658-4477-AF93-AC2E06BDB0EE,1 +5723754,7120776,BF28F8BA-56F6-4CB0-AA4D-F3E7FA4DD41D,4658A14C-74E3-4CE0-AC03-C023A3E383A9,1 +9048289,11054730,219A6AA7-6FF9-4ECD-9022-1F6841BB6607,D52312A9-82D5-4649-91EE-0C782023A160,1 +10553059,12073408,C05E6D63-95F0-44B8-BB87-835594C3CA64,22681E59-149A-4277-9396-C7AE21C61A4B,1 +16614701,20568402,6643C4B1-35EC-4B8E-8EA0-9CD707E0C4A3,93C369BA-831D-4568-8731-65203745FCA1,1 
+9003295,11106086,C66C4F76-80F8-4E9F-9B72-012170166EF4,271DA1B8-2D3B-4B45-B57A-24E2F49F3E7F,1 +14514427,2991821,96C99999-E90F-4F38-8DA9-DA8519898346,2F3A39D3-35A7-4700-8E84-34EB563363F1,1 +1199410,1776599,F77DFA6F-A245-4255-969B-B4664ADC5AD8,A6367438-25C2-49A4-B945-3CB93CD226D4,1 +1329110,1891516,62469153-E6E0-4E77-A618-3CCA6A2369E3,E8E3E60B-E960-42F5-A969-1D84879286E2,1 +7418667,9152191,9D9BD70F-27D4-40A1-AC44-6883CE762E0E,2C3DF83B-7A2F-40DD-A986-14A1FCC0EA0D,1 +15627741,19614472,F5CA1B83-1961-43EA-8CCD-9DD564EC4479,E062D62D-3100-44D7-A70F-A78D425E5907,1 +8236865,9936214,3264D5A4-63E0-4D54-BE64-4453AF9EAA70,6D5CDEAC-E7D3-4D57-A99E-BBA8B30F2CB2,1 +6282905,7776240,150C4DCB-2930-44F9-AA76-324D03E5B692,8455E057-AC37-4175-B8CD-873301CABBF1,1 +1525835,2135111,C5697438-1566-429E-9D5E-91655802B20B,1EC5DD7E-43B1-4308-9A50-868644C4CE90,1 +7601022,9253267,8012F7DF-033E-4D5E-9A43-F59F0B20E36F,5CB2CC45-D11D-4ACC-983C-DC76BCDE2018,1 +14107001,17159306,33FECEE4-B637-416C-8F18-4DFD81A9D012,0DA1C5E2-8AE1-428F-9A2E-F2614A849774,1 +14524333,10990083,596D1029-185C-4DA2-A80D-7F5A627AB3E2,B151CFB6-F6D1-4FD9-A99C-9E69DCABB2FA,1 +7289626,8936770,3DC794D2-5733-4392-91D3-99944D65A1F0,5E001D41-A3EF-4841-963B-560C8173987A,1 +11177399,12096571,A3BA9583-2157-4BD6-97B1-7BD7DE7A890A,D9D7AD46-B675-4FA4-ACAD-7AFEBA0A1DB0,1 +14883709,18582026,78BCD892-6225-46E0-A287-BFB28918493F,4EDFC519-A125-4E00-BC6B-4318BEA8F361,1 +11690744,15051962,71AD695C-7207-4005-B845-D8F452173531,F6442E92-4BA0-4E58-8950-61A71142E063,1 +7960512,9791991,54861BAD-9D64-475B-992B-CF50CBAE10E7,6043C719-ED89-4F1C-8607-B8B17088907B,1 +697631,1336643,9BA6213D-485A-495D-B7A2-5CD5BA9A4DAF,308C6CDE-C991-4646-958F-F0A0B8D95E99,1 +3260691,4776145,1126FE36-FC68-45A2-A41A-EB0B1589363A,EFC060D8-4D23-4F20-A29C-A71F34CA92EC,1 +9762400,12031005,22B9D67B-0277-4D83-8EB1-65BF6DE982EA,2221B050-FBBB-4601-A1AB-6D21ABC5EE25,1 +1137215,1643932,BC47FAE1-2FF5-4063-B663-4A33AE614FCC,AD876857-98B5-4F8B-9B11-076531E2EDA7,1 +2638430,3448337,55A5326B-7ED7-4657-BB38-001673D4CFA6,02FF50A0-0658-4348-ADDF-43042A3FC69D,1 +8902832,10949223,D33C495C-EE7B-43DD-94AD-DEEF731EFE51,2E44D096-54A0-491A-91C2-20F3D8874520,1 +15752235,20057346,6E73A049-EF73-463D-865B-D95B361A9915,7335667D-2BCB-4D3E-B095-1E70B8C25EF2,1 +8947526,10886994,8C52C9C8-3CC4-4B65-8DEF-F12DC34B84FA,BE8BD719-518A-48C2-90C7-2D116198A61E,1 +2639748,15829963,92F70BCB-0539-4C67-9FA2-3A288E119859,BB67BB10-A9B2-450F-8182-2B5A2577AA91,1 +4099229,5224571,41F4A172-17EF-4142-AF4F-80A58E1C297F,5E779982-C2C9-4DF1-9871-0EBDBA02B845,1 +4204785,15143714,B140A5EA-0779-4EE6-A33C-DCDF7CED499A,C5973F74-2C84-4F36-8D9F-C5F87258DF20,1 +454626,732348,5B1359AA-DACB-465B-BE17-D33F06DDE6FC,95FBFED3-E860-442C-86FB-EF786F6DFBCB,1 +9469404,11605921,11B783DA-6404-4F5C-B5C9-6FAC8F7F1574,BE612DB7-331F-475B-8D10-E1B4BB39FD92,1 +1564480,2245900,504E9860-0B63-4EBC-BBBC-B7546CA61FC4,37430DC4-5BDB-4D57-B107-24AB08D13E38,1 +16718135,11995202,482104C9-7D55-4715-AA51-368DF8E66AF0,4CD269FF-3F88-4C70-BD86-CF4A65106677,1 +1338259,1918288,5B083876-F2BD-4283-B5A1-201F6A89A880,CA505067-972F-443E-84EE-1FFB7D09FAB6,1 +215821,297781,11175E38-92B9-4473-972C-0B55CB0F3B7B,EEAB2BB0-AF57-466C-963F-A8486D7A1B24,1 +2656290,3485400,28F1894E-5855-4449-9695-F2FC07ABC620,82945EF0-A5D7-4C82-A541-52BCCBA0BEEC,1 +16758015,21201562,EF2CEE02-33F4-4FD8-AC4A-BE9C5EF5250F,B2F668BF-F86B-4C7E-8B8A-CC3DE0333B2E,1 +6156402,7639030,8742254C-CE97-44D9-924E-3A66B1091AB5,36AC0D1B-D163-445F-92C0-903643C6C2B9,1 +5389750,6599420,885A71BB-3D13-4411-A8AA-918097A76FDD,08A2B9F4-F061-4523-B54D-3B3DD6C009BF,1 
+6454118,7961189,A110B490-D166-45AC-9209-52D04A0A6FE3,A592F76B-C678-43A9-A785-E1E2A1B7AAFC,1 +3966141,5049644,9C55697B-DC9A-4488-8F37-68FF7CDEE716,EC34EEA3-ED05-47B6-9B82-9B984F6F5844,1 +14237711,18017052,5F4AE291-014F-4DAB-A7F7-18EFA7CA0E88,B1D7EE89-2683-45D5-A566-86501599169A,1 +8637976,10521154,F32C28E3-2849-4DFC-AA66-3AD5A58A52CE,3CD3C238-C073-4644-B8BD-F310DF76237D,1 +894675,3499379,2341F9C1-BEB6-4620-B0C9-C7E224A32007,F2DC3F55-04AC-442F-ADE6-ED846B339F24,1 +4509662,5678541,F212D147-D37D-41E7-86E1-FB1B28D55200,22958292-23F0-4CDB-8947-677995744C54,1 +7375118,9074001,AED35702-6420-4EAD-9FFE-37454D7800D2,B98E5D4A-5573-4912-BAFB-082E1B789291,1 +8480039,10369933,8D557182-6679-401C-92F1-60758F875F2E,BD4AB70C-83BE-48ED-A2CB-58B819A8A0FE,1 +307927,402522,6A163F61-0D32-4C51-88FE-D5CB2D8D9168,6F2B1E6F-6220-4605-BC5A-98A84D6A6FA7,1 +9343439,11815757,A0F10DA4-1780-48F5-A681-68976DE2D4C2,8C25E0BF-1579-4DF2-B436-F917D4219D79,1 +1150363,1665705,031205B5-1157-4AD0-A303-45193DC4C988,CA9B1418-B0A3-40C8-A11F-3CF208D707BC,1 +4231553,5368262,A40624FE-39C3-4C18-BB7B-E761291891ED,BE50410E-F7AF-45A8-B79E-8A3F9E37D229,1 +9029666,11137788,83C78E4D-D19D-42CA-9E47-0AB682FA847F,44277136-41C1-4F41-9B35-732813F42CE8,1 +891387,971256,A4F80788-9BED-4441-81CD-AFA52EAC85F6,7CFF9E35-93C6-4444-B9A6-0F8779335EDF,1 +8931566,10972674,0F906455-2A3B-44A3-B044-F8BD7CF110A4,898B07E4-2F05-48B4-AE41-F38E8D419E8C,1 +8912444,10958392,69684FC6-9600-42CC-ABF1-5030EBDFC3C8,FD245079-E6A8-4EEB-9443-FE2300546B60,1 +656887,9532664,1015278D-7E83-4D40-B676-7EB0EC39219E,509EFF61-0D2C-44E9-8E82-B01620E189C6,1 +1083673,1587971,22B6898A-1118-4CCF-811B-998A6DBD3750,C751B428-CC5A-4B59-921F-BA19D3BD73DA,1 +15501046,11146995,0D185BEF-E7FB-4029-B37A-47EE680A8B67,9E24B9C2-8D40-417B-A3E7-BF9505489A57,1 +9119616,11037964,50563806-AE23-4B66-A1CE-C5BA19DC7923,740C227D-0B53-4120-B7FE-38A0ADD9C30E,1 +9156509,11233486,707FF1F6-8039-4220-9F80-5398D836D5B4,0151ADBF-DCB0-42B8-B20A-5842DF0C71C9,1 +9066751,1123360,44E34819-CE03-4CCC-85C2-B089A5A10AEA,F6AC3D60-7631-402F-B869-57AFA1ADEB0D,1 +894203,970770,89C45EE0-8246-4C7A-8DAF-0FE8403AFCE0,69E3C7EC-3D8D-46FA-B099-C111E65BCCAA,1 +5695700,7443881,DB83BF39-57A5-46D0-8875-902D13B16A98,0DE2ADA8-0ACF-42AF-BD3A-2B40F3749416,1 +1428168,2015286,5D339508-7FA8-489E-83A7-C752C4948C84,90A58E1D-660F-4685-B03B-9CBE6D1017C9,1 +4442268,5603510,503EE4B7-B7C8-489F-B410-25B2DBB54C92,5D360E6B-3D36-4FD2-8FCB-23CF8C546953,1 +16910220,1287613,A52B6EA4-E77C-454E-80A0-ACD8229E1B7E,6FB409E8-5853-4042-9BCE-487B4AD8F47A,1 +8907860,10963802,C39F8BBF-9641-4E27-831C-D85EFF5E9CFE,91EE7454-1764-48BD-B7A0-9BEED787E0B9,1 +1086587,1578978,FA4B6B04-6FCE-4C62-A6B2-CF86DA3ED2C6,8C3E6C4C-077A-4FF0-95BE-854785BAB034,1 +13293714,16106828,5BF3BE69-55EC-4B4F-8C1A-FD9E90C19C49,2EB01487-CF62-4D1B-A1D1-22397A30CA00,1 +14525162,18163426,1898312C-28CA-4FB8-AE76-CC56B624DF87,58E6E59B-3694-458A-91B3-07FA711B7D6E,1 +7352407,9048583,F9CD4C7B-6D43-4D3B-B264-0E83581A60AC,16F817BF-CFC6-42B0-A7CA-132F932C878B,1 +6635317,8688428,DF839661-039D-449A-A909-BA65476D993C,0116DB9E-9DA9-459F-8CC0-CF7115C359F8,1 +9571509,11595518,6444A2D1-72C8-4295-924C-25AAB4A12595,93EA6E81-116A-4804-891D-21E59FD5F57C,1 +5492325,6813987,E7B6A36D-1229-4743-9751-6A46DDA084B1,440704AD-4744-4783-A7B1-4DA945C69483,1 +416626,690256,8717621B-2624-4B60-A971-0187F25DF060,F04A960E-E90E-4E84-8D31-EE24D0AD650F,1 +539256,697707,B764A72F-B496-4A45-8BDF-48F92DA104B1,7DFB9E2F-0F14-49D2-BF37-56BDC11E045F,1 +988479,1176023,801F3475-892E-4491-BBAA-501952A87727,32A395D0-BCEA-4B47-87DA-5A8D6EBDE17B,1 
+9116123,11034447,3013808D-D502-4BC0-9E03-4D42BA9D3E43,58D0967C-C5FE-4D18-B0C5-3574A394D707,1 +7439971,9143001,2BD8D365-743D-433D-900C-280719E07485,826BA8AE-9DAF-4634-90BE-6E51BE85FBB4,1 +7238242,8909164,2F010E32-2DA2-404B-B1BA-9912B02E096B,BBE876D3-06CE-4FAD-8E67-B870B1AE03AD,1 +15970457,19755756,2464E671-7CB9-41D2-A60A-1ABFC604D8E8,F331EB9D-720B-41E8-8FC2-9BF5003E3927,1 +4786327,6034004,34DC7B13-61F2-4D1A-B229-EF26A00DCD7D,B52DA1E9-D446-43B3-AD4F-C3404D7AE055,1 +7234135,8804923,D81EE3A4-2D35-433E-9753-8F08A6C01E91,BAFCF1EB-097E-4A50-856A-59EF8FD99BBD,1 +2151303,2922448,0F56DA30-86DA-4254-82BD-A313C12B2BD4,09458F45-385E-4C94-B013-10DD81E52BF8,1 +12719545,16016061,DBAB2262-B652-4CEF-B336-D620A595AB8C,9987AE8E-BCFC-4193-8CE6-101C38E3B6A2,1 +1254540,1836108,D82D1A8D-B46B-40A9-BE50-A23F14E294A9,AFB4E8E1-69EB-4865-957C-1D9CB34B45AC,1 +5776840,7213770,99711C11-8B49-46F3-BA69-EA8678043C2B,155582C4-4E8B-4811-8434-47AAE6DFDD57,1 +12724497,16018468,C023CEEE-6487-4E1A-9F3E-78567709A704,1BD92EF9-0723-42F8-AF8B-70070A15F8BD,1 +9158980,11017175,B1619248-AF76-48F5-A02E-A3D571E7C16F,0F4D1F2E-2400-4E96-A7D7-0862AB1E7869,1 +16122434,15925342,0C5DCEA9-4006-4AD8-8440-0D9A9DFCC534,52E05AFB-246F-4E37-99F5-EF90D3DF2B50,1 +665581,856988,D008243E-92DD-49D3-8D1B-32F631A468BF,1F672F45-3CD7-474F-B431-A77B45C2D5CB,1 +4180878,5306724,96BC3F0F-3BA8-4170-BBC8-C0D9232B0164,790D98CB-4463-44C7-BD8A-7AC0A757D2D1,1 +1161522,1617081,64B931E1-536B-4806-B91A-FD3B532AF899,34C44E28-5146-405C-BE52-C69514C70B6B,1 +8921080,10879261,F156599A-A0D7-4955-B8D8-AC53E6D5CCF8,3BE704F7-F90B-41D2-9ECB-25FFBDA7B60C,1 +15353390,19223082,C92BEB0E-FD40-4B41-9D39-CA7CDA8FC693,7FCBF2A3-4502-4D9F-BAF3-44D4A2C5769A,1 +12698617,15873805,1AB239EF-E399-4710-92BE-B11F4CA687CE,704F715E-5596-4F6A-8BC6-0C56E5060D83,1 +6607498,8644518,DD5D70CA-D31D-4F0D-A01C-7F3C18C43D6F,1C9E4E05-B51A-4C30-8971-6E63D29C6C58,1 +15090341,19191854,CB02E22E-8C7E-4AB5-BA5A-18C00956C892,1F165DF2-0DCB-484F-828A-28F5DC24D63C,1 +1288802,1853941,4A35EE9F-9065-4724-8D12-19F0BA1ABE33,E040074A-5FA2-43B1-AF5D-C2916EA2F5B3,1 +13341231,17254380,00E5A16C-FBD6-4048-9AFB-656AD92B7475,A5273F4E-4DA4-433C-8828-AE27E13CAE71,1 +14535845,18275004,C1D409B3-703D-4FB8-9000-34964B9C38B1,D86F2F1A-3B84-4E4E-B0E1-5D9D6015451D,1 +4865116,6115763,566AD2E1-5E44-4810-A80E-AAF2B7E2650B,65597985-EAA4-4E8A-8661-C215271EAF93,1 +8733830,10568361,432881FA-A05A-4C0F-8D73-8C007096BE5E,CEAFE835-CE23-48D7-8E84-1FB64FFAEBB1,1 +2380559,3163612,929D7402-7007-4386-8837-04291E49E8B2,438DBC16-F1EC-4641-BEE6-147249A5B962,1 +2190829,3023924,FD4F5987-A95E-4E27-8AD2-A395E925B90F,516D3F79-A829-4845-A23A-4723F89F8789,1 +8901550,10877559,EFF778CA-1F51-40A3-993F-255F54E5F3AE,BA7BC4BB-776E-45E5-BBE0-90A5DA9A489C,1 +14189436,17794126,A5E58855-4067-4DDA-B15F-64168EE34270,FEFADB9C-1EB0-4BF2-ADCA-8DED08FC8547,1 +13115506,16747947,78924951-EECC-40E0-B0B7-E77DA1080E09,E5957921-5578-4BF1-9FB1-FFA479E758D6,1 +16226094,3479540,3AAF2887-7794-4F10-88DB-694D0DB2F15D,409DFE29-209E-47B6-AED7-3D891D5D1D39,1 +9104042,11014458,61EFCD93-7BC7-486E-8AC0-2715C58243C6,5F0637EE-FA3B-4563-A4FB-BAAFA25ED6B6,1 +6139759,7598889,7BFEB712-CCDF-4B46-BEFB-C253DFAD2BC2,27721E9E-6515-4AFF-9ADE-22CC7C101C2B,1 +7093254,8718373,CAA16729-8C44-4430-970D-3E3FEEF9A8B7,C76FEFAD-61BB-4FEE-8CA3-BE3F4F6DBE5D,1 +1089814,1600527,F45BAABA-5FE2-464B-84C0-0DF63029DA94,2C8756CF-4789-4888-BDCE-6B3B807BD9BA,1 +9045388,11155772,24C6EFCF-462A-43EE-AD56-F755A954271B,7A821E1B-8B80-4715-9A42-29936D3A12B7,1 
+6135873,7611478,545B6D05-35E7-4C8F-9BD4-8BA16805E109,8E66B149-1D1B-44B1-8E81-B0AC6E1BDC63,1 +1622343,2220120,FF1B240A-C0BC-408D-9431-86C89F3DF136,49CC9326-85B2-4C0A-93C8-8D8477B89EA9,1 +14185518,1889589,42BE7376-3D80-4A68-9A99-E6BCB0408F83,4BFD7667-7051-4939-9F81-86E004B67B10,1 +12558434,986115,3C5514EA-AFD4-450F-8823-2DBA367A855E,2ABEAA37-E390-4B58-B76A-385DDCB5D339,1 +1542534,2161710,19E1EF37-E9B0-4FC6-8F48-85E0A2DC2A1A,7DE0D80A-01DC-4AB4-815F-514B0517685B,1 +16912578,21289272,1BDE5A5E-F1AB-4D8D-BF42-9AEDB1974540,F6E35AA6-0C0D-4945-92F6-77F9C90D7F00,1 +1445788,2019416,DC7A8914-4A05-4D84-9D5F-66F45FF3C5DB,D0DC4E34-0E4D-4BC0-A88F-0C6FF25C85DB,1 +4626881,5845312,51E0ECBD-E73B-4176-98FD-E506C21EC273,29E12A9A-6484-4D59-8A5E-F46A04F9B9BC,1 +12180211,14653291,2964A1E5-6230-4F71-A8D6-C91705CC785E,A843D99B-4799-4C7D-9F13-4C783324CADF,1 +14522371,9043761,F4788ADE-2621-458E-88FE-DDCC30D0ECF6,92834E02-0FEC-475A-B022-C80D47F75209,1 +8496210,10398371,98F2A455-15F7-49DB-B75C-8EC94A8293D4,F67E0CAD-6770-444E-ACF8-6EA77E2D34D2,1 +1992613,2741130,F68D070D-C7F5-40BD-B34F-22923923BB48,127CEEDD-6D98-4AB1-8ADE-28AE410FB805,1 +491788,647589,5963E2F0-B382-4B51-8858-CABA7B425DFF,8ABAA9DB-7E50-4776-9EFD-0C4B4B7F0A54,1 +15518607,19396096,F60842E2-A212-4971-A29A-EF56F770146A,481AFE64-8862-4A76-9347-82925A687464,1 +2874603,3832707,24CC637F-2EC2-44F8-B2E8-C2A391C470FD,DB2B3470-EB0E-4424-8A2A-F18447C1120D,1 +8942423,10934635,EBC76DDB-3C56-4A38-AEAD-75A7BB36C33F,6C83537A-56F2-4DB8-803B-F627622B1B3B,1 +185805,238823,E5BF1858-73DF-40BE-95F1-8F7124A998E1,A3C47FFA-E6E8-4FF6-BC2B-C96FB1C58037,1 +12541111,15774944,80D6A61D-77FC-4BC9-9A1D-1CC85B4E21FE,42DCB503-1127-48F8-B3D1-33C502EE2B40,1 +3824125,6137561,8E162187-3FBE-4E00-9A78-80AD91F014D9,BBCBEDAE-ECB9-4F77-B94E-03B17E21F42F,1 +3818316,4908719,42859042-C2BC-47A2-9826-8CD1D06F2FE1,66EAEB55-A525-415C-8B3E-40CD9E2CFD64,1 +5665889,7343363,94441D2D-A813-4315-90BF-9AE422C1FC9F,4F844BDA-B094-411E-85DC-5F10FED5DAAA,1 +9776159,12053097,530A5FDC-4A50-4DD1-9379-2EA28E782D4E,8BA2C4C1-2F87-47C0-811A-9C19A6F7FE1E,1 +9452770,11411040,70B5BA62-1603-49F1-BAA2-96A78546B00A,CA57A54A-DC50-4143-9963-3F70207CD3CE,1 +663101,15880528,214DCB4C-46DF-4519-BA7E-BD7EF08BB2A1,9A50536C-0365-4667-877A-4C9EAC7ACB8E,1 +10644176,13223132,58C62D7A-3ABF-488D-89E2-9DC7634F8D18,3BD2C0D5-45ED-4DCF-8208-FC20F0B43CEC,1 +7093130,8718298,9BA22823-AC9B-49CA-B179-A562711B986E,CF8D4BB4-92E1-4BC1-B33C-58CD829620B0,1 +3000609,3978107,840B3CC4-0388-473E-942B-72EE376E6434,88597F5F-B29E-4ECB-BA00-8C9D5AE0FFC4,1 +6354937,7852140,CC4578F8-7424-48B0-941A-19A9A5217667,A929C16F-FBA4-4288-AF32-22364FFDEFEE,1 +12900377,16259496,834D00E4-B351-4834-851A-43296F9967BA,6FA854A8-E41A-4E24-8AFD-4F3AB1DAE026,1 +12602449,15592992,C9EDA92F-52E4-428D-A413-65732C91C595,29A809C6-55DB-4BBF-B2B5-366D4C038F77,1 +2670087,3513345,281EA8DF-A021-4F84-A451-1CFBB9719482,E7F8F7BD-9AFF-4883-BDD8-D6FF6D4B6309,1 +1435319,2006959,4BECF6F4-B8F4-48BA-B618-346BCA97FBFB,90FFCDBD-913F-4B85-803B-5D1337F0B362,1 +12680494,15944431,DBD42282-08EB-4B7C-A9FC-023D1E08734B,1AF424F1-F418-40B4-9381-3B6C1830FB3B,1 +577889,572307,48436C8F-68BC-4912-A94F-0405642B3775,329FAB50-D289-4E85-B1A2-F83FBFB9DACE,1 +15429335,15493194,66BE5F95-A2A9-4BCD-A89D-A46539169184,F954DF0B-47BA-448A-825E-BB21A64E2B12,1 +6997144,8401474,5FC8175A-AEA5-4781-97B8-43497250D143,07BA152A-B9C2-4B80-A184-B62307F8300E,1 +2669337,3511943,54E6551E-7D5F-4FB5-B26C-F4C947715CC2,84C2EB05-57B5-4C33-8F25-964BC26D9008,1 +2635744,3444092,EE1BFF9A-D5CF-4AB0-BB26-604CA921DC44,87D71238-7EB3-4F0C-9033-B05DBEC6A076,1 
+8273137,6729787,30079FE7-2D46-46B1-AC3E-38286D5C28BF,5A0113E8-37F8-4F2B-BBAC-FFA88F18BCD5,1 +3153044,3752513,A11030BA-F71C-4EF5-97E2-9B38713249B7,6D03C962-FA53-4444-A4A3-AC37BCFFCF5C,1 +1058050,1330004,F771E781-421C-425E-8259-5BCB68EBDFFA,5F534A0B-7C6D-4E35-AF22-C4812D0A2A72,1 +1041270,1262292,132A4884-F02C-4FDE-8984-A2DA444009B9,2E7E614C-F210-4848-9586-FEBF795F1E27,1 +2637316,3448653,2F2F96EB-69E6-465C-9011-7FDEB663EC8B,37AB89F2-78B6-40E3-9E54-61A5211D3E9D,1 +1097834,1558145,665CB79E-6E14-43BE-94A3-56375F82FCAD,49D04040-D686-46B3-B246-30F6BEA78A9B,1 +8700009,10645259,CADFFB03-6BDE-4BC8-B647-711451006499,B715CB34-CB35-4C85-9A0C-5ED410D75961,1 +3579758,4612689,9007BD21-A072-427B-96B0-5AD063A28E6F,7C5A1393-AF4F-43FE-8C56-8DDB02B262A5,1 +14632812,18392026,CE4CA55F-4B15-4735-8BDA-854AABA99F2A,F912C3B0-C29A-41A6-BCB4-C30B8227E9AD,1 +15595117,19625208,5DAD609A-DD7E-48CB-B1C4-1C62B1485527,F73A9CD7-3CF9-4F12-81A7-51BF80978365,1 +9765115,12050362,3C7C6ECF-60CC-47C7-AE3F-40A405AB6C54,5FD5E891-B19A-4261-81F9-ABDA2E13DC8A,1 +381141,484861,A62EA1A7-C129-4479-9257-CFC6FE8C05CB,AC351C4D-4D26-4A91-AA34-67FE8299DA5C,1 +882328,946416,113581DA-BBDF-4AD8-A185-8D6DA3C5D6BC,BC71DF8C-BB9C-4E16-9C68-F7504B04A826,1 +15942944,19661910,516E7F71-0C84-4F82-8F5F-C622661D78E7,3CA0A85C-0C6A-4630-839F-A4F50EABF828,1 +5178838,6437441,2B277195-17C5-4C72-8391-4A028644AD04,716ABD31-51C1-4F42-AC7A-8C40BACFA8F0,1 +6658416,8021000,D2007618-F896-4A94-8528-56ADD1CE0332,379892D7-5206-4775-95DE-74D9D010615B,1 +7287407,8976531,36CEB931-B9B5-4A2D-9A28-8A74E95F88B5,AC217887-1EFE-4C4F-B9F5-D33D6ECD7461,1 +15302227,18804932,AF0E6CED-6625-4A2A-8F4E-26EFC34BB8F9,AE0B40EC-2129-446B-99D9-DC38123AFA83,1 +13968023,16906690,4265309A-7FA0-443B-BAE4-9176C422033E,07D13BDB-D12A-40F7-BB02-4DC649F72B73,1 +9776232,12052987,A5A3EFA8-880D-4A03-9AB8-D5A545E9EF17,8828FB24-2344-4B58-B48E-7EC79BAA786C,1 +849980,892235,7BB3FEB5-8CC8-4B8D-A3EC-6F9C73074C1D,F30C57A8-C2BE-4D6D-BC5E-28345C9D8F79,1 +415982,689828,F7DDE0F6-14FF-4412-80F5-C076A7146ADB,544D8D95-F813-4945-BD49-3D998A4DF6C0,1 +15534319,19414684,2265E43F-0D79-4809-80D1-A77A54F51D3F,73A645FD-996C-492C-B28D-C1DB376DB757,1 +1637508,2316569,152D1F3B-F62A-4CAC-98DA-083FB3B3889C,845C0FCA-780D-4E0E-8A31-8EC527C4A154,1 +8473564,10356206,79681660-C498-46DD-9150-24E58A955751,EE52CF30-5062-4A88-AA22-2F6E658D7823,1 +16165577,20212936,82AC113F-D951-4BDB-9CB3-4C5637640D3D,BDA24F71-C286-4BBB-B0D1-8C4E39595179,1 +1438610,2069635,2BE55D91-7B84-4D2F-86C3-8A550DA4BD64,2CA96300-4E53-4F72-9206-67E848F32ADE,1 +15637135,19569594,642C1225-3F2C-4779-9F87-5C72CE59EF6F,FDC26CD9-153C-4600-819C-EC993015BB1B,1 +4936716,6157017,C3B07FB8-0415-4150-BD67-0D2C44FC4C37,70B2A01D-0E1D-4BC5-BBC3-44B4D63F5839,1 +8062101,9849265,07DDC895-D5DD-4736-8C10-3059201CC22D,2DE93B77-272B-4BF3-A5B2-BCD512D8ADD9,1 +6301876,7791931,A6188C63-1338-450A-8D0A-BB08065A7495,75467A08-AF59-4F39-8F0A-097392D3C50C,1 +7929649,9542794,71DCA589-EB23-4A64-9A17-BB09AF7E045B,073C80DB-C0AA-416C-85B8-63FBF1487CDC,1 +11839508,14526757,3D5DD4F3-A664-4827-B637-B61ED7562B99,A75B8EDC-425F-40FB-9408-D10B92A2B3B1,1 +9735640,11977305,845FEF57-BCB5-43A0-9DA2-F30B2B0A8C39,CE20D995-3B4B-4026-93C8-6116560B7C8E,1 +15609539,19596350,321682CD-77B9-4779-A652-AC68CF52FF0D,9F93CAD0-BEBA-403A-A05E-6B9FDCDF0DA2,1 +4378582,5532864,5DFFE896-F4F9-48C7-B360-F10AF358CA5F,9FD314EA-5DFD-45FF-B72A-B2D91CB2449A,1 +12624570,15829076,CB3AEC76-95FA-4AE9-953E-867266B7ACCD,F607B084-251D-4797-98DC-E67DFC1A6096,1 +12573456,18918814,E8298B15-E5EE-4907-9B2D-E430666FDE2F,30F75D16-3C84-4960-9F38-E63483B44B6C,1 
+4631632,5853569,243F8194-AD64-4FA0-934C-6C918A190AB3,EF814173-6520-40BD-B614-3A02055415FD,1 +1563190,2191626,DA172AC6-26C1-4A0E-B561-BFD2FF3D8BA0,9FE42641-6EBE-4408-8765-2F65B8C6E701,1 +849995,908928,4C398853-1679-45F6-A789-DFDDFF8C3DB5,329B3326-B30D-4F65-AE96-58879914A008,1 +4168893,5294947,342C8CD6-568F-4D33-934D-55B343D3417F,C3692915-9A11-4126-8EC3-30BA4D902BAE,1 +3773863,4842727,4C8F1670-BC67-422F-9CB7-6F339925D188,42932E93-ED2D-458C-AC2F-4BDFD7598416,1 +14563286,18255774,7016C2A4-F881-4716-961A-927C2C3171CE,4D3D0527-3D4D-44A3-BC44-443971734FF9,1 +5577317,6955679,A3FD4CFB-712B-41CC-9769-636EF34A64BD,0C4E3717-E94E-46F7-8C55-26F231E17B23,1 +12647790,15910666,40A6BB76-E2E3-405F-8EF9-06326FB47C11,ECB38060-36BB-41F3-85D6-8EA58B28C63F,1 +9121088,11063618,DFB3A216-EA41-4E8E-929F-042C4DB9F4BE,E88CA64F-4247-4273-A664-C81137528BE2,1 +654563,964020,82C12D56-F442-4E0B-8B43-0E06ABCF2C13,35E39483-2F4E-43BC-A467-1CE4D877D550,1 +12605192,15522170,234DBF9D-650C-454C-B469-DB74A2F0B3D4,348E3874-1CED-4318-BCC7-8B9DDE1C7BBB,1 +6274421,7738690,9DFC5CB5-A1F4-4993-BD14-357DEC7A02DA,1AF4E05C-E43C-49D1-B5C6-92285A836134,1 +16372978,20727098,5925526A-1B50-4D2E-A176-8BB305B723A7,AFE4CE7A-6EE6-4CBB-B367-F516E8504BEE,1 +665403,857064,60568B7E-9D35-4C07-ADB5-D2A2AFFE65E6,33F61FE0-6DA0-47BF-B96A-58FDE4AE3E26,1 +8894837,10949354,1AFC821C-C926-477B-A3A0-0B91CBAEF4B9,E53A2B8B-845C-426A-8E5C-CFD878FAA35B,1 +9708544,11915591,CBF96426-6B1C-440C-81AA-D2AC10CCC2EF,4E4DE1E0-31AE-4BBE-997F-782B84A18302,1 +4416407,5573206,FE746734-4BD8-4BA8-B2E0-2A7D8A3D948D,356363E5-76BD-486D-84D9-7D33407A3914,1 +1048493,1260630,9376BC56-6D59-4302-B340-843EA7FB2B9F,699CE5DF-C5BF-4764-940F-F01EA3F3515A,1 +1728502,2277865,8AC6C022-33A1-487B-90EB-AE9221197564,1033CADE-C59D-4851-9628-E61C9FAB4998,1 +16088292,6903879,88119A72-6CA5-4DBD-82B3-488D370754A9,BA035AC9-2C4A-483B-866E-A1C1935412DC,1 +6588277,8614280,342FA556-0E8B-4B53-A5D1-AD60C3E20612,877E7177-1B22-4546-BEB9-BACB20E31137,1 +1600568,2245503,F93A28D9-93CE-485B-B8C2-88EB0E1F877F,C8982D41-3DC7-4E3B-A421-32A7C3A779B1,1 +9765172,12035615,AD443A63-EAC8-4464-B7E5-57E9A071CBCE,1AA9401A-B55F-4A54-95A2-98D651AACBD1,1 +5355135,6647691,52471410-102A-45C3-B122-85356379BA5A,5EB772BD-BF76-4692-BCAD-5188BA3E11CF,1 +531873,15574966,977B63CE-1E7A-4212-A541-C54031B71B38,C9C4C93A-2630-49B1-9BD6-900E5696F9DA,1 +10875177,12620823,595610A8-00FC-4640-BA63-94F0AA8E17D7,D6AD868D-4FA3-4994-8D6E-EB2953C5B5A7,1 +4362383,5903662,65ED95AD-67DB-437C-A690-1A5C3ECEEB4D,1A737478-47FA-4DBC-8C4E-1B098FD616F7,1 +1524323,2132874,C1AE1503-B4BF-4F2E-81BD-383D8EB5117E,E47C261F-3F0F-4CFE-B5B0-E184D2AD0F12,1 +10834913,12595674,01E4EE24-30EA-4B46-91B2-2CB9CE886726,6F9B2806-93AC-4A77-AA01-9D65C0C65702,1 +490508,15476679,834739DC-95B4-4411-B8F5-C68CD90C1B79,B71BF802-92F7-4B08-82F4-54B8AE7D74A7,1 +10961583,11892580,73C236E3-6518-4009-8890-3B1D9B66B50C,627D7127-1061-40F4-8BA8-C3B8C8BF3A04,1 +7592219,15733245,257FCB38-68A1-4835-B0EC-BFCB93DC5653,6BFB8C16-ED87-4F29-98F7-49DAEF40BC36,1 +548554,534678,8735D65D-68C4-4852-8D80-28695DFB764B,56ED9DD8-5390-43A2-8BD0-C0D129AAF438,1 +12582975,15556747,1299CFCE-CD53-42F2-B5A6-B1A4EE3661E3,E3B86D7A-D229-4D24-BCA6-143D2F03C8B0,1 +9759365,11998965,024CDBF8-7DF0-41F9-9D47-E2D80833EA78,329E3E29-789B-4D45-9471-EAFEA8B45A0D,1 +900933,1528036,B728273E-4D32-4F76-94E4-11357B27ED21,0B7199D4-10A0-4871-8490-57A7427D9DE6,1 +5905485,7231920,6B6C90F4-32C3-4CD0-BA4E-8468C1D81D15,C88E57FE-8062-417E-83A4-EAFA993FA91A,1 +7568626,15487427,6797306F-A5F9-4304-92EC-3DE2D0D7574A,57CAB510-C549-4C71-9A2C-34161B67E089,1 
+7382584,9103328,C05B7F4E-E206-4AC0-9340-C851BD6B86E9,788DF36D-34F7-4C1E-B0C2-EE169347BCA7,1 +16893867,21307294,919E79EC-5042-47F5-A81B-2D25FA27D924,8E61F6C7-EF79-4590-8330-5E1B1C4C2851,1 +12441821,14681860,BD3F2A80-CACB-40A8-88D5-7C4D46269767,BA6B1B63-BC3B-4D69-9F4B-AC1F6CA108B6,1 +3541833,4445065,F6372D19-4B51-4103-8B17-AABDE267BB3F,B99E8586-6BF2-4686-A382-C12AC78848B9,1 +7440740,9143561,BBA7CAD9-B449-4992-9EEF-2951C628CCA3,6BB09A79-06EB-419F-ABC8-2AE9863CDBEE,1 +14559793,18223710,B9458604-0E40-4BED-A98C-C86FFD8CE56B,C633E2F0-4280-4844-B4AC-A8BFE3D4A5F0,1 +922455,1026988,0103C882-81F7-440B-BC11-1C17728E15DB,3E549863-DA5D-4B9A-9C89-9FEF3484E7A9,1 +4682793,5890346,F62E93CA-1C63-4A89-ADB1-E3BAE075550E,AEB370A5-1119-4A77-BEA1-3D36BBB388DB,1 +3757111,4826837,B1257FF7-7E9D-4BD3-8BB9-3DA9EAB555A5,1DDA5489-8861-4F8A-94E6-D012E72613CE,1 +4579648,5790896,70BA2A81-C7DB-4D91-B393-93AA360BB5D1,87D03814-6BF2-4268-B49C-3238C4D530CF,1 +6270180,7715637,D4FCEA7C-6415-4DF8-9D4C-3C17D004E651,10A9E215-4498-4CDE-B1F9-12D66E9EF244,1 +7227693,8797475,1F417DF3-6821-49E8-8265-AE234748B8A9,EC159136-E0F7-426C-B64F-E456B1C33CB4,1 +4537327,5819258,3F350652-CA82-4902-A73F-BE7D64BA842D,D71B6F14-B57F-468C-B323-D24383267558,1 +16908377,21314404,FF6539F9-5BAA-42B7-8C5D-E5602963A404,D2337725-141B-4E28-A218-DAF7406D4D41,1 +12638083,20390036,FB4D3D41-B61E-49FE-8D22-7C6020ADF949,22006DFE-9293-4DF6-A6BB-ABB0221CB017,1 +2212392,3052231,A83C0178-343F-4480-9EA1-F05F2EA50C40,2A85030D-8252-4B33-89AF-811F75F14FAA,1 +10897420,17728626,3B042710-867E-4326-B5CA-6C21F95747CF,4826910A-3595-4DD5-A4F1-86D68D86EF0B,1 +6517308,8636472,36F17C82-CBB9-466B-9E30-E1CD5D31F1B4,9DC7969C-016D-42A4-BE89-BCE8CEBFDEA6,1 +1141314,1655758,42BE5249-1F5F-4ED2-95D1-047F64C51B8D,46DC4541-7207-47F5-A72E-C44E35A5BFBE,1 +12174654,15443211,63637470-512C-4237-8B87-EA2CE8122C39,4CC91461-FFB6-4F8F-9E3D-88FE7B30D4F2,1 +4506820,5855966,17AC5989-A3B9-4BE9-B2C0-E666668F0F4A,653EE3EE-443E-4816-B035-D6AF9714B685,1 +1383260,1971838,46B0DE73-0688-4E75-8646-502229B5E326,7E333A24-3ABC-4403-B323-08DEE961D75C,1 +7739026,9678537,6DB8165D-F72B-4141-A3A5-616C82018501,CD5FFA4D-8D31-41CC-97D9-E13D49EFC787,1 +7346218,18201998,2597C2F0-CCF9-4E94-8D5D-F54C3296E087,4FA055CC-34E9-43D0-8A30-6F5468B5BCFA,1 +8019530,9819917,A5E20BE0-2B48-41BF-B147-C8F2209C2933,625F7891-3C5F-416A-AB58-155F483B31D1,1 +600161,598927,576B5A16-BB34-425B-B545-271B4D4A7C90,82FBC13E-E5AB-48D3-850F-88A58A4B7E50,1 +7038105,8464187,EA404251-775D-471A-9854-989872B7591E,80C129F4-7E52-470B-B93B-3761EB2E9F13,1 +1988440,2331952,7F816D26-0293-4887-BFA7-25A8A28CE528,37C7ED4E-8D26-4DAC-A559-216B15EB7E0C,1 +535800,517829,19FE7C44-491C-422A-BC38-A4D3709A5FDE,7053A584-763E-4277-BDBD-11D418EC6607,1 +15522957,19338692,0472C8AE-8051-4A44-99FC-5FEF6439D49A,7BEA93C8-66AA-4C36-91E3-FA308150A1A4,1 +13401320,17430014,54F7C72F-330A-47C5-AFD3-1C149D5225E6,D8FB8063-76FB-4C3A-8C23-EC1A5B7D4673,1 +12849612,16024625,263C8111-FE1D-4F50-B331-4C6EC55607E4,1C3E8ED4-081E-4339-BF1F-B51DC6576531,1 +1135845,1641539,83FCB590-10BB-4910-A1CA-D89B46E99909,8E76EFB4-7797-4728-A9A9-B5FD76E238E4,1 +14298476,17991036,CD87C945-97B6-488E-ACCE-F88214E1C1FF,1D80A40E-FAC9-4ED0-8FFF-299BEA64DD60,1 +12546326,15784210,F9965A7B-CE0E-460A-84E0-51F64FDB4716,4B724330-39D7-40FC-96CE-47BD60C292D4,1 +2648515,3477960,694FF588-AD48-4A13-B7AB-8704D0AE6F01,3C214F58-56C9-4540-8923-9AB49B2E70F1,1 +2499658,3282527,59DBDCFF-A243-4C2D-9D2E-C9E646AB303F,AE4DDF48-A615-4251-85DB-F68098294EB0,1 
+12557333,15742384,8C15B324-C35C-48B2-8161-A44EA52087C8,B3C9473D-2AB5-4D0E-91F0-F96FEA6FF6B3,1 +6145791,7631284,D26336F8-5710-471C-A9F6-B6FED29EB050,6FDC4CFA-CF4B-4BFE-AD96-3D2A501CF3ED,1 +5019373,6250522,05125E9D-5588-4964-9414-2DF66D2AB8ED,36D22E98-7527-4B27-A24F-9968D90479AB,1 +6992608,8415171,EF9E1E28-76E8-4BD6-8C16-37C0D680292D,B4B96DE4-2351-4540-A3AA-90F9CAD26B27,1 +9577978,11586338,0C0F9A5E-0419-4D68-B33F-A36088822CA5,95226A74-E32E-490C-8679-F84CCB82B870,1 +4394346,5549000,F908BCC4-BA19-4056-8005-EFC56862A2DD,5FED01C9-2DFB-4793-9AB3-F4DF90A51B51,1 +2663388,3500773,92247E51-FA11-45C0-8C53-117D6383E7C3,277220EA-808A-4CA4-B066-4638BCD0068C,1 +10590137,12082478,FA2A1864-F6F7-442E-8573-C06ABDC4228D,77EDF770-3568-4FE6-ABFC-7E25BA16B479,1 +8489247,10388514,2CD42CD8-3415-4CD9-B496-6E584B23FC41,39BDA987-0B26-49A7-B593-9813F665D3D1,1 +700369,1323443,470D317D-39E4-4EB8-AFA7-2433A16DADF0,D42AB0AE-15A0-464F-8F77-4ADD9974AA06,1 +9803925,12870842,574D46D9-5F47-4BD5-AEA7-7F117D8D777F,213CEBBD-F272-4229-A9BF-AF1013985DC3,1 +5712365,7108653,3CEF68F2-D422-484E-9095-0C78CF094F72,781639DC-8F2D-468D-AAF8-E759F8FB3168,1 +8363456,10046208,9443DE58-BC6F-432C-B729-E4D3B62F2BE8,03D6AF8E-3F3B-46BC-BA6A-847329E5F392,1 +1554645,2180505,797DB670-FE3D-4772-B900-076E0FE3A321,18312FA5-6E84-43A7-938D-04A3978FB6A9,1 +5022984,14830142,7FB9540D-2E92-438C-BAF0-FF18ADD0EE6D,8899950D-6421-480B-A8EC-156406C8E8C3,1 +9774693,12054258,4B5FD117-6A92-47A2-933E-7D61242496E8,5D9047F1-9697-43ED-A565-7D3E57A527B4,1 +5949913,7384802,0F54BB8F-6E27-4CFF-9000-D84D9CCA7955,E69C7289-AFC5-4147-B100-C69148C0EC69,1 +5693681,7085478,5ED97157-98A8-4AA3-A5BC-DC88CA25DD02,2F79A1E9-CEE3-4627-BE0D-8E52596E5F2A,1 +16228823,20217254,05314437-03A3-4B71-AF64-B47922601F44,3F0A3FB7-6BEA-458E-8565-F1C9747342D8,1 +74173,110688,120A8F5A-4CAD-4101-B2B9-CD12300F339B,52E96437-6B06-412A-88F0-0918B014CE9D,1 +5613525,6994269,A3BABC8E-17ED-48FB-A8C5-20473A94D2C6,B2563C07-E547-4127-9D98-642B86A2701C,1 +4162851,569744,E632A81C-C7AF-47E5-8F74-0A7159C9DA0B,F5504955-9B20-4E19-B291-7F4400583EAE,1 +10118760,13778378,96530647-9EC9-4509-90A7-B4F07D73CAD1,B3B62E36-9E42-4947-B46F-4B84CAF2D7D1,1 +8491551,10379258,B00C56BF-FAF7-4FC5-9A61-0A65E5ACF00A,32B54E9D-7F8A-4618-A852-3E6F4D613E59,1 +7078054,3473958,EB81A99C-DEFC-49D2-973A-8812EC9C3A52,DFFF7564-AD04-48A1-9701-85F0FEF1807B,1 +9298776,11890672,3BBB36A7-324B-4463-96B5-54043AC8A31B,BF20189A-FCD4-4381-AC97-2C6A7DFCAE7A,1 +8736271,10567539,1A57A764-8DB4-49D2-8F55-DCB86B6BAEBF,962AD418-82C5-4360-B023-D6EEE2E1533C,1 +1858767,2826952,251EA87D-CE0E-4C70-918C-894772ED89AC,6749675B-B9AE-4F66-9FF2-CE4B0A79966B,1 +1500877,2085601,B32A23FA-5677-40D9-ABEE-2E7C2FF421A7,140243ED-01F6-4BD0-A18D-A156A5BCFEEE,1 +1577783,2218578,0761DCBE-FD8C-406B-9BC5-0A4E6FE434C3,1FA985EC-22ED-4E1C-8CF1-9E4B7D83152C,1 +1503709,2091814,3AC24A55-B2C0-46CF-8CA6-8D996E0228B3,FBFF2FA2-5336-4CF2-81D6-48ACF80ADB9E,1 +6446729,7959496,5F72F40B-F77C-4B5E-8825-E1F3184FE3E9,EE2336B4-EF2C-4DA3-A72D-55C94F29AD35,1 +9489474,11466740,100C0E15-C6B2-4D50-8C71-D2B6A9BDC6CA,5D36207D-F0AA-4325-9964-F38253A71F4B,1 +15614710,19605146,EBF70A46-173D-4C75-BDF4-9DBB0E9C2969,A8777BF7-D17D-4C8B-9C04-3C4DD70E3668,1 +640657,799198,5F2FFC52-5746-45C6-B200-8E5E3F1CCCDE,FD169076-2C1A-42DF-9F4A-5DD7ECCA7324,1 +15155957,15581290,1A7AC6F5-F460-40BA-96E5-D6E2F02E1E1A,5652BF89-60C7-4A8D-9921-49F2373CB1DA,1 +1702918,2395406,8F0D9DB8-422B-4131-866A-E2F08D62F88B,A1212988-BACD-42AB-BFC2-C62AEABADD56,1 +13378842,17234152,5E1F1DE2-D74D-4509-874A-DBDBF6B63F64,618BBCBA-7B2C-4936-9B28-7A8B67331BA8,1 
+6640334,8577337,1BA20DDE-F04D-4D73-891D-327E84048A72,E0DAC00E-2A2B-4777-9587-14B6F29AEB6F,1 +3925966,5410212,FD31FCBB-B507-423B-9CEA-C69FE205E050,02A65126-326A-4D90-BBB4-16EA368B0246,1 +9769610,12045459,64968D63-1DD8-4F5F-9316-5C7B48F0D8D1,EECEEB15-3D30-47C0-9F35-4BBA0C18C6B7,1 +7922924,9670972,3619E948-484C-4053-B8F0-ABE557B380D1,A980EC78-DEE5-41EE-91EE-5E9602A7D62E,1 +651487,827472,9772B06E-1040-4B85-8F73-84D3F355693D,9C8DD536-9B42-4432-9EC0-BD0D9DAA17BC,1 +1042088,2853761,C9766A2C-AD3B-4529-ADEE-5871D8F8D849,1330EE1A-564A-4726-90C6-F84E96E93A1F,1 +12569464,5836712,383DCA88-3C8F-47C1-B4B4-2BA635DFCFA4,D46C918C-58C3-477B-ABF0-95881F164770,1 +1103085,1544460,0B3A0FB2-092C-4E2E-AC0B-1DE3A07E7932,16DDCB29-CDDC-41F3-916A-1E386E221026,1 +2965482,3915643,2B558E4E-3A41-49EF-8E37-A67A796972DD,2DCC36C5-5605-4864-86D4-2943DEE20B82,1 +8947776,10885240,E4139760-7655-45CE-B0BD-B2DB46B0A128,FA17BB11-D630-4D72-ABF3-5D81E528AE97,1 +4554841,5736728,13D396EC-16FD-458E-8E47-20B99A4A94EA,B71E5011-B171-4358-9D06-06F9468EA050,1 +10726769,12833401,D6EECB15-30E4-4D60-A4A7-01932DC78965,CB9D8DFA-6B48-4833-8135-D943C678A9FF,1 +9765081,12045672,2A5B433D-C30E-455B-83C0-3E8556A7D5E3,CAC24F8C-F343-4BA8-9E49-8BED6839B90B,1 +6374644,7875195,BFE73169-25F1-4D57-ABF7-9399230FA820,EB21297E-04AD-4D8F-B995-FF2F2086D7A2,1 +261233,352759,20F9BC4F-62BC-46A7-A161-1543F695A398,5A8F81B5-8A6F-4235-94A6-DDF9E5D9F97F,1 +2100131,2862713,EE911855-8E6A-41A6-B80C-63BCC6726DFB,E3C7D1E6-2B7A-489D-8474-1A64718C9D82,1 +8897680,10873036,6EE3AE60-C2E7-4357-BBE7-C9294766E23E,0D6F76E2-EE2F-4037-BAEA-F74871C4550B,1 +1343588,1925886,9923B232-DBA2-4182-8CD3-761AD38AC850,D397F1FB-9F6F-4B94-B6B4-17628FDC6D1C,1 +650589,822129,A679AD2F-CD39-429B-98E7-5575087902B7,3F394079-1BF7-4C67-9812-9BBFFD2B8902,1 +828187,1309805,00C89FDD-F1C0-4C4E-AF15-9616A25C21DE,38FA917A-805B-4419-9602-A3B35E9D5171,1 +12122834,14325468,25BBE22A-205E-4474-A474-2A46419BFFB3,3F4B0BB1-E968-4422-B50E-3FD76164AC8A,1 +14506402,18231426,5BF4CCE3-FDC3-4305-9DF1-11356666F8C6,A108E26C-0388-4340-B472-0D4E51197E4C,1 +7995380,9780235,6950016D-38C0-48F6-8EA4-15A2DB1DDB16,EFF1F32A-61DA-45A7-8FE2-9F7BCBAE4D58,1 +3469028,3922821,B2A36301-AA71-4C00-8269-957EB74E1384,EDC52E77-B882-404B-B783-D2CF5EB2E20C,1 +16402418,20935772,25F9BFF4-F839-4F61-821D-66AE0E1A0B16,EA912C76-7C85-4C82-8EA9-8806F348DF5D,1 +674998,20195396,8A2BB454-F1D0-4719-A328-9A168906B22F,A6EFA291-4F57-464A-A1FD-C2F43DFBDBB1,1 +1035282,1275744,73D0DCE1-8921-4FEC-B1A9-788D7613ACAA,8F71C024-2824-4B43-A973-8B50CB69EE40,1 +8450141,10322063,C9929F23-9CE6-4EAB-8E8A-BA9B51AF510E,AB9991B2-9963-489F-B763-70DD1E82F92A,1 +4662067,5883905,BADFFE12-AA69-4489-91A1-FCEF6C476CF0,62AE248A-7161-4AFE-89CE-85D4A5B73E20,1 +12676297,15848296,AB39083C-9B93-4F22-A657-0CB7164992EC,50D487BB-8175-4008-BD7B-A03ADB2E4404,1 +1051067,1270737,3182C607-DE65-4197-9D58-8A29227F5879,B862D41C-0A3A-442E-8C0A-176023B5DA47,1 +2132592,2873915,BE60B88A-FA25-4624-AC60-CC0A682AB20C,46A74D51-DFFA-407A-BEB9-A18F3FB85300,1 +668851,862364,2D4DBFA8-1BFC-4A13-9C79-2A12EF0F5BF7,10D0ADB4-6879-4F06-A1AA-02FE5CC34068,1 +9260542,11351705,4F523A0A-AFA0-4BCC-8A0C-DA3D2A1F5889,720B51B0-36BB-49C7-AC3E-267C0E0F5F5E,1 +3975011,5057470,100BC814-8B63-45CD-B9F1-CC2AE1823017,C59A4647-1360-4A56-8EA6-6D7C773473CD,1 +15597964,19585754,3237D991-9575-4786-B0C6-343437B61CF0,3C2ADEB3-2DAE-4BA7-8B03-934445E30D86,1 +665408,857100,99641859-1B2F-4C27-B143-C4882EAEA944,938693A4-50F7-4BCE-BB56-AFBE0F9C9FF9,1 +9089176,11002888,E32483F8-85DC-4D21-B838-6F687C5020C9,288AA563-5D89-4183-920C-ABCE28686642,1 
+16192751,20233910,7E7DD2CF-0DC6-4D99-8680-4BD1CD4A3427,4B1DDF5F-E9DC-4A22-BFC4-4385EB0A1DCC,1 +327529,2741177,0C71F762-78D7-44A5-942D-F210F96FD120,37F7958F-EC75-42A8-8433-E33AB6AAAD45,1 +14498120,18187352,6F3865B5-10DF-419C-9BEA-4F2B6D64F2B3,A29C825B-461D-40B9-A883-72A7658F5E2C,1 +15431772,19485652,7E5C79D9-491A-42D2-997B-92819B6421DC,C0D839D5-BFD0-4D8C-A1B5-DABBA7787216,1 +1891895,2594136,E8C83DB3-EE5F-429E-B333-EF539B940D5D,04BDA365-09E6-4A60-A439-9CD9D94B58BD,1 +6318452,7811316,7CA68065-D8C9-435E-BD00-0A60DFFA14C5,F9A32462-75DC-4E26-BCD8-CBAD1295CE6C,1 +1522889,2141717,FEB5EC76-09B8-4FF8-8430-C5E72D64B746,300F7D98-240E-4C21-923B-AA11BBB0A50C,1 +12103663,15313246,398A3A5F-8517-4747-8538-D45D8D67CB39,815D5763-B051-4B7E-B58E-3F740C67BF25,1 +7380768,8932240,0899EB7A-CC83-4F64-A98F-464504825412,2023B810-6745-407F-9A68-35B75AB6EEE9,1 +8932951,10888348,81C31581-F9D4-46EE-A093-B2D2E967A5C1,0A33746D-3E97-42D6-9A5D-5DD1ECD2E8EB,1 +9483336,11455803,D06D938A-3B1F-4BF8-9693-99D2D17D28DF,C26A209C-E179-4D59-AE74-77CF638E2D4C,1 +9751962,12016838,F472D277-ADA7-4C7F-B900-9A1E5B502BDB,CF7F0341-CEAA-466F-B313-957ECA01F9F3,1 +8499489,10404620,96EB4525-E241-46AD-BC71-762E5B9F279F,AF1868A2-D633-49A4-855A-790816DFBE85,1 +16629695,20757196,3AD9EF66-5E60-4CBD-9348-8DCCE7F76020,D9CEDF93-FC7F-46E0-B8BE-1FD7FC158899,1 +9191166,11275606,F75DA974-A09E-411F-BB24-7CD0D234EE13,528BE614-5C98-4CC0-813C-336E25D6EDC5,1 +6474166,7983035,F6B8CD56-A8B5-411A-8D73-F238181EEA6D,48990901-757F-412D-A596-56DF3ADF71EC,1 +2107240,2898164,78590915-8E1A-422D-8BA5-63564A2EAF4B,58304375-E2FA-4B9B-A11B-1FE8BDE19067,1 +4812468,6061611,FD7199EC-3477-4278-AC86-614B1F45EED6,87BDEF5F-74FB-44E6-AB3F-1F452B00670C,1 +654088,21287582,FA5B8B45-00F4-490B-8C92-3B5AE1308958,631AF661-1A4A-4250-8CC7-B51808A2E1C9,1 +1608012,2256777,390E329D-2C1E-405C-8BCF-F5588C0FD1BA,5CD8A2E9-6D63-40C1-AD21-DAFC892115C2,1 +16422111,20247264,44E9D537-4C3D-4AF9-96AE-66D79FC8C1D4,D7BB44A9-3E73-4B3E-8815-AAE1C67773CD,1 +1128631,4794888,7926086D-DD24-47CA-9784-237EB1D707D6,27DFD587-0F21-476B-8E9B-5F27C1F78874,1 +3845473,4921467,7CE304C0-598C-455F-BFE3-A360628A5133,8D4A9954-48DA-4083-87A8-CA4E297D6106,1 +13039692,16710174,8964B12C-0F0A-4269-951C-E30818B9AD47,BC869CD9-85D3-46A5-B28A-7C1A88126982,1 +1623654,2280269,E7386D54-AF33-46C8-832E-EB79F2B63C92,C8D13335-1B55-487E-9DD7-82DC0FA55CAA,1 +7602814,9815198,3C26FBB2-F6B7-44D4-93CD-05A87889B04A,15F40E5A-828A-4B02-A2E5-32767180A786,1 +2397312,3178364,773F8BC8-E824-462B-BF34-F7279FE854C0,EC8ED635-E08E-4C4F-A165-4311DA55BAFE,1 +2443294,3223491,5581C8A6-C4FD-4B27-80F6-90FCBA29F40C,07A57156-1C91-4AC5-8CA2-FEE4A394204E,1 +5337584,6429124,6C734136-9021-49E3-AD9A-403CB267DD40,9BB7626E-5A7C-4671-AEC9-EE155B6F006A,1 +15602927,19628124,4D44202F-5184-45A6-99CC-7CB66FFCEFB3,8B9BE878-7DE3-4847-AD99-385A44F3D932,1 +11212571,11386342,60222C0E-F837-4129-B19C-0DA06BC053EC,3B084733-D5AE-48A5-AF2F-333A21B3467F,1 +9260637,11351732,CE71113D-386E-4AE3-BFEA-6DD6F807B69D,C31E7272-46E6-4E37-AE34-1148B140A20F,1 +981244,1168628,D2160B98-7D8D-492E-8E83-37C848B9D8AF,884F1192-06FE-417C-ABBF-DB886F6D65B5,1 +10636349,1758588,4C93C91B-87D6-4427-B930-F3DCD9AA0388,B880311B-E040-4744-83C4-F1B1E1E597DC,1 +12584344,15525213,82E0A8E2-7E52-46BE-8C1E-8BBA853E5444,7BDC80E2-A1AA-43AC-A752-2045A913F481,1 +1504668,2092667,3B966E98-ECA3-4BF9-AE26-9D2A62152041,ECEDAA50-0EDA-46D4-B93D-1D44C6B77383,1 +14468126,18146200,7AC888D0-0F35-4787-AEBE-6A617E6BD722,DAE24FAE-4E20-40D9-BF76-99B5CD7173D2,1 
+1349115,1933524,0B6C4567-0BA8-4853-BFB6-69359DF86BB6,12D976C6-7006-4B98-AB76-22A35C0976EF,1 +1221001,1743764,4BFE05C6-C459-47C0-92A3-B8657CA84F87,4E1DFE6A-E51B-48ED-994F-95E23D2D2227,1 +3625371,4678488,670AADD0-9471-4194-841F-EC1B5538F0BD,2117D4A2-DAC7-4D9E-A582-1C3295D2F7EB,1 +14814455,18685150,FC3F7208-417B-4256-B2C4-30A96C68C166,DAF5068D-39E4-47DB-A4A9-0874CCE5457A,1 +14565030,18257670,4C500E6F-D3EF-490E-8BB8-B5E1FCC22331,27E31A6C-0DCF-4D9D-89EE-77A994CAFE4A,1 +8484185,10408952,C0B73ABF-E522-45A4-BCD6-EECE14BEA37F,77D94CD8-CA3A-4DD6-8DB5-A221D63D05C7,1 +13920064,17662446,A66C1CB8-A302-44D9-8B99-BEA682DCDDB4,4D18EFEF-F12B-47DE-928B-C78D77BC5060,1 +1491456,2111238,53E6E92B-346C-4C42-8A08-B7FE8DCEBC3E,61EAD1F2-1D5D-4EEF-BF29-79FB0AFB4D5D,1 +7286833,8975705,D8FD15D8-30F7-47A4-8CEC-84757F70779D,EF480CF9-C83A-457E-B63D-1BF34309CDFE,1 +4046750,5148424,AA84ED81-6F9D-4AFB-9F41-95AB4E89BC3B,470A669B-B845-41CE-B9A8-E6657CD05ECC,1 +9023606,3127674,2A802850-6369-40D4-9FAA-57266C1FD14A,6CE594A4-CE0F-4FD5-AC2E-D2E8B7065A1F,1 +1631294,2291584,4B03C0D0-0AFA-4CB3-A5CF-249D2616C3FA,41B2D355-BEED-46E4-BF0F-E8A993F41018,1 +1510312,2117584,68D396C6-59FE-497F-A207-2692BBE503E8,BEF7E936-211B-438A-BCDA-020991959E57,1 +14184044,17830372,7AB90477-5ADE-4BE6-BC92-25D2C4153425,02CF79E5-B07E-4001-BFD4-A7B6987AE6CD,1 +10681717,11651335,D6B1DE1E-79C5-4C0E-A4BB-18658E856B24,564B13BC-F740-4580-B415-6D44F44BAF7C,1 +1556523,2181268,11E2C516-3697-49A9-AFDE-B485605BDD16,40D41D81-4B86-45DA-AE29-AA5A9B05A4F7,1 +11556622,14473889,C5765525-D0C5-47E2-B151-096A0AA57589,841E46FF-416E-4CC2-A582-B26BC829432B,1 +12716823,15943481,2E6D21B7-33E9-49C3-A36C-558255C9EF36,176566DF-3E43-4402-AF1E-CE3939197CF0,1 +4608274,5786787,7FE4CF3C-81DC-4E57-83E8-3A782A7FE6C7,E22D818D-D1B9-4567-9D29-480C0D89CAAA,1 +14451685,18051644,B9AEC5FE-33FF-48B4-A978-D7E6AD112778,09E469AF-3543-48B3-A3D2-D6209A1FC000,1 +7092800,8717971,E6D06033-5680-4474-8888-3F104E158E59,B579EBE6-0830-4431-819C-517DB15B41D4,1 +652942,822997,BB667130-894F-4B45-967A-827196D5E1DF,C7EF0F7C-00FE-4574-9C3D-0FCF0F9A0429,1 +11341967,7454619,FDA402DC-363D-45F4-B907-23A3A674BD91,716AA2BF-EDEA-438D-AAAC-871B76D89440,1 +16870677,21257214,5CA29C33-DCF3-4657-8A1E-D72C76164586,F78CBBDE-BE30-47D0-A2D2-5FAF969ED4B9,1 +7022578,8427459,277B9C4F-1AB9-4512-B31E-C0EBF3339EC6,162F0EC6-37C9-4C77-847F-FD471A4CA420,1 +9401034,11864668,C06E1386-4AFD-4601-A0AA-93DA16FBC0AC,7C1EF683-07F3-45BB-8D41-0C05C193EF03,1 +12703153,20436010,E6D391FB-A72E-4E1C-A534-96B5397F44F9,54FD3583-97AE-4810-8443-EC8843DAB3CB,1 +671716,867409,AFB06221-B623-4EF0-86D1-DF42B3836D7F,E8414782-3059-44B9-89C8-FB499B5C617B,1 +5156618,6766491,2977C7FB-D007-4FE6-B308-5CB0CE1AF0D2,CA0493C7-87E4-48D3-8463-2BF6865686BD,1 +8920192,10879740,19E4E030-5CA9-417B-BF3F-02AEC7D2CB08,3C8BB477-4C40-44E6-A97C-67D310D22C66,1 +1190818,1725299,708FC5FF-1B66-40C6-9BB6-89998B10050A,662215FC-C328-4829-9971-B61BBDAB332E,1 +12568882,15554462,6CE9D169-B7E2-49CD-B565-FB5D82180ABF,12040003-21C5-4CB0-9700-8CBE6F02C734,1 +201463,282531,F4E603E0-1662-4299-BCD1-1319935D1A9F,31A1A52F-2928-4F11-8766-3B05C90BEC4A,1 +16877137,21246012,E33498BD-CFDC-484F-AD04-3B6C291DCD4C,85EB305A-8793-44A2-A589-F39B4E4CA6DB,1 +16488194,1046669,A84ACEFA-6536-410A-A0CE-85D6EDECD3B1,B19D65DD-0157-4876-BC7E-B4EEF44AA766,1 +2564287,3361545,E06FA808-84A2-4A86-ABD6-EC8BD66F6924,9979D70B-F83C-4F33-9C85-398BB6C039EF,1 +2641043,3450359,CEE92BC2-7FDF-4926-9371-FBD2DF0A7248,65CF9B9B-C734-4C15-8E76-B95E63A8C6B5,1 
+5915887,7348879,75DF9A60-739F-48CF-9EB6-7C94D1EE26A9,753927EB-4A70-4ECF-9687-4786DDD68855,1 +7015985,8439453,659853FF-F627-4E34-9EDA-79035768AD8A,9F10A272-F77B-4196-86A7-4A4F731D8001,1 +12050449,14395052,395E4896-0B9D-4DE8-A22A-68D323CE11A0,627FAF06-1E42-4D4B-A048-00AEE25E1989,1 +4795881,6026740,46517E20-AEB0-48E7-9F52-E0F766D3FF3F,A32E3507-6985-4E9E-B579-20A720DB9407,1 +965802,1472376,A15892EA-6808-47A8-AC1B-4AD83EB19575,6CB9F3A4-7A33-43E4-8582-8334CCBFA554,1 +12705576,15986454,041147F1-BB39-4E32-BB36-E4294A67B2B9,34D8500B-08B7-41A9-A16F-CBFC876000B0,1 +2491466,4626248,9CA9F53E-287C-490A-97B1-08A34274206D,BC5E990A-7CD7-4613-B981-E1DC12FE1D0F,1 +2172995,2946154,71E419E4-0295-4EAA-8885-27893CAB4493,288C8C9F-CF9E-49B3-A235-15E59AA80CF2,1 +16879144,21043428,DCFD3EC9-DD04-467B-9E06-1ED0E9A18758,60C87895-1533-46BB-B0AD-82E6D8E162DF,1 +5526574,6889113,9F96FC25-5211-4A5B-8691-0F0098F2C60E,6E96723F-DAA4-4E7C-A69D-438ACF365717,1 +4157133,5283798,9E0E33DC-B1A7-48A2-8A11-3025095E99C9,68E8D03F-7682-4DA2-9C27-69E9424C3841,1 +7223293,8913733,76FAF62F-4DD7-4474-BA18-4B8A9B7F49AE,CA4E830A-60F5-4615-AF84-23CF73CB8982,1 +365222,263585,D3EB170D-564B-481B-A589-22A4EA362FB7,327B9E7F-7400-4936-AC0C-7DE3ADAFC796,1 +8495290,10408325,7B3B20F8-778C-418E-9B64-E2A1F1AB299A,953A8B9E-87BF-4C53-BE02-29B4EF2F610D,1 +16109626,20326472,0E265ADF-FFA1-4A0E-B546-56881E214AE7,BE8DCE11-E822-48A0-8D64-3267847A4EEC,1 +1571362,2207043,E8BFC660-6287-4261-878E-5E2535A96262,EBB930FC-CD1D-4FD3-9579-C874238C8F83,1 +8472502,10357952,0E3081E2-DC38-4E58-BAD8-780E8DE2ED53,5FD3FB4B-FC95-4F72-8286-022E2AEE5CA1,1 +696140,1320436,1897CBD7-2D79-4FEA-9836-2461ABB754C3,924B1714-40E5-4B53-9DAC-1111AD0FE654,1 +16628043,20754532,01222EAB-C4DC-4146-A50C-A8AAFBF560F8,9FBDBF6D-6433-4B39-96E1-E82EF37DFECA,1 +199144,9996036,2E26AF49-F52A-4741-893A-B16F43A5C7C2,EE7891B2-15E2-4039-B947-DAA109D909E6,1 +5713778,7108828,FBA295DE-4FCF-4BA6-B277-A2C5EA994214,A73CF722-AF91-418A-A175-3C900FD7B248,1 +15399182,19213976,2DC7D9D2-ECC6-4762-92BC-CD76C2F5C230,DBEF8588-6FCB-46A8-B228-A354B8FAA602,1 +1387678,14146077,2A4C86C6-906D-4BE6-A2ED-EE4A6262EEBF,6641DA6F-D400-43E2-9A81-31D4FE9DCBA4,1 +2847697,3812269,54B4C117-3DAD-417B-9FCF-F6AD6D44DD41,E2A81C1C-52F7-4B0C-8081-349FF671611D,1 +4473976,5649316,FC319E69-9F99-41BB-BE44-3D4EB5F8BC86,9A0DC4A5-C5FD-4D96-862A-0C9ADCB3A56F,1 +7409287,9129859,C9A94577-6FE3-410B-BBA1-7C07D3F0440C,D8518039-4657-4484-9A8E-89D329A21A7A,1 +15322495,19453944,209CA70C-BC86-4563-B95B-AA1756D359A8,7570ECC7-24EF-41DE-9B5C-4390972C96E1,1 +9286068,11703527,C18954FA-A811-46BC-86B3-EC78214AD708,93387C86-C978-4595-98F9-7572002D0EC1,1 +8944451,10883910,4B618B24-EA3C-4EF8-9571-2635E9BD6627,725C7FEC-B088-4B8B-8D32-59E94E0A08BD,1 +12616372,15740034,021B8C8C-85C4-486D-A14D-943F58B179C2,71FDA186-F7CE-4757-BBDB-FD1A05C49750,1 +9082245,5558339,DDD90445-A77D-46F7-8B55-C61F05728D35,1E24EDDF-8CEA-4130-BE61-0980A504C9FA,1 +1683865,2365803,0C859208-CEDE-483C-9F2D-B6580BA6F616,6C580A1D-0DC2-4E37-A996-B433B7A075CC,1 +2663514,19619270,838E8AE5-79F0-460B-817E-AF61C444B34E,A4C0141B-0361-4E68-85E4-DB91496293C5,1 +7478735,9178850,3A91F8CD-F643-4133-A260-BFAA9601040E,3FDDE39C-79A4-43D0-A2FC-DF8884335DB8,1 +16908502,21314382,B86480AD-B6BD-456B-A873-CF50722624C9,62CE175E-063A-430F-8193-8A1C9A165D71,1 +6714646,8083433,34E9A4AA-F18C-49C6-88ED-A9E39FEDE83D,3E930AEF-B195-43BE-95AD-8DC979B89C0E,1 +8917650,10985667,F1E3306D-E51E-4C5F-9AF9-EE5B9F5B46CD,7B6DFE3E-27F0-4AC9-B68B-E046D41D5611,1 
+10176167,12980353,5971CB65-7209-4A09-BD98-82EB5CD6257C,3C181DA2-DA7B-417C-B57D-3305CC0109EA,1 +1511953,2123166,8AAD33A2-37E3-4133-BDB4-D32B3C46641B,404BA763-7F56-490D-B472-3B38E3B248A7,1 +2536652,4629317,7BA01363-1041-405B-941F-3E82931AEA23,58B9C6D9-BD7A-4A4B-893D-3F7BE3072AB3,1 +3435520,4443694,4B2E33A8-B26D-4994-990C-C85EC89EAA94,F5BF9921-1286-4BC2-ABAB-F9A4F3401F85,1 +10317809,13118611,9350169B-4A99-46DD-9749-54DA81C5B0BA,2BC0BDA4-F5D8-4A51-8728-9285EB17F911,1 +12631886,15832340,E92DEC78-BA13-44CD-8443-BBC1C638C373,FC542E24-190B-4769-9DFD-C1A8BFB56E08,1 +16406322,20927958,BCBC093E-2BB5-4F86-B631-B1F6201EDED1,888DCA56-8DC5-4307-B322-59A954C26C8B,1 +6371418,7888369,FF823C55-C3EC-46A9-9EA0-BABCFAFD482E,70D07E28-A32F-45BB-B69B-00464DB33171,1 +13682474,16414374,C9C009AC-2BD2-4628-9CBF-F74637E3FDDF,4016BF2E-EFAE-4141-AE56-347B03DFAE8F,1 +7303747,8927562,770AC45E-F610-4808-8195-35DBBE70A6FC,5251B975-9FDC-4820-BAE6-D7498A0E7878,1 +5278925,6557147,7F0EA777-9E57-4FC7-90EE-BCCE833328B5,8B752891-ADD3-49A5-BBB6-9EE844B26FDE,1 +16239282,1441469,CB917EBF-419D-4ADC-A746-AF510C06F24E,F7891DDE-4A87-4AA7-A6D7-24A99B9F03D3,1 +1563736,2126259,86E115F2-D01E-4AFD-9871-2775FD811BE4,7DB9A547-B03E-4033-8494-82AEE80F0FE2,1 +7001625,14631561,BCDECC56-3942-4AF5-B6DD-6D6D019BA9BA,A11ECF8B-8EED-4F51-BC0F-FB12206D2DCC,1 +1567680,2202077,A8AB9450-AB19-40AB-AEDF-98E746FC4C99,E80CC996-7570-4874-89E8-8466FFCB9015,1 +6589324,8621054,70C07EFE-6DE5-4D96-95AD-1DFD0A0F8FCF,F7DBAABD-AAB7-4B02-B31F-8CD9465818AA,1 +8347109,10018146,D12C4B2F-7269-4A92-A2B3-3A245792E0F9,7238E6E5-5E08-4ED0-9E91-0C508FF33EDC,1 +1215961,1780134,BF5BC383-AA41-4A82-A9AD-EDFF3C2867CF,C3B941A3-B8B0-4974-8793-AFE029B3E38B,1 +15377692,19315198,53A26044-FE6F-4813-8DC4-0429F1B83FDA,856A1E25-B43B-4CE5-8FCE-41FD1E3DBBD6,1 +10529562,13467593,5DF2C6C6-3611-451C-B543-B69B6AB7AFA5,59055075-1843-4A07-8F38-F8D2974A6A9B,1 +1055898,1277590,D0B283DC-C461-43AE-9D6D-A489CE0132D3,4512229E-CD73-42E0-A044-78C008986DE5,1 +2123554,2888878,010383B3-2D98-4BEE-9AF5-B44324FD5B73,3055FBDA-6F9E-4154-B183-818675A38EBC,1 +882375,944789,3A455A81-59CF-4FF0-8AAE-DB2618A5D11B,0D93E1D6-CE85-4B58-A07F-A16F51986874,1 +6235338,7550574,7D288831-8D42-4F3D-BA43-45DDA9335E08,6B805EF0-A4F4-44D0-8287-9FE9F7BCD0B5,1 +4452111,5616636,76593E0E-8E1E-417D-BACF-B69C6A2760B8,7C17B669-8404-4E0D-883D-5FED2F4A1035,1 +12664759,15993428,562D0F70-8469-45BF-97D4-B86027A9C9C7,9DA068AE-790D-48F0-B8FD-C9E45461C839,1 +4458015,5624799,940BCA63-DA23-446E-8763-9C038C4DB40D,0C492E3D-3813-48E5-98CD-CEFD2B738A57,1 +5230111,6500664,60E8EF2B-3A50-45B7-B853-CF80BB669689,BE9A401F-9A51-4303-8CA1-A587A3FF3F86,1 +6631768,8685710,31FC2ACE-8020-47F3-B1B2-D15F035CFE85,DE9857E4-0A0E-4CF5-9DA5-BE9FB70343CE,1 +12654868,15846976,A4812518-C06C-4B1C-ABC1-6EE836F0AC46,28036AD6-049F-481C-8CF4-7BD00ECF40D3,1 +484086,761724,1ED2086D-219A-4967-ACC7-F41C6B248676,FECD5670-A932-4C91-B52B-A4A406C7D5EB,1 +7260968,10984312,2ED05003-E93B-4210-AE53-89A442D6DD99,28F47E7F-760E-4992-AD46-B846A2445048,1 +6303526,7794837,C8F49E66-CB2C-48BF-8D93-861BD8D9BACB,CBBB1E4E-F1A5-45D1-BC32-1CDE1E07C35D,1 +2635307,19595480,455F0749-94D8-42E3-BCE4-ABEA4A34850B,62D7A576-30D1-4D41-ABAD-D73263B8993E,1 +16247979,15850868,55661B74-370C-4A36-A122-647521D8485D,6E46458F-65C2-497E-996F-6E2004AA84F3,1 +11631410,14303673,0AACB99B-1226-4F23-9FB2-AD93FE6C69D9,9FB75EC2-83EC-44E3-A491-1B7939D776AD,1 diff --git a/hlink/tests/input_data/matching_test_a.csv b/hlink/tests/input_data/matching_test_a.csv new file mode 100644 index 0000000..b20ed20 --- /dev/null +++ 
b/hlink/tests/input_data/matching_test_a.csv @@ -0,0 +1,59 @@ +id,namefrst,namelast,birthyr,sex +b5689d06-edd3-498e-8b5b-e04f2fa2f2a9,Catherine,Beebe,1866,2 +a7118f06-949d-4d02-be0a-db33a6f8f3a8,Frances E,Bird,1870,2 +85d089c0-b907-4d9c-95ab-c5fa4a3dd2bb,J S,Luff,1861,1 +cddd9455-48e0-4b48-89a5-9ee315e00087,John,Smith,1884,1 +8cb74256-6dfa-4d17-913a-59fa646c388a,Saml H,Russell,1833,1 +1f8e1a74-d486-44ad-8d5c-51aedf86208e,Charles,Robertson,1884,1 +61a1590f-1d3a-4666-8406-3d4aaf0770b4,John,Dickinson,1868,1 +92277f0b-1476-41f5-9dc8-bf83672616d0,Joseph,Shissler,1874,1 +322291a1-de91-439d-bba0-45fc2f47a2eb,David,Hall,1839,1 +136f7105-ff59-4eac-9d95-44b002cbb448,John,Decame,1858,1 +1138ab41-e234-4c72-b812-eaaf0fc5f76c,Nancy,Decame,1857,2 +066ea4e1-f340-4231-b505-ec7bb9a07103,Peter N,Decame,1895,1 +b7d96336-404e-490c-8c45-61f2287b52ff,Annam,Decame,1897,2 +24bdff6a-5590-4494-8e8a-ac4a549c8890,Sarah,Decame,1900,2 +c1fedaab-f026-4aa4-9320-e10f2432d539,James,Carney,1888,1 +43a6ebe5-752b-4054-818d-6f6f75cc89e7,Alfred,Dell,1883,1 +0d693015-2349-4363-9667-45036af7d0db,Chas,Syaex,1870,1 +1d586e26-aac1-49df-a2ad-fe0a385a26bf,Sarah,Russell,1897,2 +93b7ac89-f9db-49b2-a1f2-c189fecc14ae,Wm H,Hazard,1881,1 +e51c36c9-570c-466d-aac1-bf380c9c20f1,Martha,Hazard,1880,2 +9250341a-8336-494a-bc84-2b803efe64c6,Willie May,Hazard,1902,2 +a70679f0-9313-4ef3-bf87-5dfe81beed5d,Samuel,Hazard,1906,2 +4715bbf6-d3e2-4260-9ddd-6aece147e5c1,Samuel,Morgan,1878,1 +77378570-5214-4ac5-8258-c5156e8b99b3,J Clauson,Mcfarland,1890,1 +6542b541-6e10-411f-9b2a-7c0b93b0aa68,Eugene,Mcfarland,1892,1 +396c4077-6a70-4a17-97fb-f8a0c06fdafe,Anna,Preston,1871,2 +7e9dde5e-3fad-4b2e-b367-643c0dc8cabb,Rebecca N,Alexander,1861,2 +f7d9e25f-c390-4222-ac24-4e93d72daa05,Martha,Ellis,1873,2 +24b7afa1-8c49-4833-8292-c545c85d3b89,Otillia,Zeider,1876,2 +4b416874-0c5c-4233-81ec-39223bc66f4f,Mary,Doyle,1846,2 +a499b0dc-7ac0-4d61-b493-91a3036c712e ,ANNIE ,FAUBLE ,1884,2 +ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,MARY ,REESE ,1875,2 +ad6442b5-42bc-4c2e-a517-5a951d989a92 ,MARY ,REESE ,1899,2 +b0b6695f-dfa5-4e4d-bc75-798c27195fff ,SALLY ,REESE ,1901,2 +9e807937-de09-414c-bfb2-ac821e112929 ,JOHN ,SHIELDS ,1889,1 +426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,ANNE ,FAUBLE ,1884,2 +a76697d9-b0c8-4774-bc3e-12a7e403c7e6 ,JOHN ,COLLINS ,1893,1 +3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,MAGGIE ,COLLINS ,1894,2 +49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,MARY ,COLLINS ,1898,2 +50b33ef6-259d-43af-8cdc-56a61f881169 ,WILLIAM H. ,SEWARD ,1856,1 +952754a5-48b4-462a-ac57-e4a059a9ef98 ,ESTHER ,BIERHAHN ,1870,2 +ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,CHARLES ,CLEVELAND ,1865,1 +60a5052e-6d67-455a-a3aa-bb79560c7d8d ,SUSAN ,WILSON ,1850,2 +0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,ARCHER ,HARVEY ,1890,1 +65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,ELIZABETH ,MC LEAN ,1868,2 +72cbe5fa-f558-4393-8423-1842fadf7f11 ,MARY A. ,FLEMMING ,1837,2 +44693008-fd6f-48fe-9c52-e6c07baff361 ,BESSIE ,CHAMBERS ,1908,2 +bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,THOMAS ,GRAHAM ,1846,1 +a7b10530-b7c9-44d5-9125-c603f392d6d3 ,EDWARD ,DEKAY ,1875,1 +1e635c1c-7faa-4270-acf3-a22635884b90 ,NATHEN ,THORPE ,1836,1 +d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,JOB ,FOSTER ,1884,1 +2a35bae5-3120-4e2c-87da-694d4419c9ce ,JEZEBEL ,FOSTER ,1888,2 +94460fc2-954b-469d-9726-f7126c30e5e2 ,ELIZA ,GOODWIN ,1871,2 +620b6ebb-82e6-42db-8aae-300ca2be0c00 ,MARY ,GOODWIN ,1893,2 +bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,JO ,GOODWIN ,1895,1 +7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,PHINEAS ,TAYLOR ,1871,1 +a0f33b36-cef7-4949-a031-22b90f1055d4 ,MARY A. 
,LORD ,1856,2 +1a76745c-acf8-48a0-9992-7fb10c11710b ,E.B. ,ALLEN ,1889,1 diff --git a/hlink/tests/input_data/matching_test_b.csv b/hlink/tests/input_data/matching_test_b.csv new file mode 100644 index 0000000..1fb788d --- /dev/null +++ b/hlink/tests/input_data/matching_test_b.csv @@ -0,0 +1,27 @@ +id,namefrst,namelast,birthyr,sex +a499b0dc-7ac0-4d61-b493-91a3036c712e ,ANNIE ,FAUBLE ,1884,2 +ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,MARY ,REESE ,1875,2 +ad6442b5-42bc-4c2e-a517-5a951d989a92 ,MARY ,REESE ,1902,2 +9e807937-de09-414c-bfb2-ac821e112929 ,JOHN ,SHIELDS ,1889,1 +426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,ANNE ,FAUBLE ,1884,2 +a76697d9-b0c8-4774-bc3e-12a7e403c7e6 ,JOHN ,COLLINS ,1893,1 +3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,MAGGIE ,COLLINS ,1894,2 +49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,MARY ,COLLINS ,1898,2 +50b33ef6-259d-43af-8cdc-56a61f881169 ,WILLIAM H. ,SEWARD ,1866,1 +952754a5-48b4-462a-ac57-e4a059a9ef98 ,ESTHER ,BIERHAHN ,1870,2 +ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,CHARLES ,CLEVELAND ,1865,1 +60a5052e-6d67-455a-a3aa-bb79560c7d8d ,SUSAN ,WILSON ,1850,2 +0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,ARCHER ,HARVEY ,1893,1 +65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,ELIZABETH ,MC LEAN ,1868,2 +72cbe5fa-f558-4393-8423-1842fadf7f11 ,MARY A. ,FLEMMING ,1842,2 +bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,THOMAS ,GRAHAM ,1846,1 +a7b10530-b7c9-44d5-9125-c603f392d6d3 ,EDWARD ,DEKAY ,1875,1 +1e635c1c-7faa-4270-acf3-a22635884b90 ,NATHEN ,THORPE ,1836,1 +d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,JOB ,FOSTER ,1884,1 +2a35bae5-3120-4e2c-87da-694d4419c9ce ,JEZEBEL ,FOSTER ,1888,2 +94460fc2-954b-469d-9726-f7126c30e5e2 ,ELIZA ,GOODWIN ,1871,2 +620b6ebb-82e6-42db-8aae-300ca2be0c00 ,MARY ,GOODWIN ,1893,2 +bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,JO ,GOODWIN ,1890,1 +7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,PHINEAS ,TAYLOR ,1871,1 +a0f33b36-cef7-4949-a031-22b90f1055d4 ,MARY A. ,LORD ,1856,2 +1a76745c-acf8-48a0-9992-7fb10c11710b ,E.B. 
,ALLEN ,1889,1 diff --git a/hlink/tests/input_data/nativity_test_data_a.csv b/hlink/tests/input_data/nativity_test_data_a.csv new file mode 100644 index 0000000..62d1e70 --- /dev/null +++ b/hlink/tests/input_data/nativity_test_data_a.csv @@ -0,0 +1,7 @@ +histid,pair_no,nativity,county,state,street +a1,0,0,100,10,4th Street +b2,1,0,100,10,Broadway +c3,2,2,100,10,Bway +d4,3,2,100,10,4th Avenue +e5,4,5,100,10,Main Street +f6,5,1,100,10,Main Street \ No newline at end of file diff --git a/hlink/tests/input_data/nativity_test_data_b.csv b/hlink/tests/input_data/nativity_test_data_b.csv new file mode 100644 index 0000000..8e6847e --- /dev/null +++ b/hlink/tests/input_data/nativity_test_data_b.csv @@ -0,0 +1,7 @@ +histid,pair_no,nativity,county,state,street +300c,0,1,200,10,4th Street +48b,1,1,100,20,Broadway +29i,2,5,100,10,Broadway +ll42,3,2,100,10,4th Street +t4r,4,5,100,10,Main Street +cas,5,1,200,20,Main Street \ No newline at end of file diff --git a/hlink/tests/input_data/popularity.csv b/hlink/tests/input_data/popularity.csv new file mode 100644 index 0000000..88ad191 --- /dev/null +++ b/hlink/tests/input_data/popularity.csv @@ -0,0 +1,9 @@ +id,sex,namefrst,namelast,birthyr,bpl +0,2,molly,brown,1900,2300 +1,2,molly,brown,1910,2300 +2,2,molly,brown,1903,2300 +3,2,molly,brown,1898,2300 +4,2,molly,jones,1900,2300 +5,2,molly,brown,1898,4600 +6,1,molly,jones,1900,2300 +7,1,molly,brown,1898,4600 \ No newline at end of file diff --git a/hlink/tests/input_data/potential_matches.csv b/hlink/tests/input_data/potential_matches.csv new file mode 100644 index 0000000..df90c6c --- /dev/null +++ b/hlink/tests/input_data/potential_matches.csv @@ -0,0 +1,25 @@ +namelast_clean_a,namelast_clean_b,id_a,id_b,bpl_a,bpl_b,namefrst_unstd_a,namefrst_unstd_b,sex_a,sex_b,namefrst_jw,namelast_jw,regionf,state_distance,exact,exact_all +cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 +symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00 +abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 +eilbatt,eilbott,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,4700,4700,reginald,reginald,1,1,1.0,0.9428571428571428,6,0,1.00,0.00 +knopke,knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,2100,andrew,andrew,1,1,1.0,1.0,6,0,1.00,1.00 +caldwell,caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,15010,15010,daisy,daisy,2,2,1.0,1.0,99,0,1.00,1.00 +sonnenschein,sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1700,1700,max,max,1,1,1.0,1.0,3,0,1.00,1.00 +gibson,gebson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,5500,5500,dwight,dwight,1,1,1.0,0.9,3,0,1.00,0.00 +hegewald,hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,5600,karl,karl,1,1,1.0,1.0,8,0,1.00,1.00 +king,king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,3800,virgel,virgil,1,1,0.9333333333333333,1.0,4,0,0.00,0.00 +looney,looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,4700,4700,sadie,sadye,2,2,0.9066666666666667,1.0,6,0,0.00,0.00 
+rydstrom,rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,1700,hubert,hubert,1,1,1.0,1.0,3,0,1.00,1.00 +mugrdickian,mugrdichian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,3600,misak,misak,1,1,1.0,0.977961432506887,2,0,1.00,0.00 +brightman,brightman,195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,3900,austin,anstin,1,1,0.9,1.0,3,0,0.00,0.00 +harman,harman,74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,5400,eston,estan,1,1,0.9066666666666667,1.0,5,0,0.00,0.00 +oglesby,oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,4000,stephen,stephen,1,1,1.0,1.0,7,0,1.00,1.00 +kassik,kassek,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,5600,5600,james,james,1,1,1.0,0.9333333333333333,8,0,1.00,0.00 +wood,wood,EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,1700,dudley,dudley,1,1,1.0,1.0,3,0,1.00,1.00 +foulkrod,foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,4200,s,s,1,1,1.0,1.0,2,0,1.00,1.00 +huges,hughes,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,100,keneth,kenneth,1,1,0.9666666666666667,0.9611111111111111,6,0,0.00,0.00 +caldwell,caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,5000,nathan,nathan,1,1,1.0,1.0,1,0,1.00,1.00 +platta,platts,E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,1200,norman,norman,1,1,1.0,0.9444444444444444,5,0,1.00,0.00 +lipscomb,lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,1300,roy,roy,1,1,1.0,1.0,5,0,1.00,1.00 +woodburne,woodburn,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,200,walter,walter,1,1,1.0,0.9925925925925926,9,0.00,1.00,0.00 diff --git a/hlink/tests/input_data/potential_matches_agg.csv b/hlink/tests/input_data/potential_matches_agg.csv new file mode 100644 index 0000000..1416be9 --- /dev/null +++ b/hlink/tests/input_data/potential_matches_agg.csv @@ -0,0 +1,31 @@ +namelast_clean_a,namelast_clean_b,histid_a,histid_b,bpl_a,bpl_b,namefrst_unstd_a,namefrst_unstd_b,sex_a,sex_b,namefrst_jw,namelast_jw,regionf,state_distance,exact,exact_all +cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 +cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 +cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00 +symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00 +symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00 +abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 +abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 +abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 
+abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00 +eilbatt,eilbott,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,4700,4700,reginald,reginald,1,1,1.0,0.9428571428571428,6,0,1.00,0.00 +knopke,knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,2100,andrew,andrew,1,1,1.0,1.0,6,0,1.00,1.00 +caldwell,caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,15010,15010,daisy,daisy,2,2,1.0,1.0,99,0,1.00,1.00 +sonnenschein,sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1700,1700,max,max,1,1,1.0,1.0,3,0,1.00,1.00 +gibson,gebson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,5500,5500,dwight,dwight,1,1,1.0,0.9,3,0,1.00,0.00 +hegewald,hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,5600,karl,karl,1,1,1.0,1.0,8,0,1.00,1.00 +king,king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,3800,virgel,virgil,1,1,0.9333333333333333,1.0,4,0,0.00,0.00 +looney,looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,4700,4700,sadie,sadye,2,2,0.9066666666666667,1.0,6,0,0.00,0.00 +rydstrom,rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,1700,hubert,hubert,1,1,1.0,1.0,3,0,1.00,1.00 +mugrdickian,mugrdichian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,3600,misak,misak,1,1,1.0,0.977961432506887,2,0,1.00,0.00 +brightman,brightman,195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,3900,austin,anstin,1,1,0.9,1.0,3,0,0.00,0.00 +harman,harman,74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,5400,eston,estan,1,1,0.9066666666666667,1.0,5,0,0.00,0.00 +oglesby,oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,4000,stephen,stephen,1,1,1.0,1.0,7,0,1.00,1.00 +kassik,kassek,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,5600,5600,james,james,1,1,1.0,0.9333333333333333,8,0,1.00,0.00 +wood,wood,EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,1700,dudley,dudley,1,1,1.0,1.0,3,0,1.00,1.00 +foulkrod,foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,4200,s,s,1,1,1.0,1.0,2,0,1.00,1.00 +huges,hughes,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,100,keneth,kenneth,1,1,0.9666666666666667,0.9611111111111111,6,0,0.00,0.00 +caldwell,caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,5000,nathan,nathan,1,1,1.0,1.0,1,0,1.00,1.00 +platta,platts,E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,1200,norman,norman,1,1,1.0,0.9444444444444444,5,0,1.00,0.00 +lipscomb,lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,1300,roy,roy,1,1,1.0,1.0,5,0,1.00,1.00 +woodburne,woodburn,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,200,walter,walter,1,1,1.0,0.9925925925925926,9,0,1.00,0.00 \ No newline at end of file diff --git a/hlink/tests/input_data/potential_matches_ids_only.csv b/hlink/tests/input_data/potential_matches_ids_only.csv new file mode 100644 index 0000000..6ed7b95 --- /dev/null +++ b/hlink/tests/input_data/potential_matches_ids_only.csv @@ -0,0 +1,10 @@ +id_a,id_b +10,10 
+10,30 +10,50 +20,10 +20,30 +20,50 +30,10 +30,30 +30,50 \ No newline at end of file diff --git a/hlink/tests/input_data/potential_matches_sql_condition_marst_warn.csv b/hlink/tests/input_data/potential_matches_sql_condition_marst_warn.csv new file mode 100644 index 0000000..b4d8b7d --- /dev/null +++ b/hlink/tests/input_data/potential_matches_sql_condition_marst_warn.csv @@ -0,0 +1,37 @@ +id_a,id_b +0,36 +1,37 +2,38 +3,39 +4,40 +5,41 +6,42 +7,43 +8,44 +9,45 +10,46 +11,47 +12,48 +13,49 +14,50 +15,51 +16,52 +17,53 +18,54 +19,55 +20,56 +21,57 +22,58 +23,59 +24,60 +25,61 +26,62 +27,63 +28,64 +29,65 +30,66 +31,67 +32,68 +33,69 +34,70 +35,71 \ No newline at end of file diff --git a/hlink/tests/input_data/predicted_matches_reporting.csv b/hlink/tests/input_data/predicted_matches_reporting.csv new file mode 100644 index 0000000..c30319a --- /dev/null +++ b/hlink/tests/input_data/predicted_matches_reporting.csv @@ -0,0 +1,11 @@ +histid_a,histid_b +0,0 +3,3 +4,4 +8,8 +11,11 +14,14 +17,17 +19,19 +20,20 +24,24 \ No newline at end of file diff --git a/hlink/tests/input_data/predicted_matches_test.csv b/hlink/tests/input_data/predicted_matches_test.csv new file mode 100644 index 0000000..f20ecc0 --- /dev/null +++ b/hlink/tests/input_data/predicted_matches_test.csv @@ -0,0 +1,6 @@ +histid_a,namefrst_a,namelast_a,serialp_a,histid_b,namefrst_b,namelast_b,serialp_b,prediction +1000A,Albert,Johnson,1,1000B,Albert,Johnson,7,1 +1001B,Mary,Johnson,1,1001B,Mary,Johnson,7,1 +1002A,Steve,Johnson,1,1003B,Steve,Johnson,7,1 +1002A,Steve,Johnson,1,1007B,Steve,Johnson,8,1 +1003A,Robert,Johnson,1,1004B,Robert,Johnson,8,1 \ No newline at end of file diff --git a/hlink/tests/input_data/prepped_df_reporting.csv b/hlink/tests/input_data/prepped_df_reporting.csv new file mode 100644 index 0000000..578008e --- /dev/null +++ b/hlink/tests/input_data/prepped_df_reporting.csv @@ -0,0 +1,28 @@ +histid,race_div_100,relate_div_100,region,bpl_clean,namefrst_unstd,namefrst_std,namelast_clean,statefip +0,1,1,2,36,bruce,bruce,wayne,36 +1,1,2,3,17,julie,julie,wayne,36 +2,1,3,2,36,tony,tony,wayne,36 +3,1,5,10,150,emmett,emmett,wayne,36 +4,1,1,2,36,billy,william,kaplan,36 +5,1,11,2,36,teddy,theodore,altman,36 +6,2,1,4,19,jennie,jennifer,jones,36 +7,1,1,4,27,maggie,margaret,dewey,27 +8,1,3,4,27,david,david,dewey,27 +9,1,3,4,27,orson,orson,dewey,27 +10,1,12,4,27,janie,jane,schmidt,27 +11,1,1,4,27,obadiah,obadiah,oconnor,27 +12,3,2,4,90,jenny,jennifer,oconnor,27 +13,3,3,4,27,max,maxwell,oconnor,27 +14,3,3,4,27,tom,thomas,oconnor,27 +15,3,3,4,27,billy,william,oconnor,27 +16,3,3,4,27,margaret,margaret,oconnor,27 +17,3,3,4,27,lucy,lucy,oconnor,27 +18,3,6,4,90,margaret,margaret,pine,27 +19,2,1,4,27,penny,penelope,jones,27 +20,2,3,4,27,laura,laura,jones,27 +21,2,3,4,27,linda,linda,jones,27 +22,2,7,4,27,paula,paula,wayne,27 +23,2,12,4,27,lisa,lisa,price,27 +24,8,1,10,404,john,john,kaplan,27 +25,9,3,4,27,eliza,eliza,kaplan,27 +26,9,3,4,27,anna,anna,kaplan,27 \ No newline at end of file diff --git a/hlink/tests/input_data/raw_df_reporting.csv b/hlink/tests/input_data/raw_df_reporting.csv new file mode 100644 index 0000000..c5ac99f --- /dev/null +++ b/hlink/tests/input_data/raw_df_reporting.csv @@ -0,0 +1,28 @@ +histid,serialp,sex,age,marst,durmarr,sei +0,0,1,54,1,20,96 +1,0,2,55,1,20,96 +2,0,1,15,6,99,96 +3,0,1,80,5,99,96 +4,1,1,25,6,99,65 +5,1,1,26,6,99,70 +6,2,2,72,6,99,35 +7,3,2,42,4,99,28 +8,3,1,16,6,99,28 +9,3,1,13,6,99,28 +10,3,2,20,6,99,14 +11,4,1,45,1,25,28 +12,4,2,44,1,25,28 +13,4,1,24,6,99,0 +14,4,1,22,6,99,0 +15,4,1,20,6,99,0 
+16,4,2,14,6,99,0 +17,4,2,12,6,99,0 +18,4,2,63,5,99,12 +19,5,2,30,2,10,31 +20,5,2,12,6,99,0 +21,5,2,10,6,99,0 +22,5,2,26,3,99,26 +23,5,2,24,2,3,14 +24,6,1,28,5,99,54 +25,6,2,6,6,99,0 +26,6,2,2,6,99,0 \ No newline at end of file diff --git a/hlink/tests/input_data/region.csv b/hlink/tests/input_data/region.csv new file mode 100644 index 0000000..5a1fbba --- /dev/null +++ b/hlink/tests/input_data/region.csv @@ -0,0 +1,161 @@ +region,bpl +6,1 +9,2 +8,4 +7,5 +9,6 +8,8 +1,9 +5,10 +5,11 +5,12 +5,13 +9,15 +8,16 +3,17 +3,18 +4,19 +4,20 +6,21 +7,22 +1,23 +5,24 +1,25 +3,26 +4,27 +6,28 +4,29 +8,30 +4,31 +8,32 +1,33 +2,34 +8,35 +2,36 +5,37 +4,38 +3,39 +7,40 +9,41 +2,42 +1,44 +5,45 +4,46 +6,47 +7,48 +8,49 +1,50 +5,51 +9,53 +5,54 +3,55 +8,56 +10,90 +11,99 +12,100 +12,105 +12,110 +12,115 +12,120 +13,150 +13,155 +13,160 +13,199 +14,200 +14,210 +14,250 +14,260 +14,299 +15,300 +16,400 +16,401 +16,402 +16,403 +16,404 +16,405 +17,410 +17,411 +17,412 +17,413 +17,414 +17,419 +18,420 +18,421 +18,422 +18,423 +18,424 +18,425 +18,426 +18,429 +19,430 +19,431 +19,432 +19,433 +19,434 +19,435 +19,436 +19,437 +19,438 +19,439 +19,440 +20,450 +20,451 +20,452 +20,453 +20,454 +20,455 +20,456 +20,457 +20,458 +20,459 +20,460 +20,461 +20,462 +20,463 +20,465 +20,499 +21,500 +21,501 +21,502 +21,509 +21,510 +21,511 +21,512 +21,513 +21,514 +21,515 +21,516 +21,517 +21,518 +21,519 +21,520 +21,521 +21,522 +21,523 +21,524 +21,530 +21,531 +21,532 +21,533 +21,534 +21,535 +21,536 +21,537 +21,538 +21,539 +21,540 +21,541 +21,542 +21,543 +21,544 +21,545 +21,546 +21,547 +21,548 +21,549 +21,550 +21,599 +22,600 +23,700 +23,710 +24,900 \ No newline at end of file diff --git a/hlink/tests/input_data/regioncode.csv b/hlink/tests/input_data/regioncode.csv new file mode 100644 index 0000000..6d98564 --- /dev/null +++ b/hlink/tests/input_data/regioncode.csv @@ -0,0 +1,52 @@ +region,bpl +6,100 +9,200 +8,400 +7,500 +9,600 +8,800 +1,900 +5,1000 +5,1100 +5,1200 +5,1300 +9,1500 +8,1600 +3,1700 +3,1800 +4,1900 +4,2000 +6,2100 +7,2200 +1,2300 +5,2400 +1,2500 +3,2600 +4,2700 +6,2800 +4,2900 +8,3000 +4,3100 +8,3200 +1,3300 +2,3400 +8,3500 +2,3600 +5,3700 +4,3800 +3,3900 +7,4000 +9,4100 +2,4200 +1,4400 +5,4500 +4,4600 +6,4700 +7,4800 +8,4900 +1,5000 +5,5100 +9,5300 +5,5400 +3,5500 +8,5600 diff --git a/hlink/tests/input_data/rel_rows_test_a.csv b/hlink/tests/input_data/rel_rows_test_a.csv new file mode 100644 index 0000000..405fb9e --- /dev/null +++ b/hlink/tests/input_data/rel_rows_test_a.csv @@ -0,0 +1,28 @@ 
+RECTYPEP,YEARP,SERIALP,PERNUM,PERWTREG,MOMLOC,STEPMOM,MOMRULE_HIST,POPLOC,STEPPOP,POPRULE_HIST,SPLOC,SPRULE_HIST,FAMSIZE,NCHILD,NCHLT5,FAMUNIT,ELDCH,YNGCH,NSIBS,RELATE,AGE,SEX,RACE,MARST,DURMARR,CHBORN,BPL,NATIVITY,CITIZEN,HISPAN,YRSUSA2,SPEAKENG,SCHOOL,LIT,LABFORCE,OCC1950,OCCSCORE,SEI,IND1950,YRSUSA1,QTRUNEMP,QCHSURV,QAGE,QAGEMONT,QBPL,QCHBORN,QCITIZEN,QFBPL,QIND,QMARST,QOCC,QQTRUNEM,QRACE,QRELATE,QSURSIM,QDURMARR,QSCHOOL,QSEX,QYRIMM,RACAMIND,RACASIAN,RACBLK,RACPACIS,RACOTHER,RACWHT,AGEDIFF,RACESING,HISPRULE,PRESGL,ERSCOR50,EDSCOR50,OCCSTR,ISRELATE,SUBFAM,SFTYPE,SFRELATE,YRIMMIG,SLWT,PERWT,BIRTHYR,NAMELAST,NAMEFRST,BPLSTR,FBPLSTR,MBPLSTR,RELSTR,CHSURV,MBPL,FBPL,SCHLMNTH,MOUNEMP,AGEMONTH,BIRTHMO,SURSIM,QBIRTHMO,QLIT,QMBPL,HISTID,IMAGEID,US1900M_1037,US1900M_1038,US1900M_1039,US1900M_1040,US1900M_1041,US1900M_1042,RECTYPE_P,YEAR_P,DATANUM_P,SERIAL_P,NUMPREC_P,SUBSAMP_P,HHWTREG_P,REGION_P,STATEICP_P,STATEFIP_P,SEA_P,METRO_P,METAREA_P,METDIST_P,CITY_P,CITYPOP_P,SIZEPL_P,URBAN_P,URBAREA_P,GQ_P,GQTYPE_P,GQFUNDS_P,FARM_P,OWNERSHP_P,MORTGAGE_P,PAGENO_P,NFAMS_P,NCOUPLES_P,NMOTHERS_P,NFATHERS_P,QMORTGAG_P,QFARM_P,QOWNERSH_P,QGQTYPE_P,NENGPOP_P,URBPOP_P,HHTYPE_P,CNTRY_P,NSUBFAM_P,HEADLOC_P,MULTGEN_P,STCOUNTY_P,APPAL_P,COUNTYICP_P,HHWT_P,STDCITY_P,GQSTR_P,DWELLING_P,MDSTATUS_P,REEL_P,NUMPERHH_P,LINE_P,ENUMDIST_P,STREET_P,QGQFUNDS_P,SPLIT_P,SPLITHID_P,SPLITNUM_P +P,1900,12196421,1,1,0,0,0,0,0,0,2,1,6,4,3,1,5,0,0,101,42,1,100,1,12,0,45400,5,2,0,4,9,2,1,1,999,0,0,0,17,1,0,0,0,0,0,4,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,75,10,0,0,9999,9999, ,1,0,0,0,1883,100,100,1858,HARKBERS ,ANDREW ,HUNGARY ,HUNGARY ,HUNGARY ,HEAD ,0,45400,45400,99,0,99,2,1,0,4,0,5E84734B-1201-45C8-AAA0-805BF52D8E59,4117765_00566 , HUNGARY , HUNGARY , HUNGARY , HUNGARY , HUNGARY , HUNGARY ,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,12196421,2,1,0,0,0,0,0,0,1,1,6,4,3,1,5,0,0,201,32,2,100,1,12,8,45400,5,0,0,1,1,2,4,2,970,20,8,0,2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,4,0,0,1,1,1,1,1,2,85,10,0,175,397,30,LABORER ,1,0,0,0,1886,100,100,1868,HARKBERS ,BARBARA ,HUNGARY ,HUNGARY ,HUNGARY ,WIFE ,5,45400,45400,99,0,99,2,1,0,0,0,2F5E2C8C-362B-45A5-AABC-053555FD27EB,4117765_00566 , HUNGARY , HUNGARY , HUNGARY , HUNGARY , HUNGARY , HUNGARY ,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,12196421,3,1,2,0,1,1,0,1,0,0,6,0,0,1,99,99,3,301,5,2,100,6,99,0,4200,4,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,112,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1894,HARKBERS ,MARY ,PENNSYLVANIA ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,0,99,6,1,0,0,0,13CB6F35-AF10-460D-9714-58FA33C4444A,4117765_00566 , PENNSYLVANIA , PENNSYLVANIA , HUNGARY , HUNGARY , HUNGARY , HUNGARY ,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,12196421,4,1,2,0,1,1,0,1,0,0,6,0,0,1,99,99,3,301,4,1,100,6,99,0,3900,4,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,3,0,4,0,0,1,1,1,1,1,2,113,10,0,0,9999,9999,LABORER ,1,0,0,0,0,100,100,1896,HARKBERS ,ANDREW ,OHIO ,HUNGARY ,HUNGARY ,SON ,0,45400,45400,99,0,99,1,1,0,0,0,D1DAEB8F-66F0-435C-8E45-F004D967549D,4117765_00566 , OHIO , OHIO , HUNGARY , HUNGARY , HUNGARY , HUNGARY 
,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,12196421,5,1,2,0,1,1,0,1,0,0,6,0,0,1,99,99,3,301,2,1,100,6,99,0,3900,4,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,3,0,4,0,0,1,1,1,1,1,2,115,10,0,0,9999,9999,LABORER ,1,0,0,0,0,100,100,1897,HARKBERS ,JOSEPH ,OHIO ,HUNGARY ,HUNGARY ,SON ,0,45400,45400,99,0,99,7,1,0,0,0,D4656BEE-2087-4254-86D4-3ACD54F633CE,4117765_00566 , OHIO , OHIO , HUNGARY , HUNGARY , HUNGARY , HUNGARY ,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,12196421,6,1,2,0,1,1,0,1,0,0,6,0,0,1,99,99,3,301,0,2,100,6,99,0,3900,4,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,3,0,4,0,0,1,1,1,1,1,2,117,10,0,0,9999,9999,LABORER ,1,0,0,0,0,100,100,1900,HARKBERS ,BARBRA ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,0,1,4,1,0,0,0,024C26B8-3E08-47BB-957D-AA509F5C3E93,4117765_00566 , OHIO , OHIO , HUNGARY , HUNGARY , HUNGARY , HUNGARY ,H,1900,4,12196421,6,83,1,21,24,39,331,1,0,0,0,0,1,1,0,1,0,0,1,20,0,566,1,1,1,1,0,0,4,0,0,0,1,840,0,1,21,390990,11,990,100, ,33-01 ,650020,1,1300,6,0,56, ,0,0,613393,6 +P,1900,15964000,1,1,0,0,0,0,0,0,2,1,6,3,2,1,5,0,0,101,24,1,200,1,6,0,5100,1,0,0,0,2,2,1,2,504,29,27,246,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,2,1,1,1,75,20,0,357,703,52,BRICK LAYER ,1,0,0,0,0,100,100,1876,WOOLRIGE ,JOHN ,VIRGINIA ,VIRGINIA ,VIRGINIA ,HEAD ,0,5100,5100,99,6,99,1,1,0,0,0,7FEB0CE8-36FB-4D20-BA8B-5A1ACFDF4861,4117940_00015 , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA ,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,15964000,2,1,6,0,1,0,0,0,1,1,6,3,2,1,5,0,0,201,20,2,200,1,6,5,5100,1,0,0,0,2,1,4,1,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,2,1,1,1,79,20,0,0,9999,9999, ,1,0,0,0,0,100,100,1879,WOOLRIGE ,MARTHA ,VIRGINIA ,VIRGINIA ,VIRGINIA ,WIFE ,2,5100,5100,99,0,99,1,1,0,0,0,3AAEFB36-E878-4101-B637-0168544B10EA,4117940_00015 , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA ,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,15964000,3,1,2,7,1,1,0,1,0,0,6,0,0,1,99,99,2,301,5,2,200,6,99,0,5100,1,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,4,3,0,4,0,0,1,1,2,1,1,1,94,20,0,0,9999,9999, ,1,0,0,0,0,100,100,1895,WOOLRIGE ,MARGARET ,VIRGINIA ,VIRGINIA ,VIRGINIA ,DAUGHTER IN LAW ,0,5100,5100,99,0,99,2,1,0,0,0,3491D58A-C045-4936-957A-39A52B1BF57D,4117940_00015 , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA ,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,15964000,4,1,2,7,1,1,0,1,0,0,6,0,0,1,99,99,2,301,2,1,200,6,99,0,5100,1,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,2,1,1,1,97,20,0,0,9999,9999, ,1,0,0,0,0,100,100,1897,WOOLRIGE ,WILLIE ,VIRGINIA ,VIRGINIA ,VIRGINIA ,SON ,0,5100,5100,99,0,99,12,1,0,0,0,92CB3F50-761E-4213-9FA7-80C3159358F1,4117940_00015 , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA 
,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,15964000,5,1,2,0,1,1,0,1,0,0,6,0,0,1,99,99,2,301,0,1,200,6,99,0,5100,1,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,2,1,1,1,99,20,0,0,9999,9999, ,1,0,0,0,0,100,100,1899,WOOLRIGE ,HENRY ,VIRGINIA ,VIRGINIA ,VIRGINIA ,SON ,0,5100,5100,99,0,11,6,1,0,0,0,07158CE0-20BA-4654-B3F2-939887353E46,4117940_00015 , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA , VIRGINIA ,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,15964000,6,1,0,0,0,0,0,0,0,0,6,1,0,1,20,20,0,601,65,2,200,5,99,7,3700,1,0,0,0,2,2,1,2,781,11,22,826,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,2,1,1,1,34,20,0,419,77,126,NURSE ,1,0,0,0,0,100,100,1835,YARBROUGH ,CHANEY ,NORTH CAROLINA ,NORTH CAROLINA ,NORTH CAROLINA ,MOTHER IN LAW ,7,3700,3700,99,0,99,1,2,0,0,0,8812BF74-7506-4FBE-B87E-5B35E6809F48,4117940_00015 , NORTH CAROLINA , NORTH CAROLINA , NORTH CAROLINA , NORTH CAROLINA , NORTH CAROLINA , NORTH CAROLINA ,H,1900,4,15964000,6,68,1,31,40,51,458,1,0,0,1631,165,6,2,0,1,0,0,2,20,0,15,1,1,2,1,0,0,0,0,0,165,1,840,0,1,31,515900,0,5900,100,DANVILLE , ,316542,1,1734,6,37,104,MONROE ST ,0,0,308143,6 +P,1900,13992947,1,1,0,0,0,0,0,0,2,1,8,6,3,1,12,1,0,101,39,1,100,1,17,0,45300,5,2,0,3,2,2,3,2,100,14,14,105,12,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,4,1,1,1,1,1,2,75,10,0,407,99,46,FARMER ,1,0,0,0,1840,100,100,1861,BOGASKI ,ANDREW ,GERMANY ,GERMANY ,GERMANY ,HEAD ,0,45300,45300,99,99,99,2,1,0,0,0,17D1219A-C41C-426C-A31C-11F9FE054A79,4115233_00682 , GERMANY , GERMANY , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,2,1,0,0,0,0,0,0,1,1,8,6,3,1,12,1,0,201,70,2,100,1,17,9,45300,5,0,0,3,2,1,4,1,999,0,0,0,12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,44,10,0,0,9999,9999, ,1,0,0,0,1888,100,100,1859,BOGASKI ,KATE ,GERMANY ,GERMANY ,GERMANY ,WIFE ,7,45300,45300,99,0,99,9,1,0,0,0,1E994888-C3A1-4636-90AA-5F54884CDEE5,4115233_00682 , GERMANY , GERMANY , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,3,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,12,1,100,6,99,0,4200,4,0,0,0,2,2,4,0,983,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,1,1,1,2,102,10,0,0,9999,9999,AT SCHOOL ,1,0,0,0,0,100,100,1887,BOGASKI ,JACK ,PENNSYLVANIA ,GERMANY ,GERMANY ,SON ,0,45300,45300,6,99,99,7,1,0,0,0,8B0A8FA5-A260-4841-95D0-2C45689485C8,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,4,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,8,2,100,6,99,0,4200,4,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,1,1,1,1,1,2,106,10,0,0,9999,9999,AT SCHOOL ,1,0,0,0,0,100,100,1892,BOGASKI ,MARY ,PENNSYLVANIA ,GERMANY ,GERMANY ,DAUGHTER ,0,45300,45300,6,0,99,2,1,0,0,0,DF3C81BC-DE20-41C0-9A66-C809E550BACD,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , 
GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,5,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,6,2,100,6,99,0,4200,4,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,1,1,1,2,108,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1893,BOGASKI ,SADIE ,PENNSYLVANIA ,GERMANY ,GERMANY ,DAUGHTER ,0,45300,45300,6,0,99,7,1,0,0,0,B8FDB3A5-F773-4C65-AE5F-C2590CC301AD,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,6,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,4,1,100,6,99,0,4200,4,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,110,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1895,BOGASKI ,WALTER ,PENNSYLVANIA ,GERMANY ,GERMANY ,SON ,0,45300,45300,99,0,99,6,1,0,0,0,DF0E8D3E-7CAF-4F3F-8FE5-4D5A75DE939F,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,7,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,3,1,100,6,99,0,4200,4,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,111,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1897,BOGASKI ,BURT ,PENNSYLVANIA ,GERMANY ,GERMANY ,SON ,0,45300,45300,99,0,99,4,1,0,0,0,F24EA20E-AE8F-4D0F-B1F0-882352B55582,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,13992947,8,1,2,1,1,1,0,1,0,0,8,0,0,1,99,99,5,301,1,1,100,6,99,0,4200,4,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,113,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1898,BOGASKI ,JOSEPH ,PENNSYLVANIA ,GERMANY ,GERMANY ,SON ,0,45300,45300,99,0,99,9,1,0,0,0,C48A84C5-CFAC-48B8-BFB8-D7D8F97B3C58,4115233_00682 , PENNSYLVANIA , PENNSYLVANIA , GERMANY , GERMANY , GERMANY , GERMANY ,H,1900,4,13992947,8,41,1,12,14,42,364,1,0,0,0,0,1,1,0,1,0,0,2,10,1,682,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,421170,11,1170,100, ,5471 ,1262583,1,1489,8,31,129, ,0,0,1201533,8 +P,1900,5125386,1,1,0,0,0,0,0,0,2,1,7,5,2,1,12,2,0,101,30,1,100,1,13,0,2100,1,0,0,0,2,1,4,2,100,14,14,105,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,1,1,1,2,75,10,0,407,99,46,FARMER ,1,0,0,0,0,100,100,1869,MCFARLAND ,JAMES ,KENTUCKY ,KENTUCKY ,KENTUCKY ,HEAD ,0,2100,2100,0,0,99,6,1,0,0,0,8AD15C8E-5651-493F-8B30-DA1E31B15E0F,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,2,1,0,0,0,0,0,0,1,1,7,5,2,1,12,2,0,201,27,2,100,1,13,6,2100,1,0,0,0,2,2,4,1,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,78,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1872,MCFARLAND ,MARNIE ,KENTUCKY ,KENTUCKY ,KENTUCKY ,WIFE ,6,2100,2100,99,0,99,9,1,0,0,0,D88C312F-1B51-4BD7-819D-0D8A4C4840B3,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY 
,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,3,1,2,0,1,1,0,1,0,0,7,0,0,1,99,99,4,301,12,1,100,6,99,0,2100,1,0,0,0,2,2,4,0,983,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,93,10,0,0,9999,9999,AT SCHOOL ,1,0,0,0,0,100,100,1888,MCFARLAND ,JOHN J ,KENTUCKY ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,99,99,3,1,0,0,0,5319B61A-6589-4BFD-912A-4FEEDCBB1ACD,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,4,1,2,0,1,1,0,1,0,0,7,0,0,1,99,99,4,301,9,2,100,6,99,0,2100,1,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,96,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1890,MCFARLAND ,SUSIE G ,KENTUCKY ,KENTUCKY ,KENTUCKY ,DAUGHTER ,0,2100,2100,99,0,99,8,1,0,0,0,8FEEB01D-9769-4339-B6A5-E9B063A03B6E,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,5,1,2,0,1,1,0,1,0,0,7,0,0,1,99,99,4,301,6,2,100,6,99,0,2100,1,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,99,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1893,MCFARLAND ,MANDIE S ,KENTUCKY ,KENTUCKY ,KENTUCKY ,DAUGHTER ,0,2100,2100,99,0,99,11,1,0,0,0,B3441F1C-545C-4ADF-9FDD-9B9D078669DF,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,6,1,2,0,1,1,0,1,0,0,7,0,0,1,99,99,4,301,4,1,100,6,99,0,2100,1,0,0,0,0,1,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,101,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1895,MCFARLAND ,DAVID S ,KENTUCKY ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,0,99,12,1,0,0,0,D32ACE19-38AE-47E6-9B21-96813893EE40,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 +P,1900,5125386,7,1,2,0,1,1,0,1,0,0,7,0,0,1,99,99,4,301,2,1,100,6,99,0,2100,1,0,0,0,0,2,0,0,999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,1,1,1,1,1,2,103,10,0,0,9999,9999, ,1,0,0,0,0,100,100,1898,MCFARLAND ,SHERMAN ,KENTUCKY ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,0,99,5,1,0,0,0,90A2D28D-628D-421E-A6FB-22F56A4D24B7,4118943_00237 , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY , KENTUCKY ,H,1900,4,5125386,7,64,1,32,51,21,162,1,0,0,0,0,1,1,0,1,0,0,2,20,0,237,1,1,1,1,0,0,0,0,0,0,1,840,0,1,21,212350,20,2350,100, , ,441416,1,555,7,8,138, ,0,0,428557,7 diff --git a/hlink/tests/input_data/rel_rows_test_b.csv b/hlink/tests/input_data/rel_rows_test_b.csv new file mode 100644 index 0000000..e4b8db9 --- /dev/null +++ b/hlink/tests/input_data/rel_rows_test_b.csv @@ -0,0 +1,34 @@ 
+RECTYPEP,YEARP,DATANUMP,SERIALP,PERNUM,SLWTREG,PERWTREG,MOMLOC,STEPMOM,MOMRULE_HIST,POPLOC,STEPPOP,POPRULE_HIST,SPLOC,SPRULE_HIST,FAMSIZE,NCHILD,NCHLT5,FAMUNIT,ELDCH,YNGCH,NSIBS,RELATE,AGE,SEX,RACE,MARST,DURMARR,CHBORN,BPL,NATIVITY,CITIZEN,HISPAN,YRSUSA2,MTONGUE,LANGUAGE,SPEAKENG,SCHOOL,LIT,EMPSTAT,LABFORCE,OCC1950,OCCSCORE,SEI,IND1950,CLASSWKR,YRSUSA1,WKSUNEMP,VETCIVWR,QCHSURV,QAGE,QBPL,QCHBORN,QCITIZEN,QCLASSWK,QLANGUAG,QFBPL,QEMPSTAT,QIND,QMARST,QMTONGUE,QOCC,QRACE,QRELATE,QSURSIM,QDURMARR,QSCHOOL,QSEX,QSPEAKEN,QYRIMM,RACAMIND,RACASIAN,RACBLK,RACPACIS,RACOTHER,RACWHT,AGEDIFF,RACESING,PROBWHT,PROBOTH,PROBBLK,PROBAPI,PROBAI,HISPRULE,PRESGL,ERSCOR50,EDSCOR50,NPBOSS50,OCCSTR,ISRELATE,SUBFAM,SFTYPE,SFRELATE,YRIMMIG,SLWT,PERWT,BIRTHYR,MTONGSTR,NAMELAST,NAMEFRST,INDSTR,BPLSTR,FBPLSTR,MBPLSTR,RELSTR,CHSURV,MBPL,FBPL,AGEMONTH,BLIND,DEAF,SURSIM,QLIT,QMBPL,HISTID,PID,IMGMISS,RECTYPE_P,YEAR_P,DATANUM_P,SERIAL_P,NUMPREC_P,SUBSAMP_P,HHWTREG_P,REGION_P,STATEICP_P,STATEFIP_P,SEA_P,METRO_P,METAREA_P,METDIST_P,CITY_P,CITYPOP_P,SIZEPL_P,URBAN_P,URBAREA_P,GQ_P,FARM_P,OWNERSHP_P,MORTGAGE_P,PAGENO_P,NFAMS_P,NCOUPLES_P,NMOTHERS_P,NFATHERS_P,QMORTGAG_P,QFARM_P,QOWNERSH_P,URBPOP_P,HHTYPE_P,CNTRY_P,NSUBFAM_P,HEADLOC_P,MULTGEN_P,COUNTYNHG_P,YRSTCOUNTY_P,STCOUNTY_P,APPAL_P,COUNTYICP_P,HHWT_P,STDCITY_P,DWELLING_P,MDSTATUS_P,REEL_P,LINE_P,ENUMDIST_P,STREET_P,SPLIT_P,SPLITHID_P,SPLITNUM_P +P,1910,4,19786583,1,0,1,0,0,0,0,0,0,0,0,3,2,0,1,11,9,0,101,33,1,200,5,99,0,5100,1,0,0,0,0,100,2,1,1,10,2,504,29,27,246,20,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,2,1,1,1,75,20,0,0,100,0,0,0,357,703,52,594,BRICK MASON ,1,0,0,0,0,100,100,1877,ENGLISH ,WOODRIDGE ,JOHN ,BULID HOUSES ETC ,VIRGINIA ,VIRGINIA ,VIRGINIA ,HEAD ,0,5100,5100,99,1,1,1,0,0,B9EFED60-4FE7-4693-9138-2951350242D5, 0029141854,0,H,1910,4,19786583,3,46,1,31,40,51,458,1,0,0,1631,190,6,2,0,1,1,20,0,1035,1,0,0,1,0,0,0,190,2,840,0,1,21,5105900,1910405900,515900,0,5900,100,DANVILLE ,131935,1,1626,19,22,SPRUCE ST ,0,131935,3 +P,1910,4,19786583,2,0,1,0,0,0,1,0,1,0,0,3,0,0,1,99,99,1,301,11,1,200,6,99,0,5100,1,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,2,1,1,1,97,20,0,0,100,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1899,ENGLISH ,WOODRIDGE ,WILL , ,VIRGINIA ,VIRGINIA ,NORTH CAROLINA ,SON ,0,3700,5100,99,1,1,1,0,0,3495FB90-CEE7-4356-AD3C-BBAE1AEB6350, 0179387488,0,H,1910,4,19786583,3,46,1,31,40,51,458,1,0,0,1631,190,6,2,0,1,1,20,0,1035,1,0,0,1,0,0,0,190,2,840,0,1,21,5105900,1910405900,515900,0,5900,100,DANVILLE ,131935,1,1626,19,22,SPRUCE ST ,0,131935,3 +P,1910,4,19786583,3,0,1,0,0,0,1,0,1,0,0,3,0,0,1,99,99,1,301,9,1,200,6,99,0,5100,1,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,2,1,1,1,99,20,0,0,100,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1901, ,WOODRIDGE ,HENRY , ,VIRGINIA ,VIRGINIA ,NORTH CAROLINA ,SON ,0,3700,5100,99,1,1,1,0,0,E1E8A2D7-EFD8-46DB-A2CC-FFA0216BABEF, 0179387489,0,H,1910,4,19786583,3,46,1,31,40,51,458,1,0,0,1631,190,6,2,0,1,1,20,0,1035,1,0,0,1,0,0,0,190,2,840,0,1,21,5105900,1910405900,515900,0,5900,100,DANVILLE ,131935,1,1626,19,22,SPRUCE ST ,0,131935,3 +P,1910,4,15166087,1,0,1,0,0,0,0,0,0,2,1,10,8,1,1,15,1,0,101,44,1,100,1,20,0,45400,5,2,0,5,100,100,2,1,4,10,2,970,20,8,336,20,26,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,75,10,100,0,0,0,0,0,175,397,30,251,LABOR ,1,0,0,0,1884,100,100,1866,ENGLISH ,HARKABUS ,ANDY ,STEEL PLANT ,HUNGARY ,HUNGARY ,HUNGARY ,HEAD ,0,45400,45400,99,1,1,1,0,0,0BFBF9FC-CACE-4550-BD70-AB7659E768E5, 
0022207981,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,2,0,1,0,0,0,0,0,0,1,1,10,8,1,1,15,1,0,201,37,2,100,1,20,11,45400,5,0,0,4,100,100,2,1,4,30,1,999,0,0,0,0,16,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,4,1,1,1,1,1,2,82,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,1894,100,100,1873,ENGLISH ,HARKALUS ,BARBA , ,HUNGARY ,HUNGARY ,HUNGARY ,WIFE ,9,45400,45400,99,1,1,2,0,0,52B64CF7-C318-4CB5-A378-CB7A9A157B37, 0142274335,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,3,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,15,2,100,6,99,0,3900,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,104,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1895,ENGLISH ,HARKALUS ,MARIA , ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,1,1,2,0,0,678AA2C9-BB27-4C3E-BF42-7CDF3D03CAEC, 0142274336,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,4,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,14,1,100,6,99,0,3900,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,105,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1896,ENGLISH ,HARKALUS ,ANDY , ,OHIO ,HUNGARY ,HUNGARY ,SON ,0,45400,45400,99,1,1,2,0,0,B04F6A33-9A86-4EAF-884B-0BD6107CCDEB, 0142274337,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,5,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,13,1,100,6,99,0,3900,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,106,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1897,ENGLISH ,HARKALUS ,JOSEPH , ,OHIO ,HUNGARY ,HUNGARY ,SON ,0,45400,45400,99,1,1,2,0,0,B438AAED-790F-4DF4-B082-8B6705AB91DD, 0142274338,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,6,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,10,2,100,6,99,0,3900,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,109,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1900,ENGLISH ,HARKALUS ,BERTHA , ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,1,1,2,0,0,7A997689-C6C0-46BA-B35F-E5EC4622531A, 0142274339,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,7,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,8,1,100,6,99,0,3900,4,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,3,0,4,0,0,0,1,1,1,1,1,2,111,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1902, ,HARKALUS ,ALBERT , ,OHIO ,HUNGARY ,HUNGARY ,SON ,0,45400,45400,99,1,1,2,0,0,B22B9B79-A2BC-4FEB-9C65-E27C754B5344, 
0142274340,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,8,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,7,2,100,6,99,0,3900,4,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,3,0,4,0,0,0,1,1,1,1,1,2,112,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1903, ,HARKALUS ,KATE , ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,1,1,2,0,0,E5DBBD72-9FFB-44E4-8259-864A0938D3E5, 0142274341,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,9,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,5,2,100,6,99,0,3900,4,0,0,0,0,100,0,1,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,3,0,4,0,0,0,1,1,1,1,1,2,114,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1905, ,HARKALUS ,JULIA , ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,1,1,2,0,0,811BEAA6-B8FD-4A98-BDA9-FFBC82FF2E8C, 0142274342,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,15166087,10,0,1,2,0,1,1,7,1,0,0,10,0,0,1,99,99,7,301,1,2,100,6,99,0,3900,4,0,0,0,0,0,0,1,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,118,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1909, ,HARKALUS ,ANNIE , ,OHIO ,HUNGARY ,HUNGARY ,DAUGHTER ,0,45400,45400,99,1,1,2,0,0,C20243A9-C264-4E93-B419-40D6C81EEB08, 0142274343,0,H,1910,4,15166087,10,60,1,21,24,39,331,2,9320,9320,7630,790,9,2,9320,1,1,20,0,516,1,1,1,1,0,0,0,790,1,840,0,1,21,3900990,1910240990,390990,11,990,100,YOUNGSTOWN ,822099,2,1212,42,148,WILLINGTON AVE ,0,822099,10 +P,1910,4,5395204,1,0,1,0,0,0,0,0,0,2,1,10,8,2,1,15,0,0,101,37,1,100,1,17,0,2100,1,0,0,0,0,100,2,1,4,10,2,100,14,14,105,12,0,99,9,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,75,10,100,0,0,0,0,0,407,99,46,153,FARMER ,1,0,0,0,0,100,100,1873,ENGLISH ,MCFARLAND ,MILLARD F ,GENERAL FARM ,KENTUCKY ,KENTUCKY ,KENTUCKY ,HEAD ,0,2100,2100,99,1,1,1,0,0,611B180F-F74E-43DF-9AB1-7CF6DA48136E, 0006925919,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,2,0,1,0,0,0,0,0,0,1,1,10,8,2,1,15,0,0,201,36,2,100,1,17,9,2100,1,0,0,0,0,100,2,1,4,30,1,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,76,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1874,ENGLISH ,MCFARLAND ,LIZZIE , ,KENTUCKY ,KENTUCKY ,KENTUCKY ,WIFE ,9,2100,2100,99,1,1,1,4,0,6F046204-F5A1-4440-B97D-4DCACCAAF6AB, 0193447904,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,3,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,15,1,100,6,99,0,2100,1,0,0,0,0,100,2,2,4,10,0,830,4,17,105,20,0,99,9,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,97,10,100,0,0,0,0,0,184,6,25,122,FARM LABORER ,1,0,0,0,0,100,100,1895,ENGLISH ,MCFARLAND ,DAVIS ,HOME FARM ,KENTUCKY ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,1,1,1,0,0,1E9CABAC-B9B6-4409-B59F-6317F2D85423, 
0193447905,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,4,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,13,2,100,6,99,0,2100,1,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,99,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1897,ENGLISH ,MCFARLAND ,FRANCIS ELLEN , ,KENTUCKY ,KENTUCKY ,KENTUCKY ,DAUGHTER ,0,2100,2100,99,1,1,1,0,0,4B952E6B-0106-4A18-B368-FA72994C1B81, 0193447906,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,5,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,11,2,100,6,99,0,1800,1,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,101,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1899,ENGLISH ,MCFARLAND ,ANNA , ,INDIANA ,INDIANA ,INDIANA ,DAUGHTER ,0,1800,1800,99,1,1,1,0,0,FCD8A7C7-56A5-4499-B93D-BC2B07DFD16D, 0193447907,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,6,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,9,2,100,6,99,0,1800,1,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,103,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1901,ENGLISH ,MCFARLAND ,MARY E , ,INDIANA ,INDIANA ,INDIANA ,DAUGHTER ,0,1800,1800,99,1,1,1,0,0,F9F02BC2-4472-4A8F-AF58-6ACFA90A6B44, 0193447908,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,7,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,7,1,100,6,99,0,1800,1,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,0,0,1,1,1,1,1,2,105,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1903,ENGLISH ,MCFARLAND ,LEONARD , ,INDIANA ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,1,1,1,0,0,95B1D9D6-4F3A-4265-B2AF-80E0C0BAEA05, 0193447909,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,8,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,5,1,100,6,99,0,2100,1,0,0,0,0,100,0,1,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,4,0,0,0,1,1,1,1,1,2,107,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1905, ,MCFARLAND ,WILLIAM PARK , ,KENTUCKY ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,1,1,1,0,0,4195466C-541B-440F-B235-6D5BC331D865, 0193447910,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,9,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,2,1,100,6,99,0,1800,1,0,0,0,0,0,0,1,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,110,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1908, ,MCFARLAND ,ERNEST , ,INDIANA ,KENTUCKY ,KENTUCKY ,SON ,0,2100,2100,99,1,1,1,0,0,4F23719B-7351-4FDD-BE55-504518DEE5B1, 
0193447911,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,5395204,10,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,0,2,100,6,99,0,1800,1,0,0,0,0,0,0,1,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,112,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1910, ,MCFARLAND ,REBECCA , ,INDIANA ,KENTUCKY ,KENTUCKY ,DAUGHTER ,0,2100,2100,9,1,1,1,0,0,B397876C-5FFB-4BF2-877E-4B7C5801D1DA, 0193447912,0,H,1910,4,5395204,10,92,1,21,22,18,123,1,0,0,0,0,1,1,0,1,2,10,1,382,1,1,1,1,0,0,0,0,1,840,0,1,21,1801550,1910221550,181550,0,1550,100, ,545150,1,381,87,155,STONE COLLEGE ROAD ,0,545150,10 +P,1910,4,16919897,1,0,1,0,0,0,0,0,0,2,1,10,8,0,1,21,7,0,101,49,1,100,1,27,0,45500,5,2,0,5,2100,2100,1,1,4,10,2,650,24,10,216,20,23,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,75,10,100,0,0,0,0,0,263,542,33,357,MINER ,1,0,0,0,1887,100,100,1861,POLISH ,BOGACZYK ,ANDREW ,DIGGING COAL ,POLAND ,POLAND ,POLAND ,HEAD ,0,45500,45500,99,1,1,1,0,0,D700674C-3114-4B5A-823E-87BDCEA6ACC8, 0025914073,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,2,0,1,0,0,0,0,0,0,1,1,10,8,0,1,21,7,0,201,49,2,100,1,27,10,4200,4,0,0,0,0,2100,1,1,4,30,1,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,75,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1861,POLISH ,BOGACZYK ,KATE , ,PENNSYLVANIA ,POLAND ,POLAND ,WIFE ,9,45500,45500,99,1,1,1,0,0,AEA59E0E-8066-4A28-81B4-5EE25BE2CBA3, 0129052002,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,3,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,21,1,100,6,99,0,4200,4,0,0,0,0,100,2,1,4,10,2,650,24,10,216,20,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,103,10,100,0,0,0,0,0,263,542,33,357,MINER ,1,0,0,0,0,100,100,1889,ENGLISH ,BOGACZYK ,JACOB ,DIGGING COAL ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,F7E0450D-ECCC-4338-92B0-ACB4F9D40D8F, 0129052003,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,4,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,18,2,100,6,99,0,4200,4,0,0,0,0,100,2,1,4,30,1,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,106,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1892,ENGLISH ,BOGACZYK ,MARY , ,PENNSYLVANIA ,POLAND ,POLAND ,DAUGHTER ,0,45500,45500,99,1,1,1,0,0,5E6C358C-6F45-48FA-B2F6-D9487AA174B9, 0129052004,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,5,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,16,1,100,6,99,0,4200,4,0,0,0,0,100,2,1,4,10,2,650,24,10,216,20,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,108,10,100,0,0,0,0,0,263,542,33,357,MINER ,1,0,0,0,0,100,100,1894,ENGLISH ,BOGACZYK ,VINCENT ,DIGGING COAL ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,B3C172F3-D25F-43BE-8D42-9AA4712D815A, 
0129052005,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,6,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,14,1,100,6,99,0,4200,4,0,0,0,0,100,2,2,4,10,0,650,24,10,216,20,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,110,10,100,0,0,0,0,0,263,542,33,357,MINER ,1,0,0,0,0,100,100,1896,ENGLISH ,BOGACZYK ,WALTER ,DIGGING COAL ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,1089B6CF-082B-4750-8677-FE9CE56687B5, 0129052006,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,7,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,13,1,100,6,99,0,4200,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,111,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1897,ENGLISH ,BOGACZYK ,KOSTIA , ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,F1DB36F5-158E-4EEF-A352-3ACA051DF087, 0129052007,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,8,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,11,1,100,6,99,0,4200,4,0,0,0,0,100,2,2,4,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,113,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1899,ENGLISH ,BOGACZYK ,JOSEPH , ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,75A37E2C-48E4-461C-937F-27A2DD8A6354, 0129052008,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,9,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,9,1,100,6,99,0,4200,4,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,115,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1901, ,BOGACZYK ,CHARLES , ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,4BEE543A-50A3-4FAE-A402-18C8D48806A4, 0129052009,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 +P,1910,4,16919897,10,0,1,2,0,1,1,0,1,0,0,10,0,0,1,99,99,7,301,7,1,100,6,99,0,4200,4,0,0,0,0,100,0,2,0,30,0,999,0,0,0,0,0,99,9,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,1,1,2,117,10,100,0,0,0,0,0,0,9999,9999,9999, ,1,0,0,0,0,100,100,1903, ,BOGACZYK ,FERDINAND , ,PENNSYLVANIA ,POLAND ,POLAND ,SON ,0,45500,45500,99,1,1,1,0,0,EFD47F5C-1285-4218-AB16-5756612EEB98, 0129052010,0,H,1910,4,16919897,10,19,1,12,14,42,364,1,0,0,0,0,1,1,0,1,2,10,1,507,1,1,1,1,0,0,0,0,1,840,0,1,22,4201170,1910141170,421170,11,1170,100, ,850031,1,1422,13,140, ,0,850031,10 diff --git a/hlink/tests/input_data/reporting_hh_predicted_matches.csv b/hlink/tests/input_data/reporting_hh_predicted_matches.csv new file mode 100644 index 0000000..fa73910 --- /dev/null +++ b/hlink/tests/input_data/reporting_hh_predicted_matches.csv @@ -0,0 +1,10 @@ +histid_a,prediction +1,1 +12,1 +15,1 +17,1 +18,1 +19,1 +29,1 +30,1 +33,1 \ No newline at end of file diff --git a/hlink/tests/input_data/reporting_predicted_matches.csv b/hlink/tests/input_data/reporting_predicted_matches.csv new file mode 100644 index 0000000..0b354c5 --- /dev/null +++ 
b/hlink/tests/input_data/reporting_predicted_matches.csv @@ -0,0 +1,15 @@ +histid_a,prediction +0,1 +2,1 +7,1 +9,1 +10,1 +11,1 +14,1 +16,1 +21,1 +24,1 +25,1 +26,1 +31,1 +32,1 \ No newline at end of file diff --git a/hlink/tests/input_data/reporting_prepped_df_a.csv b/hlink/tests/input_data/reporting_prepped_df_a.csv new file mode 100644 index 0000000..776d75d --- /dev/null +++ b/hlink/tests/input_data/reporting_prepped_df_a.csv @@ -0,0 +1,35 @@ +serialp,histid,linked_round_hardcoded +0,0,1 +0,1,2 +0,2,1 +0,3,0 +1,4,0 +1,5,0 +1,6,0 +2,7,1 +3,8,0 +4,9,1 +4,10,1 +4,11,1 +5,12,2 +5,13,0 +5,14,1 +5,15,2 +6,16,1 +6,17,2 +6,18,2 +6,19,2 +7,20,0 +7,21,1 +7,22,0 +7,23,0 +8,24,1 +8,25,1 +8,26,1 +8,27,0 +8,28,0 +9,29,2 +9,30,2 +9,31,1 +9,32,1 +9,33,2 \ No newline at end of file diff --git a/hlink/tests/input_data/representivity.csv b/hlink/tests/input_data/representivity.csv new file mode 100644 index 0000000..c7a3381 --- /dev/null +++ b/hlink/tests/input_data/representivity.csv @@ -0,0 +1,92 @@ +feature,values,us1900 all count,us1900 all percent,us1900 round 1 count,us1900 round 1 percent,us1900 round 2 count,us1900 round 2 percent,us1910 all count,us1910 all percent,us1910 round 1 count,us1910 round 1 percent,us1910 round 2 count,us1910 round 2 percent +Total count,,27,,10,,10,,27,,10,,10, + +presence of children over the age of 10 in the household,0,6,0.2222222222222222,2,0.2,2,0.2,6,0.2222222222222222,2,0.2,2,0.2 +presence of children over the age of 10 in the household,1-2,13,0.48148148148148145,5,0.5,5,0.5,13,0.48148148148148145,5,0.5,5,0.5 +presence of children over the age of 10 in the household,3-5,8,0.2962962962962963,3,0.3,3,0.3,8,0.2962962962962963,3,0.3,3,0.3 + +sex,1,12,0.4444444444444444,7,0.7,4,0.4,12,0.4444444444444444,7,0.7,4,0.4 +sex,2,15,0.5555555555555556,3,0.3,6,0.6,15,0.5555555555555556,3,0.3,6,0.6 + +age,0,2,0.07407407407407407,0,0.0,2,0.2,2,0.07407407407407407,0,0.0,2,0.2 +age,10,7,0.25925925925925924,3,0.3,4,0.4,7,0.25925925925925924,3,0.3,4,0.4 +age,20,9,0.3333333333333333,3,0.3,2,0.2,9,0.3333333333333333,3,0.3,2,0.2 +age,30,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +age,40,3,0.1111111111111111,1,0.1,1,0.1,3,0.1111111111111111,1,0.1,1,0.1 +age,50,2,0.07407407407407407,1,0.1,1,0.1,2,0.07407407407407407,1,0.1,1,0.1 +age,60,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +age,70,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +age,80,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 + +race,1,11,0.4074074074074074,5,0.5,4,0.4,11,0.4074074074074074,5,0.5,4,0.4 +race,2,6,0.2222222222222222,2,0.2,1,0.1,6,0.2222222222222222,2,0.2,1,0.1 +race,3,7,0.25925925925925924,2,0.2,3,0.3,7,0.25925925925925924,2,0.2,3,0.3 +race,8,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +race,9,2,0.07407407407407407,0,0.0,2,0.2,2,0.07407407407407407,0,0.0,2,0.2 + +relationship to household head,1,7,0.25925925925925924,5,0.5,1,0.1,7,0.25925925925925924,5,0.5,1,0.1 +relationship to household head,11,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +relationship to household head,12,2,0.07407407407407407,0,0.0,0,0.0,2,0.07407407407407407,0,0.0,0,0.0 +relationship to household head,2,2,0.07407407407407407,0,0.0,1,0.1,2,0.07407407407407407,0,0.0,1,0.1 +relationship to household head,3,12,0.4444444444444444,4,0.4,8,0.8,12,0.4444444444444444,4,0.4,8,0.8 +relationship to household head,5,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +relationship to 
household head,6,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +relationship to household head,7,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 + +marst,1,4,0.14814814814814814,2,0.2,1,0.1,4,0.14814814814814814,2,0.2,1,0.1 +marst,2,2,0.07407407407407407,1,0.1,0,0.0,2,0.07407407407407407,1,0.1,0,0.0 +marst,3,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +marst,4,1,0.037037037037037035,0,0.0,1,0.1,1,0.037037037037037035,0,0.0,1,0.1 +marst,5,3,0.1111111111111111,2,0.2,0,0.0,3,0.1111111111111111,2,0.2,0,0.0 +marst,6,16,0.5925925925925926,5,0.5,8,0.8,16,0.5925925925925926,5,0.5,8,0.8 + +married,0,21,0.7777777777777778,7,0.7,9,0.9,21,0.7777777777777778,7,0.7,9,0.9 +married,1,6,0.2222222222222222,3,0.3,1,0.1,6,0.2222222222222222,3,0.3,1,0.1 + +"married, by age",20,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +"married, by age",30,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +"married, by age",40,2,0.07407407407407407,1,0.1,0,0.0,2,0.07407407407407407,1,0.1,0,0.0 +"married, by age",50,2,0.07407407407407407,1,0.1,1,0.1,2,0.07407407407407407,1,0.1,1,0.1 +"married, by age",null,21,0.7777777777777778,7,0.7,9,0.9,21,0.7777777777777778,7,0.7,9,0.9 + +marriage duration at least ten years,0,22,0.8148148148148148,7,0.7,9,0.9,22,0.8148148148148148,7,0.7,9,0.9 +marriage duration at least ten years,1,5,0.18518518518518517,3,0.3,1,0.1,5,0.18518518518518517,3,0.3,1,0.1 + +"marriage duration at least 10 years, by age",30,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +"marriage duration at least 10 years, by age",40,2,0.07407407407407407,1,0.1,0,0.0,2,0.07407407407407407,1,0.1,0,0.0 +"marriage duration at least 10 years, by age",50,2,0.07407407407407407,1,0.1,1,0.1,2,0.07407407407407407,1,0.1,1,0.1 +"marriage duration at least 10 years, by age",null,22,0.8148148148148148,7,0.7,9,0.9,22,0.8148148148148148,7,0.7,9,0.9 + +region of residence,2,7,0.25925925925925924,3,0.3,2,0.2,7,0.25925925925925924,3,0.3,2,0.2 +region of residence,4,20,0.7407407407407407,7,0.7,8,0.8,20,0.7407407407407407,7,0.7,8,0.8 + +region of birth,10,2,0.07407407407407407,2,0.2,0,0.0,2,0.07407407407407407,2,0.2,0,0.0 +region of birth,2,4,0.14814814814814814,2,0.2,1,0.1,4,0.14814814814814814,2,0.2,1,0.1 +region of birth,3,1,0.037037037037037035,0,0.0,1,0.1,1,0.037037037037037035,0,0.0,1,0.1 +region of birth,4,20,0.7407407407407407,6,0.6,8,0.8,20,0.7407407407407407,6,0.6,8,0.8 + +socioeconomic status,0,12,0.4444444444444444,3,0.3,6,0.6,12,0.4444444444444444,3,0.3,6,0.6 +socioeconomic status,15,6,0.2222222222222222,2,0.2,2,0.2,6,0.2222222222222222,2,0.2,2,0.2 +socioeconomic status,30,2,0.07407407407407407,1,0.1,0,0.0,2,0.07407407407407407,1,0.1,0,0.0 +socioeconomic status,45,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +socioeconomic status,60,2,0.07407407407407407,1,0.1,0,0.0,2,0.07407407407407407,1,0.1,0,0.0 +socioeconomic status,90,4,0.14814814814814814,2,0.2,2,0.2,4,0.14814814814814814,2,0.2,2,0.2 + +lifetime migrant,0,21,0.7777777777777778,8,0.8,9,0.9,21,0.7777777777777778,8,0.8,9,0.9 +lifetime migrant,1,6,0.2222222222222222,2,0.2,1,0.1,6,0.2222222222222222,2,0.2,1,0.1 + +lifetime migrant by age,20,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +lifetime migrant by age,40,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +lifetime migrant by age,50,1,0.037037037037037035,0,0.0,1,0.1,1,0.037037037037037035,0,0.0,1,0.1 +lifetime migrant 
by age,60,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +lifetime migrant by age,70,1,0.037037037037037035,0,0.0,0,0.0,1,0.037037037037037035,0,0.0,0,0.0 +lifetime migrant by age,80,1,0.037037037037037035,1,0.1,0,0.0,1,0.037037037037037035,1,0.1,0,0.0 +lifetime migrant by age,null,21,0.7777777777777778,8,0.8,9,0.9,21,0.7777777777777778,8,0.8,9,0.9 + +namefrst_unstd commonality,1-5,27,1.0,10,1.0,10,1.0,27,1.0,10,1.0,10,1.0 + +namefrst_std commonality,1-5,27,1.0,10,1.0,10,1.0,27,1.0,10,1.0,10,1.0 + +namelast_clean commonality,1-5,20,0.7407407407407407,7,0.7,7,0.7,20,0.7407407407407407,7,0.7,7,0.7 +namelast_clean commonality,6-20,7,0.25925925925925924,3,0.3,3,0.3,7,0.25925925925925924,3,0.3,3,0.3 + diff --git a/hlink/tests/input_data/scored_matches_household_test.csv b/hlink/tests/input_data/scored_matches_household_test.csv new file mode 100644 index 0000000..f6aee0f --- /dev/null +++ b/hlink/tests/input_data/scored_matches_household_test.csv @@ -0,0 +1,7 @@ +histid_a,namefrst_a,namelast_a,serialp_a,histid_b,namefrst_b,namelast_b,serialp_b,prediction +1000A,Albert,Johnson,1,1000B,Albert,Johnson,7,1 +1000A,Albert,Johnson,1,1002B,John,Johnson,7,0 +1001B,Mary,Johnson,1,1001B,Mary,Johnson,7,1 +1002A,Steve,Johnson,1,1003B,Steve,Johnson,7,1 +1002A,Steve,Johnson,1,1007B,Steve,Johnson,8,1 +1003A,Robert,Johnson,1,1004B,Robert,Johnson,8,1 \ No newline at end of file diff --git a/hlink/tests/input_data/scored_matches_test_data.csv b/hlink/tests/input_data/scored_matches_test_data.csv new file mode 100644 index 0000000..2778991 --- /dev/null +++ b/hlink/tests/input_data/scored_matches_test_data.csv @@ -0,0 +1,7 @@ +histid_a,histid_b,prediction +A001,B001,1 +A002,B001,1 +A003,B002,0 +A004,B003,1 +A005,B004,0 +A006,B004,0 \ No newline at end of file diff --git a/hlink/tests/input_data/sql_condition_marst_warn_a.csv b/hlink/tests/input_data/sql_condition_marst_warn_a.csv new file mode 100644 index 0000000..1628aa2 --- /dev/null +++ b/hlink/tests/input_data/sql_condition_marst_warn_a.csv @@ -0,0 +1,37 @@ +id,marst,key_marst_warn +0,1,0 +1,1,0 +2,1,0 +3,1,0 +4,1,0 +5,1,1 +6,2,0 +7,2,0 +8,2,0 +9,2,0 +10,2,0 +11,2,1 +12,3,0 +13,3,0 +14,3,0 +15,3,0 +16,3,0 +17,3,1 +18,4,1 +19,4,1 +20,4,0 +21,4,0 +22,4,0 +23,4,1 +24,5,1 +25,5,1 +26,5,0 +27,5,0 +28,5,0 +29,5,1 +30,6,1 +31,6,1 +32,6,1 +33,6,1 +34,6,1 +35,6,0 \ No newline at end of file diff --git a/hlink/tests/input_data/sql_condition_marst_warn_b.csv b/hlink/tests/input_data/sql_condition_marst_warn_b.csv new file mode 100644 index 0000000..55aae15 --- /dev/null +++ b/hlink/tests/input_data/sql_condition_marst_warn_b.csv @@ -0,0 +1,37 @@ +id,marst +36,1 +37,2 +38,3 +39,4 +40,5 +41,6 +42,1 +43,2 +44,3 +45,4 +46,5 +47,6 +48,1 +49,2 +50,3 +51,4 +52,5 +53,6 +54,1 +55,2 +56,3 +57,4 +58,5 +59,6 +60,1 +61,2 +62,3 +63,4 +64,5 +65,6 +66,1 +67,2 +68,3 +69,4 +70,5 +71,6 \ No newline at end of file diff --git a/hlink/tests/input_data/statedist.csv b/hlink/tests/input_data/statedist.csv new file mode 100644 index 0000000..6cac27c --- /dev/null +++ b/hlink/tests/input_data/statedist.csv @@ -0,0 +1,2602 @@ +dist,statecode1,statecode2 +0,100,100 +3485.3999,100,200 +1427.1,100,400 +354.29999,100,500 +1867.4,100,600 +1128.2,100,800 +985.59998,100,900 +769.70001,100,1000 +695.40002,100,1100 +396.29999,100,1200 +199,100,1300 +4276.2002,100,1500 +1688.1,100,1600 +524.70001,100,1700 +495.89999,100,1800 +740.29999,100,1900 +759.70001,100,2000 +342.60001,100,2100 +318.89999,100,2200 +1304.4,100,2300 +712.20001,100,2400 +1061.5,100,2500 +838.59998,100,2600 
+1015.6,100,2700 +163.60001,100,2800 +499.70001,100,2900 +1551.2,100,3000 +936.40002,100,3100 +1716.1,100,3200 +1117.9,100,3300 +845.79999,100,3400 +1114.4,100,3500 +931.5,100,3600 +481.5,100,3700 +1241.9,100,3800 +577.59998,100,3900 +639.20001,100,4000 +1970.6,100,4100 +752.09998,100,4200 +1039.5,100,4400 +354.60001,100,4500 +1082.3,100,4600 +217.3,100,4700 +733.29999,100,4800 +1453.9,100,4900 +1092.7,100,5000 +567.59998,100,5100 +2029.8,100,5300 +537,100,5400 +835.79999,100,5500 +1327.3,100,5600 +3485.3999,200,100 +0,200,200 +2667.5,200,400 +3187.2,200,500 +2280.8999,200,600 +2556.3,200,800 +3412.8,200,900 +3477.1001,200,1000 +3434.1001,200,1100 +3875.2,200,1200 +3597.7,200,1300 +3010.8999,200,1500 +1989.5,200,1600 +2991.6001,200,1700 +3091.7,200,1800 +2747.8999,200,1900 +2798.2,200,2000 +3256.3999,200,2100 +3443.8999,200,2200 +3321.6001,200,2300 +3442.2,200,2400 +3418.2,200,2500 +2839.8,200,2600 +2497.3,200,2700 +3401.8,200,2800 +2988.2,200,2900 +1984.5,200,3000 +2582.6001,200,3100 +2223.6001,200,3200 +3330.8999,200,3300 +3437.8,200,3400 +2806.3,200,3500 +3252.5,200,3600 +3561.6001,200,3700 +2243.8,200,3800 +3173.6001,200,3900 +2993.3,200,4000 +1842.3,200,4100 +3296.5,200,4200 +3448.5,200,4400 +3604.7,200,4500 +2410.1001,200,4600 +3319.6001,200,4700 +3185,200,4800 +2359.2,200,4900 +3276.8,200,5000 +3462,200,5100 +1646.1,200,5300 +3337.8,200,5400 +2725.8999,200,5500 +2268.7,200,5600 +1427.1,400,100 +2667.5,400,200 +0,400,400 +1089.6,400,500 +489.39999,400,600 +472.60001,400,800 +2159.7,400,900 +2019.1,400,1000 +1935,400,1100 +1761.7,400,1200 +1623.7,400,1300 +2854.6001,400,1500 +715.40002,400,1600 +1297.5,400,1700 +1445,400,1800 +1120.8,400,1900 +792.09998,400,2000 +1485.9,400,2100 +1174.4,400,2200 +2375.1001,400,2300 +1953.3,400,2400 +2224.8999,400,2500 +1556.1,400,2600 +1236.6,400,2700 +1267.2,400,2800 +1101.9,400,2900 +893.20001,400,3000 +817.40002,400,3100 +449.5,400,3200 +2224.1001,400,3300 +2059.2,400,3400 +312.89999,400,3500 +2018.8,400,3600 +1834.7,400,3700 +1082.4,400,3800 +1636.5,400,3900 +804.90002,400,4000 +828.90002,400,4100 +1894.3,400,4200 +2222.8,400,4400 +1753.9,400,4500 +930.5,400,4600 +1429.3,400,4700 +740.09998,400,4800 +354.10001,400,4900 +2171.7,400,5000 +1848.8,400,5100 +1025,400,5300 +1741.1,400,5400 +1365.8,400,5500 +645,400,5600 +354.29999,500,100 +3187.2,500,200 +1089.6,500,400 +0,500,500 +1517.1,500,600 +774.09998,500,800 +1161.1,500,900 +978.70001,500,1000 +895,500,1100 +738.29999,500,1200 +541.09998,500,1300 +3944.1001,500,1500 +1340.4,500,1600 +402.5,500,1700 +483.60001,500,1800 +499,500,1900 +412.39999,500,2000 +438.20001,500,2100 +281.5,500,2200 +1442.6,500,2300 +913.59998,500,2400 +1234.6,500,2500 +775,500,2600 +794.40002,500,2700 +221.2,500,2800 +238.7,500,2900 +1223.6,500,3000 +608.59998,500,3100 +1362.2,500,3200 +1264.9,500,3300 +1036.8,500,3400 +777.59998,500,3500 +1059,500,3600 +752.29999,500,3700 +961.70001,500,3800 +654.29999,500,3900 +288.5,500,4000 +1619.2,500,4100 +897,500,4200 +1222.1,500,4400 +664.29999,500,4500 +778.5,500,4600 +349.70001,500,4700 +462,500,4800 +1099.7,500,4900 +1225.3,500,5000 +788.40002,500,5100 +1687.3,500,5300 +703.40002,500,5400 +687.90002,500,5500 +983.09998,500,5600 +1867.4,600,100 +2280.8999,600,200 +489.39999,600,400 +1517.1,600,500 +0,600,600 +773.09998,600,800 +2487.8,600,900 +2378.3999,600,1000 +2296.7,600,1100 +2223.1001,600,1200 +2058.2,600,1300 +2470,600,1500 +557.70001,600,1600 +1645,600,1700 +1797.4,600,1800 +1421.6,600,1900 +1156,600,2000 +1871.1,600,2100 +1640.7,600,2200 +2657.3,600,2300 +2314.3,600,2400 
+2545.7,600,2500 +1830.2,600,2600 +1443.6,600,2700 +1713.7,600,2800 +1476.8,600,2900 +850.70001,600,3000 +1094.3,600,3100 +219,600,3200 +2527.3,600,3300 +2405.3,600,3400 +774.09998,600,3500 +2331.8,600,3600 +2234.7,600,3700 +1202.1,600,3800 +1982.1,600,3900 +1229,600,4000 +474.79999,600,4100 +2232.8,600,4200 +2550,600,4400 +2172.6001,600,4500 +1124.8,600,4600 +1836.3,600,4700 +1219.1,600,4800 +453.60001,600,4900 +2470.3999,600,5000 +2227.6001,600,5100 +711.5,600,5300 +2108.8,600,5400 +1630.9,600,5500 +750.90002,600,5600 +1128.2,800,100 +2556.3,800,200 +472.60001,800,400 +774.09998,800,500 +773.09998,800,600 +0,800,800 +1725.6,800,900 +1606.9,800,1000 +1524.7,800,1100 +1507.7,800,1200 +1309.3,800,1300 +3240.6001,800,1500 +595,800,1600 +873.5,800,1700 +1025.7,800,1800 +665.09998,800,1900 +385.39999,800,2000 +1099.1,800,2100 +956.79999,800,2200 +1919.7,800,2300 +1542.4,800,2400 +1786.8,800,2500 +1094.1,800,2600 +764.29999,800,2700 +987.20001,800,2800 +704.90002,800,2900 +594.29999,800,3000 +348.10001,800,3100 +595.20001,800,3200 +1776.9,800,3300 +1636.8,800,3400 +316.60001,800,3500 +1575.7,800,3600 +1464.7,800,3700 +637.20001,800,3800 +1212.4,800,3900 +499.5,800,4000 +853,800,4100 +1466,800,4200 +1788.3,800,4400 +1410.6,800,4500 +464.39999,800,4600 +1071.2,800,4700 +629.20001,800,4800 +330.20001,800,4900 +1722.2,800,5000 +1454.5,800,5100 +954.5,800,5300 +1335.9,800,5400 +899.79999,800,5500 +295.89999,800,5600 +985.59998,900,100 +3412.8,900,200 +2159.7,900,400 +1161.1,900,500 +2487.8,900,600 +1725.6,900,800 +0,900,900 +228.39999,900,1000 +291.29999,900,1100 +1057.2,900,1200 +852.29999,900,1300 +4955.8999,900,1500 +2102.2,900,1600 +862.29999,900,1700 +717.29999,900,1800 +1066.2,900,1900 +1367.7,900,2000 +723.59998,900,2100 +1290.4,900,2200 +333.60001,900,2300 +275.60001,900,2400 +76,900,2500 +687.29999,900,2600 +1112.3,900,2700 +1112.8,900,2800 +1064.9,900,2900 +1844.5,900,3000 +1393.5,900,3100 +2286.8,900,3200 +156.2,900,3300 +142.89999,900,3400 +1870.2,900,3500 +172.39999,900,3600 +541.09998,900,3700 +1416,900,3800 +525.59998,900,3900 +1393.3,900,4000 +2399.6001,900,4100 +268.10001,900,4200 +63.099998,900,4400 +692,900,4500 +1396.3,900,4600 +831.70001,900,4700 +1623.1,900,4800 +2034.2,900,4900 +171.5,900,5000 +421.89999,900,5100 +2354.5,900,5300 +462.39999,900,5400 +879.59998,900,5500 +1768.6,900,5600 +769.70001,1000,100 +3477.1001,1000,200 +2019.1,1000,400 +978.70001,1000,500 +2378.3999,1000,600 +1606.9,1000,800 +228.39999,1000,900 +0,1000,1000 +84.599998,1000,1100 +830,1000,1200 +626.5,1000,1300 +4847.5,1000,1500 +2034.4,1000,1600 +733.5,1000,1700 +581.20001,1000,1800 +969.29999,1000,1900 +1233.3,1000,2000 +543.29999,1000,2100 +1081.3,1000,2200 +561.90002,1000,2300 +66,1000,2400 +301.39999,1000,2500 +661.70001,1000,2600 +1075.2,1000,2700 +905.90002,1000,2800 +917.29999,1000,2900 +1798.1,1000,3000 +1292.2,1000,3100 +2186.2,1000,3200 +380.29999,1000,3300 +87.5,1000,3400 +1719.7,1000,3500 +271,1000,3600 +313,1000,3700 +1380.5,1000,3800 +398.39999,1000,3900 +1231,1000,4000 +2334.1001,1000,4100 +182.60001,1000,4200 +274,1000,4400 +464,1000,4500 +1327.1,1000,4600 +635.70001,1000,4700 +1438,1000,4800 +1926.8,1000,4900 +378.29999,1000,5000 +202.2,1000,5100 +2312.6001,1000,5300 +279.39999,1000,5400 +829.79999,1000,5500 +1686,1000,5600 +695.40002,1100,100 +3434.1001,1100,200 +1935,1100,400 +895,1100,500 +2296.7,1100,600 +1524.7,1100,800 +291.29999,1100,900 +84.599998,1100,1000 +0,1100,1100 +785.20001,1100,1200 +562.70001,1100,1300 +4765.2998,1100,1500 +1959.3,1100,1600 +651.79999,1100,1700 
+499.29999,1100,1800 +891.5,1100,1900 +1149.9,1100,2000 +459,1100,2100 +1003.7,1100,2200 +619.20001,1100,2300 +18.6,1100,2400 +367,1100,2500 +604.40002,1100,2600 +1009.7,1100,2700 +827.29999,1100,2800 +833.29999,1100,2900 +1727.7,1100,3000 +1212.7,1100,3100 +2105.8,1100,3200 +433.5,1100,3300 +150.39999,1100,3400 +1635.2,1100,3500 +286.79999,1100,3600 +259.89999,1100,3700 +1313.8,1100,3800 +320.29999,1100,3900 +1146.4,1100,4000 +2259,1100,4100 +144.7,1100,4200 +344.20001,1100,4400 +407.79999,1100,4500 +1253.2,1100,4600 +553.59998,1100,4700 +1354.9,1100,4800 +1845.7,1100,4900 +421.20001,1100,5000 +131,1100,5100 +2242.3,1100,5300 +194.8,1100,5400 +763.20001,1100,5500 +1608.8,1100,5600 +396.29999,1200,100 +3875.2,1200,200 +1761.7,1200,400 +738.29999,1200,500 +2223.1001,1200,600 +1507.7,1200,800 +1057.2,1200,900 +830,1200,1000 +785.20001,1200,1100 +0,1200,1200 +294.5,1200,1300 +4581,1200,1500 +2078.6001,1200,1600 +891.29999,1200,1700 +821.09998,1200,1800 +1127.3,1200,1900 +1149.9,1200,2000 +648.90002,1200,2100 +587.70001,1200,2200 +1390.6,1200,2300 +795.79999,1200,2400 +1127,1200,2500 +1146.1,1200,2600 +1389.9,1200,2700 +520.59998,1200,2800 +895,1200,2900 +1947.4,1200,3000 +1332.3,1200,3100 +2085.7,1200,3200 +1210.3,1200,3300 +917.5,1200,3400 +1453.9,1200,3500 +1068.6,1200,3600 +525.59998,1200,3700 +1631.7,1200,3800 +826.40002,1200,3900 +1009.8,1200,4000 +2356.8,1200,4100 +898.20001,1200,4200 +1095.1,1200,4400 +385.39999,1200,4500 +1477.4,1200,4600 +560.79999,1200,4700 +1029.8,1200,4800 +1827.4,1200,4900 +1204.8,1200,5000 +662.59998,1200,5100 +2423.5,1200,5300 +711.59998,1200,5400 +1187.4,1200,5500 +1719.9,1200,5600 +199.04657,1300,100 +3597.6848,1300,200 +1623.6844,1300,400 +541.13342,1300,500 +2058.2034,1300,600 +1309.2714,1300,800 +852.29999,1300,900 +626.5,1300,1000 +562.70001,1300,1100 +294.5,1300,1200 +0,1300,1300 +4474.3999,1300,1500 +1854.4,1300,1600 +606.5,1300,1700 +527.20001,1300,1800 +854,1300,1900 +932,1300,2000 +355,1300,2100 +507.29999,1300,2200 +1181.9,1300,2300 +577.20001,1300,2400 +927,1300,2500 +852.29999,1300,2600 +1104.4,1300,2700 +362.60001,1300,2800 +643.29999,1300,2900 +1697.5,1300,3000 +1090.4,1300,3100 +1901.3,1300,3200 +995.90002,1300,3300 +709.5,1300,3400 +1310.8,1300,3500 +828.40002,1300,3600 +317.39999,1300,3700 +1358.5,1300,3800 +539.40002,1300,3900 +829.40002,1300,4000 +2142.2,1300,4100 +649.90002,1300,4200 +900.5,1300,4400 +171.60001,1300,4500 +1217.3,1300,4600 +278.79999,1300,4700 +931.70001,1300,4800 +1637.6,1300,4900 +979.59998,1300,5000 +431.79999,1300,5100 +2188.3999,1300,5300 +444.60001,1300,5400 +895.20001,1300,5500 +1491,1300,5600 +4276.1738,1500,100 +3010.8923,1500,200 +2854.572,1500,400 +3944.0979,1500,500 +2470.0222,1500,600 +3240.6233,1500,800 +4955.8999,1500,900 +4847.5,1500,1000 +4765.2998,1500,1100 +4581,1500,1200 +4474.3999,1500,1300 +0,1500,1500 +2887.8,1500,1600 +4114,1500,1700 +4266.2998,1500,1800 +3890.3,1500,1900 +3618.1001,1500,2000 +4330.7002,1500,2100 +3999.7,1500,2200 +5105.2002,1500,2300 +4783,1500,2400 +5011.8999,1500,2500 +4287,1500,2600 +3879.6001,1500,2700 +4113.8999,1500,2800 +3938.3999,1500,2900 +3180,1500,3000 +3563,1500,3100 +2669.1001,1500,3200 +4987.3999,1500,3300 +4875.2998,1500,3400 +3166.6001,1500,3500 +4796.8999,1500,3600 +4687,1500,3700 +3600.6001,1500,3800 +4452,1500,3900 +3659,1500,4000 +2608.2,1500,4100 +4702.7002,1500,4200 +5017.6001,1500,4400 +4608.3999,1500,4500 +3571.3,1500,4600 +4282.3999,1500,4700 +3555.8999,1500,4800 +2923.1001,1500,4900 +4929.1001,1500,5000 +4691.2998,1500,5100 +2729.3999,1500,5300 
+4575.7002,1500,5400 +4088,1500,5500 +3193.7,1500,5600 +1688.1068,1600,100 +1989.4573,1600,200 +715.35425,1600,400 +1340.4075,1600,500 +557.74451,1600,600 +594.96918,1600,800 +2102.2,1600,900 +2034.4,1600,1000 +1959.3,1600,1100 +2078.6001,1600,1200 +1854.4,1600,1300 +2887.8,1600,1500 +0,1600,1600 +1328.5,1600,1700 +1475.1,1600,1800 +1071.2,1600,1900 +928.90002,1600,2000 +1590.2,1600,2100 +1547,1600,2200 +2220.8,1600,2300 +1975.1,1600,2400 +2150.2,1600,2500 +1417.3,1600,2600 +998,1600,2700 +1560.1,1600,2800 +1215.6,1600,2900 +305.60001,1600,3000 +770.70001,1600,3100 +358.89999,1600,3200 +2114.1001,1600,3300 +2043.7,1600,3400 +817,1600,3500 +1935.1,1600,3600 +1957,1600,3700 +712.79999,1600,3800 +1639.3,1600,3900 +1084.6,1600,4000 +299.70001,1600,4100 +1868.8,1600,4200 +2161.2,1600,4400 +1931,1600,4500 +708.29999,1600,4600 +1593.4,1600,4700 +1215.9,1600,4800 +375,1600,4900 +2054.3999,1600,5000 +1917.1,1600,5100 +360.10001,1600,5300 +1788.3,1600,5400 +1222.5,1600,5500 +364,1600,5600 +524.67432,1700,100 +2991.637,1700,200 +1297.4592,1700,400 +402.45465,1700,500 +1644.9725,1700,600 +873.53247,1700,800 +862.29999,1700,900 +733.5,1700,1000 +651.79999,1700,1100 +891.29999,1700,1200 +606.5,1700,1300 +4114,1700,1500 +1328.5,1700,1600 +0,1700,1700 +152.5,1700,1800 +263.70001,1700,1900 +505.60001,1700,2000 +273.20001,1700,2100 +655.90002,1700,2200 +1098.6,1700,2300 +669.29999,1700,2400 +928.5,1700,2500 +372.60001,1700,2600 +498.70001,1700,2700 +513.20001,1700,2800 +214.3,1700,2900 +1127.6,1700,3000 +565.79999,1700,3100 +1454.8,1700,3200 +934.90002,1700,3300 +764.79999,1700,3400 +1009.9,1700,3500 +726.90002,1700,3600 +630.79999,1700,3700 +757.90002,1700,3800 +340.20001,1700,3900 +551,1700,4000 +1626.8,1700,4100 +597.09998,1700,4200 +925.5,1700,4400 +627.90002,1700,4500 +640.59998,1700,4600 +330.60001,1700,4700 +824.20001,1700,4800 +1194,1700,4900 +886.5,1700,5000 +591.59998,1700,5100 +1635.8,1700,5300 +466.60001,1700,5400 +314,1700,5500 +969.59998,1700,5600 +495.94901,1800,100 +3091.7444,1800,200 +1444.968,1800,400 +483.57211,1800,500 +1797.3762,1800,600 +1025.666,1800,800 +717.29999,1800,900 +581.20001,1800,1000 +499.29999,1800,1100 +821.09998,1800,1200 +527.20001,1800,1300 +4266.2998,1800,1500 +1475.1,1800,1600 +152.5,1800,1700 +0,1800,1800 +404.60001,1800,1900 +654.59998,1800,2000 +172.2,1800,2100 +697.70001,1800,2200 +970.20001,1800,2300 +516.90002,1800,2400 +785.70001,1800,2500 +342.70001,1800,2600 +595.90002,1800,2700 +532.70001,1800,2800 +347.70001,1800,2900 +1264.6,1800,3000 +716.70001,1800,3100 +1607.1,1800,3200 +799.70001,1800,3300 +614.70001,1800,3400 +1152.9,1800,3500 +591.09998,1800,3600 +493.39999,1800,3700 +877.79999,1800,3800 +191.89999,1800,3900 +680.09998,1800,4000 +1774.1,1800,4100 +450.10001,1800,4200 +780.40002,1800,4400 +513.20001,1800,4500 +779.20001,1800,4600 +279.5,1800,4700 +933.5,1800,4800 +1346.5,1800,4900 +754.70001,1800,5000 +442.20001,1800,5100 +1775.9,1800,5300 +315.20001,1800,5400 +370.5,1800,5500 +1118.3,1800,5600 +740.27533,1900,100 +2747.9326,1900,200 +1120.7531,1900,400 +498.96304,1900,500 +1421.6421,1900,600 +665.11847,1900,800 +1066.2,1900,900 +969.29999,1900,1000 +891.5,1900,1100 +1127.3,1900,1200 +854,1900,1300 +3890.3,1900,1500 +1071.2,1900,1600 +263.70001,1900,1700 +404.60001,1900,1800 +0,1900,1900 +356.70001,1900,2000 +536,1900,2100 +780.40002,1900,2200 +1255.9,1900,2300 +907.90002,1900,2400 +1124.7,1900,2500 +436.89999,1900,2600 +295.5,1900,2700 +681.5,1900,2800 +262.70001,1900,2900 +863.90002,1900,3000 +327.39999,1900,3100 +1222.1,1900,3200 +1111.9,1900,3300 
+987.79999,1900,3400 +863.20001,1900,3500 +912,1900,3600 +893.59998,1900,3700 +505,1900,3800 +571.29999,1900,3900 +496.60001,1900,4000 +1370.6,1900,4100 +813.70001,1900,4200 +1128.3,1900,4400 +889.79999,1900,4500 +376.89999,1900,4600 +575.29999,1900,4700 +801.20001,1900,4800 +968.09998,1900,4900 +1057.1,1900,5000 +846,1900,5100 +1372.8,1900,5300 +717.09998,1900,5400 +259.5,1900,5500 +717.29999,1900,5600 +759.68787,2000,100 +2798.1772,2000,200 +792.11206,2000,400 +412.44397,2000,500 +1155.9771,2000,600 +385.35938,2000,800 +1367.7,2000,900 +1233.3,2000,1000 +1149.9,2000,1100 +1149.9,2000,1200 +932,2000,1300 +3618.1001,2000,1500 +928.90002,2000,1600 +505.60001,2000,1700 +654.59998,2000,1800 +356.70001,2000,1900 +0,2000,2000 +715.09998,2000,2100 +646.5,2000,2200 +1589.8,2000,2300 +1167.9,2000,2400 +1432.9,2000,2500 +788.09998,2000,2600 +580.59998,2000,2700 +633.29999,2000,2800 +320.89999,2000,2900 +819.90002,2000,3000 +223.60001,2000,3100 +980.20001,2000,3200 +1434,2000,3300 +1269.3,2000,3400 +513.29999,2000,3500 +1227.8,2000,3600 +1079.6,2000,3700 +627.79999,2000,3800 +845.29999,2000,3900 +207,2000,4000 +1211.5,2000,4100 +1102.7,2000,4200 +1430.8,2000,4400 +1026.5,2000,4500 +422,2000,4600 +686.59998,2000,4700 +490.70001,2000,4800 +715.29999,2000,4900 +1382.9,2000,5000 +1073.4,2000,5100 +1274.9,2000,5300 +958.29999,2000,5400 +616.09998,2000,5500 +570.70001,2000,5600 +342.63089,2100,100 +3256.3992,2100,200 +1485.9325,2100,400 +438.22516,2100,500 +1871.059,2100,600 +1099.0764,2100,800 +723.59998,2100,900 +543.29999,2100,1000 +459,2100,1100 +648.90002,2100,1200 +355,2100,1300 +4330.7002,2100,1500 +1590.2,2100,1600 +273.20001,2100,1700 +172.2,2100,1800 +536,2100,1900 +715.09998,2100,2000 +0,2100,2100 +592.29999,2100,2200 +1015.2,2100,2300 +477.60001,2100,2400 +797.70001,2100,2500 +505.29999,2100,2600 +759.20001,2100,2700 +415.79999,2100,2800 +394.29999,2100,2900 +1399.1,2100,3000 +820.59998,2100,3100 +1691.4,2100,3200 +833.5,2100,3300 +598.70001,2100,3400 +1181.7,2100,3500 +631.70001,2100,3600 +368.70001,2100,3700 +1029.8,2100,3800 +242.60001,2100,3900 +689.40002,2100,4000 +1886.6,2100,4100 +462.29999,2100,4200 +784.20001,2100,4400 +354.79999,2100,4500 +912.09998,2100,4600 +129.7,2100,4700 +900,2100,4800 +1427.3,2100,4900 +798,2100,5000 +363,2100,5100 +1904.8,2100,5300 +265.60001,2100,5400 +541.40002,2100,5500 +1228,2100,5600 +318.93018,2200,100 +3443.9321,2200,200 +1174.4142,2200,400 +281.46832,2200,500 +1640.6642,2200,600 +956.75464,2200,800 +1290.4,2200,900 +1081.3,2200,1000 +1003.7,2200,1100 +587.70001,2200,1200 +507.29999,2200,1300 +3999.7,2200,1500 +1547,2200,1600 +655.90002,2200,1700 +697.70001,2200,1800 +780.40002,2200,1900 +646.5,2200,2000 +592.29999,2200,2100 +0,2200,2200 +1599.6,2200,2300 +1021.3,2200,2400 +1366.3,2200,2500 +1021.9,2200,2600 +1075.8,2200,2700 +178.5,2200,2800 +519.09998,2200,2900 +1465.9,2200,3000 +862,2200,3100 +1513.7,2200,3200 +1414.8,2200,3300 +1153.6,2200,3400 +868.40002,2200,3500 +1219.1,2200,3600 +800.09998,2200,3700 +1235.3,2200,3800 +833.70001,2200,3900 +463.29999,2200,4000 +1809.1,2200,4100 +1043.5,2200,4200 +1346.8,2200,4400 +671.70001,2200,4500 +1044.8,2200,4600 +467.20001,2200,4700 +444,2200,4800 +1262.1,2200,4900 +1384.2,2200,5000 +879.40002,2200,5100 +1902.9,2200,5300 +831.29999,2200,5400 +958.29999,2200,5500 +1203.9,2200,5600 +1304.3502,2300,100 +3321.6035,2300,200 +2375.082,2300,400 +1442.6398,2300,500 +2657.343,2300,600 +1919.7101,2300,800 +333.60001,2300,900 +561.90002,2300,1000 +619.20001,2300,1100 +1390.6,2300,1200 +1181.9,2300,1300 
+5105.2002,2300,1500 +2220.8,2300,1600 +1098.6,2300,1700 +970.20001,2300,1800 +1255.9,2300,1900 +1589.8,2300,2000 +1015.2,2300,2100 +1599.6,2300,2200 +0,2300,2300 +605.09998,2300,2400 +264.39999,2300,2500 +828.29999,2300,2600 +1225.6,2300,2700 +1421.1,2300,2800 +1311.5,2300,2900 +1939.9,2300,3000 +1574.6,2300,3100 +2447.1001,2300,3200 +186.7,2300,3300 +475.29999,2300,3400 +2102.3,2300,3500 +384.10001,2300,3600 +873.90002,2300,3700 +1510.9,2300,3800 +788.59998,2300,3900 +1647.8,2300,4000 +2509.8999,2300,4100 +556.09998,2300,4200 +298.89999,2300,4400 +1024.5,2300,4500 +1536.6,2300,4600 +1133.8,2300,4700 +1901.1,2300,4800 +2207.7,2300,4900 +217.5,2300,5000 +750.09998,2300,5100 +2432.3999,2300,5300 +769.40002,2300,5400 +1026.9,2300,5500 +1914.8,2300,5600 +712.22675,2400,100 +3442.1838,2400,200 +1953.3309,2400,400 +913.61102,2400,500 +2314.252,2400,600 +1542.416,2400,800 +275.60001,2400,900 +66,2400,1000 +18.6,2400,1100 +795.79999,2400,1200 +577.20001,2400,1300 +4783,2400,1500 +1975.1,2400,1600 +669.29999,2400,1700 +516.90002,2400,1800 +907.90002,2400,1900 +1167.9,2400,2000 +477.60001,2400,2100 +1021.3,2400,2200 +605.09998,2400,2300 +0,2400,2400 +351,2400,2500 +615.29999,2400,2600 +1022.9,2400,2700 +845,2400,2800 +851.59998,2400,2900 +1742.2,2400,3000 +1229.7,2400,3100 +2123,2400,3200 +419.89999,2400,3300 +133.8,2400,3400 +1653.7,2400,3500 +279.79999,2400,3600 +271.39999,2400,3700 +1327.4,2400,3800 +336.60001,2400,3900 +1165,2400,4000 +2274.8,2400,4100 +148.39999,2400,4200 +327.39999,2400,4400 +420.39999,2400,4500 +1268.6,2400,4600 +571.90002,2400,4700 +1373.4,2400,4800 +1863.1,2400,4900 +409.39999,2400,5000 +146.3,2400,5100 +2256.8999,2400,5300 +213.39999,2400,5400 +776.70001,2400,5500 +1625.1,2400,5600 +1061.5129,2500,100 +3418.1528,2500,200 +2224.9414,2500,400 +1234.623,2500,500 +2545.7285,2500,600 +1786.7737,2500,800 +76,2500,900 +301.39999,2500,1000 +367,2500,1100 +1127,2500,1200 +927,2500,1300 +5011.8999,2500,1500 +2150.2,2500,1600 +928.5,2500,1700 +785.70001,2500,1800 +1124.7,2500,1900 +1432.9,2500,2000 +797.70001,2500,2100 +1366.3,2500,2200 +264.39999,2500,2300 +351,2500,2400 +0,2500,2500 +733,2500,2600 +1156,2500,2700 +1188.6,2500,2800 +1133.4,2500,2900 +1887.5,2500,3000 +1451.7,2500,3100 +2342.8999,2500,3200 +104.7,2500,3300 +217.60001,2500,3400 +1937.7,2500,3500 +215.5,2500,3600 +614.40002,2500,3700 +1457.5,2500,3800 +594.5,2500,3900 +1463.6,2500,4000 +2446.5,2500,4100 +339,2500,4200 +38.599998,2500,4400 +765.40002,2500,4500 +1446.8,2500,4600 +907,2500,4700 +1696.5,2500,4800 +2092.3,2500,4900 +144.5,2500,5000 +497.29999,2500,5100 +2395,2500,5300 +537.70001,2500,5400 +928.70001,2500,5500 +1821.2,2500,5600 +838.58295,2600,100 +2839.7705,2600,200 +1556.0894,2600,400 +775.03918,2600,500 +1830.234,2600,600 +1094.0647,2600,800 +687.29999,2600,900 +661.70001,2600,1000 +604.40002,2600,1100 +1146.1,2600,1200 +852.29999,2600,1300 +4287,2600,1500 +1417.3,2600,1600 +372.60001,2600,1700 +342.70001,2600,1800 +436.89999,2600,1900 +788.09998,2600,2000 +505.29999,2600,2100 +1021.9,2600,2200 +828.29999,2600,2300 +615.29999,2600,2400 +733,2600,2500 +0,2600,2600 +425.20001,2600,2700 +866.70001,2600,2800 +569.20001,2600,2900 +1157.4,2600,3000 +747.29999,2600,3100 +1621.8,2600,3200 +700.5,2600,3300 +648,2600,3400 +1299,2600,3500 +518.20001,2600,3600 +728.70001,2600,3700 +730,2600,3800 +340.5,2600,3900 +891.59998,2600,4000 +1713.6,2600,4100 +480.60001,2600,4200 +745,2600,4400 +799.70001,2600,4500 +716.29999,2600,4600 +621.70001,2600,4700 +1182.4,2600,4800 +1379.6,2600,4900 +642.09998,2600,5000 
+622.29999,2600,5100 +1668.4,2600,5300 +500.5,2600,5400 +199.39999,2600,5500 +1093.3,2600,5600 +1015.6068,2700,100 +2497.2527,2700,200 +1236.5819,2700,400 +794.43103,2700,500 +1443.6023,2700,600 +764.29999,2700,800 +1112.3,2700,900 +1075.2,2700,1000 +1009.7,2700,1100 +1389.9,2700,1200 +1104.4,2700,1300 +3879.6001,2700,1500 +998,2700,1600 +498.70001,2700,1700 +595.90002,2700,1800 +295.5,2700,1900 +580.59998,2700,2000 +759.20001,2700,2100 +1075.8,2700,2200 +1225.6,2700,2300 +1022.9,2700,2400 +1156,2700,2500 +425.20001,2700,2600 +0,2700,2700 +972,2700,2800 +557.5,2700,2900 +732.20001,2700,3000 +431.89999,2700,3100 +1228.9,2700,3200 +1116.3,2700,3300 +1069.6,2700,3400 +1030.8,2700,3500 +942.29999,2700,3600 +1079.7,2700,3700 +306,2700,3800 +705.40002,2700,3900 +760.59998,2700,4000 +1291.8,2700,4100 +898.09998,2700,4200 +1169.4,2700,4400 +1108.9,2700,4500 +320.20001,2700,4600 +829.20001,2700,4700 +1063.4,2700,4800 +1004.5,2700,4900 +1056.5,2700,5000 +1000.4,2700,5100 +1243.5,2700,5300 +869.59998,2700,5400 +246.7,2700,5500 +693.90002,2700,5600 +163.56023,2800,100 +3401.8125,2800,200 +1267.2089,2800,400 +221.20421,2800,500 +1713.7136,2800,600 +987.19226,2800,800 +1112.8,2800,900 +905.90002,2800,1000 +827.29999,2800,1100 +520.59998,2800,1200 +362.60001,2800,1300 +4113.8999,2800,1500 +1560.1,2800,1600 +513.20001,2800,1700 +532.70001,2800,1800 +681.5,2800,1900 +633.29999,2800,2000 +415.79999,2800,2100 +178.5,2800,2200 +1421.1,2800,2300 +845,2800,2400 +1188.6,2800,2500 +866.70001,2800,2600 +972,2800,2700 +0,2800,2800 +421.79999,2800,2900 +1443.9,2800,3000 +828.09998,2800,3100 +1568.5,2800,3200 +1236.3,2800,3300 +976.79999,2800,3400 +954.90002,2800,3500 +1040.7,2800,3600 +633.20001,2800,3700 +1167.1,2800,3800 +658.09998,2800,3900 +490.5,2800,4000 +1836.4,2800,4100 +865,2800,4200 +1169.6,2800,4400 +514.90002,2800,4500 +991.70001,2800,4600 +289.29999,2800,4700 +570.09998,2800,4800 +1308.5,2800,4900 +1205.7,2800,5000 +704.59998,2800,5100 +1908.1,2800,5300 +652.90002,2800,5400 +825.20001,2800,5500 +1203.9,2800,5600 +499.69159,2900,100 +2988.2231,2900,200 +1101.881,2900,400 +238.74861,2900,500 +1476.7578,2900,600 +704.85382,2900,800 +1064.9,2900,900 +917.29999,2900,1000 +833.29999,2900,1100 +895,2900,1200 +643.29999,2900,1300 +3938.3999,2900,1500 +1215.6,2900,1600 +214.3,2900,1700 +347.70001,2900,1800 +262.70001,2900,1900 +320.89999,2900,2000 +394.29999,2900,2100 +519.09998,2900,2200 +1311.5,2900,2300 +851.59998,2900,2400 +1133.4,2900,2500 +569.20001,2900,2600 +557.5,2900,2700 +421.79999,2900,2800 +0,2900,2900 +1055.7,2900,3000 +447.5,2900,3100 +1298.1,2900,3200 +1145.2,2900,3300 +958.40002,2900,3400 +805.90002,2900,3500 +936.70001,2900,3600 +760.29999,2900,3700 +747.5,2900,3800 +539.29999,2900,3900 +336.79999,2900,4000 +1507.7,2900,4100 +797.20001,2900,4200 +1127.9,2900,4400 +717,2900,4500 +582.59998,2900,4600 +377.89999,2900,4700 +614.40002,2900,4800 +1033.6,2900,4900 +1098.1,2900,5000 +752.90002,2900,5100 +1545.4,2900,5300 +640.09998,2900,5400 +456.39999,2900,5500 +851.59998,2900,5600 +1551.1813,3000,100 +1984.4647,3000,200 +893.19525,3000,400 +1223.6277,3000,500 +850.65106,3000,600 +594.28949,3000,800 +1844.5,3000,900 +1798.1,3000,1000 +1727.7,3000,1100 +1947.4,3000,1200 +1697.5,3000,1300 +3180,3000,1500 +305.60001,3000,1600 +1127.6,3000,1700 +1264.6,3000,1800 +863.90002,3000,1900 +819.90002,3000,2000 +1399.1,3000,2100 +1465.9,3000,2200 +1939.9,3000,2300 +1742.2,3000,2400 +1887.5,3000,2500 +1157.4,3000,2600 +732.20001,3000,2700 +1443.9,3000,2800 +1055.7,3000,2900 +0,3000,3000 +616.29999,3000,3100 
+638.29999,3000,3200 +1843.8,3000,3300 +1798.5,3000,3400 +890.40002,3000,3500 +1674.3,3000,3600 +1756.8,3000,3700 +430.89999,3000,3800 +1411.9,3000,3900 +1009.7,3000,4000 +572.40002,3000,4100 +1625.2,3000,4200 +1901.5,3000,4400 +1750.9,3000,4500 +487.10001,3000,4600 +1424.1,3000,4700 +1210.5,3000,4800 +542.79999,3000,4900 +1783.4,3000,5000 +1700.3,3000,5100 +514.59998,3000,5300 +1569.2,3000,5400 +970.79999,3000,5500 +298.5,3000,5600 +936.39087,3100,100 +2582.6321,3100,200 +817.40857,3100,400 +608.6076,3100,500 +1094.2856,3100,600 +348.05515,3100,800 +1393.5,3100,900 +1292.2,3100,1000 +1212.7,3100,1100 +1332.3,3100,1200 +1090.4,3100,1300 +3563,3100,1500 +770.70001,3100,1600 +565.79999,3100,1700 +716.70001,3100,1800 +327.39999,3100,1900 +223.60001,3100,2000 +820.59998,3100,2100 +862,3100,2200 +1574.6,3100,2300 +1229.7,3100,2400 +1451.7,3100,2500 +747.29999,3100,2600 +431.89999,3100,2700 +828.09998,3100,2800 +447.5,3100,2900 +616.29999,3100,3000 +0,3100,3100 +895.29999,3100,3200 +1436.3,3100,3300 +1313.9,3100,3400 +598.90002,3100,3500 +1238.3,3100,3600 +1188.6,3100,3700 +409.79999,3100,3800 +894,3100,3900 +430.10001,3100,4000 +1066.1,3100,4100 +1140.3,3100,4200 +1455.7,3100,4400 +1160.5,3100,4500 +201.7,3100,4600 +824.09998,3100,4700 +699.09998,3100,4800 +640.70001,3100,4900 +1380.5,3100,5000 +1157.4,3100,5100 +1098,3100,5300 +1031.8,3100,5400 +552,3100,5500 +407.5,3100,5600 +1716.0898,3200,100 +2223.5754,3200,200 +449.49045,3200,400 +1362.2151,3200,500 +218.95181,3200,600 +595.1651,3200,800 +2286.8,3200,900 +2186.2,3200,1000 +2105.8,3200,1100 +2085.7,3200,1200 +1901.3,3200,1300 +2669.1001,3200,1500 +358.89999,3200,1600 +1454.8,3200,1700 +1607.1,3200,1800 +1222.1,3200,1900 +980.20001,3200,2000 +1691.4,3200,2100 +1513.7,3200,2200 +2447.1001,3200,2300 +2123,3200,2400 +2342.8999,3200,2500 +1621.8,3200,2600 +1228.9,3200,2700 +1568.5,3200,2800 +1298.1,3200,2900 +638.29999,3200,3000 +895.29999,3200,3100 +0,3200,3200 +2320.8999,3200,3300 +2209.3,3200,3400 +670.40002,3200,3500 +2128.1001,3200,3600 +2058.3999,3200,3700 +983.29999,3200,3800 +1788.4,3200,3900 +1078.3,3200,4000 +381.5,3200,4100 +2035.6,3200,4200 +2348.5,3200,4400 +2005.7,3200,4500 +911.5,3200,4600 +1666.4,3200,4700 +1115.8,3200,4800 +265,3200,4900 +2263.3,3200,5000 +2042.8,3200,5100 +593.20001,3200,5300 +1921,3200,5400 +1422.4,3200,5500 +535.09998,3200,5600 +1117.9094,3300,100 +3330.8691,3300,200 +2224.0771,3300,400 +1264.8726,3300,500 +2527.282,3300,600 +1776.8662,3300,800 +156.2,3300,900 +380.29999,3300,1000 +433.5,3300,1100 +1210.3,3300,1200 +995.90002,3300,1300 +4987.3999,3300,1500 +2114.1001,3300,1600 +934.90002,3300,1700 +799.70001,3300,1800 +1111.9,3300,1900 +1434,3300,2000 +833.5,3300,2100 +1414.8,3300,2200 +186.7,3300,2300 +419.89999,3300,2400 +104.7,3300,2500 +700.5,3300,2600 +1116.3,3300,2700 +1236.3,3300,2800 +1145.2,3300,2900 +1843.8,3300,3000 +1436.3,3300,3100 +2320.8999,3300,3200 +0,3300,3300 +292.89999,3300,3400 +1943.8,3300,3500 +208.60001,3300,3600 +690.09998,3300,3700 +1412.9,3300,3800 +613.29999,3300,3900 +1479.7,3300,4000 +2408,3300,4100 +372,3300,4200 +143.3,3300,4400 +840.09998,3300,4500 +1416.7,3300,4600 +949.90002,3300,4700 +1725.2,3300,4800 +2074.8999,3300,4900 +60.700001,3300,5000 +564.09998,3300,5100 +2346.7,3300,5300 +583.79999,3300,5400 +899.5,3300,5500 +1793.8,3300,5600 +845.79138,3400,100 +3437.7798,3400,200 +2059.1541,3400,400 +1036.7902,3400,500 +2405.313,3400,600 +1636.8102,3400,800 +142.89999,3400,900 +87.5,3400,1000 +150.39999,3400,1100 +917.5,3400,1200 +709.5,3400,1300 +4875.2998,3400,1500 
+2043.7,3400,1600 +764.79999,3400,1700 +614.70001,3400,1800 +987.79999,3400,1900 +1269.3,3400,2000 +598.70001,3400,2100 +1153.6,3400,2200 +475.29999,3400,2300 +133.8,3400,2400 +217.60001,3400,2500 +648,3400,2600 +1069.6,3400,2700 +976.79999,3400,2800 +958.40002,3400,2900 +1798.5,3400,3000 +1313.9,3400,3100 +2209.3,3400,3200 +292.89999,3400,3300 +0,3400,3400 +1763.8,3400,3500 +199.8,3400,3600 +398.60001,3400,3700 +1375.5,3400,3800 +424.70001,3400,3900 +1279.8,3400,4000 +2342.8999,3400,4100 +175,3400,4200 +193.7,3400,4400 +549.40002,3400,4500 +1335.5,3400,4600 +700,3400,4700 +1498.1,3400,4800 +1952.4,3400,4900 +291.60001,3400,5000 +279.89999,3400,5100 +2312,3400,5300 +333.5,3400,5400 +827.70001,3400,5500 +1700.8,3400,5600 +1114.4049,3500,100 +2806.3108,3500,200 +312.86325,3500,400 +777.56403,3500,500 +774.05371,3500,600 +316.55722,3500,800 +1870.2,3500,900 +1719.7,3500,1000 +1635.2,3500,1100 +1453.9,3500,1200 +1310.8,3500,1300 +3166.6001,3500,1500 +817,3500,1600 +1009.9,3500,1700 +1152.9,3500,1800 +863.20001,3500,1900 +513.29999,3500,2000 +1181.7,3500,2100 +868.40002,3500,2200 +2102.3,3500,2300 +1653.7,3500,2400 +1937.7,3500,2500 +1299,3500,2600 +1030.8,3500,2700 +954.90002,3500,2800 +805.90002,3500,2900 +890.40002,3500,3000 +598.90002,3500,3100 +670.40002,3500,3200 +1943.8,3500,3300 +1763.8,3500,3400 +0,3500,3500 +1736.4,3500,3600 +1525.3,3500,3700 +946.20001,3500,3800 +1344.8,3500,3900 +495.20001,3500,4000 +1015.3,3500,4100 +1602.9,3500,4200 +1933.3,3500,4400 +1441.8,3500,4500 +759,3500,4600 +1119.7,3500,4700 +447.79999,3500,4800 +455.70001,3500,4900 +1893.8,3500,5000 +1544.4,3500,5100 +1168.6,3500,5300 +1440.6,3500,5400 +1119.2,3500,5500 +595.90002,3500,5600 +931.53394,3600,100 +3252.4641,3600,200 +2018.8484,3600,400 +1058.9569,3600,500 +2331.8479,3600,600 +1575.6758,3600,800 +172.39999,3600,900 +271,3600,1000 +286.79999,3600,1100 +1068.6,3600,1200 +828.40002,3600,1300 +4796.8999,3600,1500 +1935.1,3600,1600 +726.90002,3600,1700 +591.09998,3600,1800 +912,3600,1900 +1227.8,3600,2000 +631.70001,3600,2100 +1219.1,3600,2200 +384.10001,3600,2300 +279.79999,3600,2400 +215.5,3600,2500 +518.20001,3600,2600 +942.29999,3600,2700 +1040.7,3600,2800 +936.70001,3600,2900 +1674.3,3600,3000 +1238.3,3600,3100 +2128.1001,3600,3200 +208.60001,3600,3300 +199.8,3600,3400 +1736.4,3600,3500 +0,3600,3600 +543.40002,3600,3700 +1245.1,3600,3800 +405.70001,3600,3900 +1271.1,3600,4000 +2231.7,3600,4100 +180.10001,3600,4200 +227.10001,3600,4400 +686.09998,3600,4500 +1231.3,3600,4600 +752.20001,3600,4700 +1518.3,3600,4800 +1878.6,3600,4900 +166.60001,3600,5000 +406.20001,3600,5100 +2183.3999,3600,5300 +394.70001,3600,5400 +713.29999,3600,5500 +1605.8,3600,5600 +481.46091,3700,100 +3561.5747,3700,200 +1834.722,3700,400 +752.31226,3700,500 +2234.749,3700,600 +1464.6719,3700,800 +541.09998,3700,900 +313,3700,1000 +259.89999,3700,1100 +525.59998,3700,1200 +317.39999,3700,1300 +4687,3700,1500 +1957,3700,1600 +630.79999,3700,1700 +493.39999,3700,1800 +893.59998,3700,1900 +1079.6,3700,2000 +368.70001,3700,2100 +800.09998,3700,2200 +873.90002,3700,2300 +271.39999,3700,2400 +614.40002,3700,2500 +728.70001,3700,2600 +1079.7,3700,2700 +633.20001,3700,2800 +760.29999,3700,2900 +1756.8,3700,3000 +1188.6,3700,3100 +2058.3999,3700,3200 +690.09998,3700,3300 +398.60001,3700,3400 +1525.3,3700,3500 +543.40002,3700,3600 +0,3700,3700 +1369.6,3700,3800 +389.20001,3700,3900 +1030.3,3700,4000 +2254.2,3700,4100 +377.29999,3700,4200 +586.09998,3700,4400 +151.10001,3700,4500 +1270.3,3700,4600 +405.60001,3700,4700 +1193.1,3700,4800 
+1793.8,3700,4900 +680.79999,3700,5000 +139.39999,3700,5100 +2266.3999,3700,5300 +229.8,3700,5400 +839.59998,3700,5500 +1595.7,3700,5600 +1241.901,3800,100 +2243.7854,3800,200 +1082.4448,3800,400 +961.66644,3800,500 +1202.0992,3800,600 +637.24768,3800,800 +1416,3800,900 +1380.5,3800,1000 +1313.8,3800,1100 +1631.7,3800,1200 +1358.5,3800,1300 +3600.6001,3800,1500 +712.79999,3800,1600 +757.90002,3800,1700 +877.79999,3800,1800 +505,3800,1900 +627.79999,3800,2000 +1029.8,3800,2100 +1235.3,3800,2200 +1510.9,3800,2300 +1327.4,3800,2400 +1457.5,3800,2500 +730,3800,2600 +306,3800,2700 +1167.1,3800,2800 +747.5,3800,2900 +430.89999,3800,3000 +409.79999,3800,3100 +983.29999,3800,3200 +1412.9,3800,3300 +1375.5,3800,3400 +946.20001,3800,3500 +1245.1,3800,3600 +1369.6,3800,3700 +0,3800,3800 +1005,3800,3900 +834.20001,3800,4000 +998.90002,3800,4100 +1204,3800,4200 +1472.2,3800,4400 +1384.4,3800,4500 +208.3,3800,4600 +1079.7,3800,4700 +1108.9,3800,4800 +792.70001,3800,4900 +1352.5,3800,5000 +1298.7,3800,5100 +938.59998,3800,5300 +1167.3,3800,5400 +550.79999,3800,5500 +462.20001,3800,5600 +577.58514,3900,100 +3173.6365,3900,200 +1636.4548,3900,400 +654.25818,3900,500 +1982.1279,3900,600 +1212.4308,3900,800 +525.59998,3900,900 +398.39999,3900,1000 +320.29999,3900,1100 +826.40002,3900,1200 +539.40002,3900,1300 +4452,3900,1500 +1639.3,3900,1600 +340.20001,3900,1700 +191.89999,3900,1800 +571.29999,3900,1900 +845.29999,3900,2000 +242.60001,3900,2100 +833.70001,3900,2200 +788.59998,3900,2300 +336.60001,3900,2400 +594.5,3900,2500 +340.5,3900,2600 +705.40002,3900,2700 +658.09998,3900,2800 +539.29999,3900,2900 +1411.9,3900,3000 +894,3900,3100 +1788.4,3900,3200 +613.29999,3900,3300 +424.70001,3900,3400 +1344.8,3900,3500 +405.70001,3900,3600 +389.20001,3900,3700 +1005,3900,3800 +0,3900,3900 +869.40002,3900,4000 +1939,3900,4100 +258.20001,3900,4200 +588.59998,3900,4400 +463.39999,3900,4500 +934.09998,3900,4600 +371.70001,3900,4700 +1112.7,3900,4800 +1529.8,3900,4900 +571.5,3900,5000 +295.10001,3900,5100 +1926.3,3900,5300 +165.39999,3900,5400 +460,3900,5500 +1288.4,3900,5600 +639.18048,4000,100 +2993.2622,4000,200 +804.88892,4000,400 +288.52658,4000,500 +1228.981,4000,600 +499.46585,4000,800 +1393.3,4000,900 +1231,4000,1000 +1146.4,4000,1100 +1009.8,4000,1200 +829.40002,4000,1300 +3659,4000,1500 +1084.6,4000,1600 +551,4000,1700 +680.09998,4000,1800 +496.60001,4000,1900 +207,4000,2000 +689.40002,4000,2100 +463.29999,4000,2200 +1647.8,4000,2300 +1165,4000,2400 +1463.6,4000,2500 +891.59998,4000,2600 +760.59998,4000,2700 +490.5,4000,2800 +336.79999,4000,2900 +1009.7,4000,3000 +430.10001,4000,3100 +1078.3,4000,3200 +1479.7,4000,3300 +1279.8,4000,3400 +495.20001,4000,3500 +1271.1,4000,3600 +1030.3,4000,3700 +834.20001,4000,3800 +869.40002,4000,3900 +0,4000,4000 +1352,4000,4100 +1125.3,4000,4200 +1455.9,4000,4400 +950.79999,4000,4500 +629,4000,4600 +624.70001,4000,4700 +305.10001,4000,4800 +818,4000,4900 +1433.7,4000,5000 +1051.2,4000,5100 +1439.7,4000,5300 +951.59998,4000,5400 +746.79999,4000,5500 +741.09998,4000,5600 +1970.6246,4100,100 +1842.2766,4100,200 +828.90674,4100,400 +1619.1962,4100,500 +474.84555,4100,600 +853.01898,4100,800 +2399.6001,4100,900 +2334.1001,4100,1000 +2259,4100,1100 +2356.8,4100,1200 +2142.2,4100,1300 +2608.2,4100,1500 +299.70001,4100,1600 +1626.8,4100,1700 +1774.1,4100,1800 +1370.6,4100,1900 +1211.5,4100,2000 +1886.6,4100,2100 +1809.1,4100,2200 +2509.8999,4100,2300 +2274.8,4100,2400 +2446.5,4100,2500 +1713.6,4100,2600 +1291.8,4100,2700 +1836.4,4100,2800 +1507.7,4100,2900 +572.40002,4100,3000 
+1066.1,4100,3100 +381.5,4100,3200 +2408,4100,3300 +2342.8999,4100,3400 +1015.3,4100,3500 +2231.7,4100,3600 +2254.2,4100,3700 +998.90002,4100,3800 +1939,4100,3900 +1352,4100,4000 +0,4100,4100 +2168,4100,4200 +2458.2,4100,4400 +2224.3999,4100,4500 +1007.5,4100,4600 +1885.5,4100,4700 +1445.8,4100,4800 +562.40002,4100,4900 +2348,4100,5000 +2216.2,4100,5100 +238,4100,5300 +2087.7,4100,5400 +1520.1,4100,5500 +658.59998,4100,5600 +752.08038,4200,100 +3296.4731,4200,200 +1894.293,4200,400 +896.97687,4200,500 +2232.8188,4200,600 +1466.0251,4200,800 +268.10001,4200,900 +182.60001,4200,1000 +144.7,4200,1100 +898.20001,4200,1200 +649.90002,4200,1300 +4702.7002,4200,1500 +1868.8,4200,1600 +597.09998,4200,1700 +450.10001,4200,1800 +813.70001,4200,1900 +1102.7,4200,2000 +462.29999,4200,2100 +1043.5,4200,2200 +556.09998,4200,2300 +148.39999,4200,2400 +339,4200,2500 +480.60001,4200,2600 +898.09998,4200,2700 +865,4200,2800 +797.20001,4200,2900 +1625.2,4200,3000 +1140.3,4200,3100 +2035.6,4200,3200 +372,4200,3300 +175,4200,3400 +1602.9,4200,3500 +180.10001,4200,3600 +377.29999,4200,3700 +1204,4200,3800 +258.20001,4200,3900 +1125.3,4200,4000 +2168,4200,4100 +0,4200,4200 +330.79999,4200,4400 +513.29999,4200,4500 +1160.6,4200,4600 +578,4200,4700 +1358.6,4200,4800 +1779.6,4200,4900 +341.29999,4200,5000 +238,4200,5100 +2139.1001,4200,5300 +215.10001,4200,5400 +654.59998,4200,5500 +1525.8,4200,5600 +1039.5215,4400,100 +3448.4883,4400,200 +2222.8384,4400,400 +1222.0619,4400,500 +2549.9517,4400,600 +1788.3411,4400,800 +63.099998,4400,900 +274,4400,1000 +344.20001,4400,1100 +1095.1,4400,1200 +900.5,4400,1300 +5017.6001,4400,1500 +2161.2,4400,1600 +925.5,4400,1700 +780.40002,4400,1800 +1128.3,4400,1900 +1430.8,4400,2000 +784.20001,4400,2100 +1346.8,4400,2200 +298.89999,4400,2300 +327.39999,4400,2400 +38.599998,4400,2500 +745,4400,2600 +1169.4,4400,2700 +1169.6,4400,2800 +1127.9,4400,2900 +1901.5,4400,3000 +1455.7,4400,3100 +2348.5,4400,3200 +143.3,4400,3300 +193.7,4400,3400 +1933.3,4400,3500 +227.10001,4400,3600 +586.09998,4400,3700 +1472.2,4400,3800 +588.59998,4400,3900 +1455.9,4400,4000 +2458.2,4400,4100 +330.79999,4400,4200 +0,4400,4400 +737.09998,4400,4500 +1456.1,4400,4600 +890.40002,4400,4700 +1684,4400,4800 +2096.3999,4400,4900 +179.8,4400,5000 +473.10001,4400,5100 +2410.3999,4400,5300 +521.59998,4400,5400 +938.79999,4400,5500 +1829.2,4400,5600 +354.5762,4500,100 +3604.7126,4500,200 +1753.9199,4500,400 +664.30255,4500,500 +2172.6396,4500,600 +1410.631,4500,800 +692,4500,900 +464,4500,1000 +407.79999,4500,1100 +385.39999,4500,1200 +171.60001,4500,1300 +4608.3999,4500,1500 +1931,4500,1600 +627.90002,4500,1700 +513.20001,4500,1800 +889.79999,4500,1900 +1026.5,4500,2000 +354.79999,4500,2100 +671.70001,4500,2200 +1024.5,4500,2300 +420.39999,4500,2400 +765.40002,4500,2500 +799.70001,4500,2600 +1108.9,4500,2700 +514.90002,4500,2800 +717,4500,2900 +1750.9,4500,3000 +1160.5,4500,3100 +2005.7,4500,3200 +840.09998,4500,3300 +549.40002,4500,3400 +1441.8,4500,3500 +686.09998,4500,3600 +151.10001,4500,3700 +1384.4,4500,3800 +463.39999,4500,3900 +950.79999,4500,4000 +2224.3999,4500,4100 +513.29999,4500,4200 +737.09998,4500,4400 +0,4500,4500 +1264.5,4500,4600 +340.29999,4500,4700 +1083.7,4500,4800 +1740.8,4500,4900 +828.79999,4500,5000 +280.29999,4500,5100 +2253.3,4500,5300 +330.10001,4500,5400 +880.09998,4500,5500 +1567.2,4500,5600 +1082.3309,4600,100 +2410.1079,4600,200 +930.49426,4600,400 +778.48999,4600,500 +1124.7947,4600,600 +464.36285,4600,800 +1396.3,4600,900 +1327.1,4600,1000 +1253.2,4600,1100 +1477.4,4600,1200 
+1217.3,4600,1300 +3571.3,4600,1500 +708.29999,4600,1600 +640.59998,4600,1700 +779.20001,4600,1800 +376.89999,4600,1900 +422,4600,2000 +912.09998,4600,2100 +1044.8,4600,2200 +1536.6,4600,2300 +1268.6,4600,2400 +1446.8,4600,2500 +716.29999,4600,2600 +320.20001,4600,2700 +991.70001,4600,2800 +582.59998,4600,2900 +487.10001,4600,3000 +201.7,4600,3100 +911.5,4600,3200 +1416.7,4600,3300 +1335.5,4600,3400 +759,4600,3500 +1231.3,4600,3600 +1270.3,4600,3700 +208.3,4600,3800 +934.09998,4600,3900 +629,4600,4000 +1007.5,4600,4100 +1160.6,4600,4200 +1456.1,4600,4400 +1264.5,4600,4500 +0,4600,4600 +941.20001,4600,4700 +900.59998,4600,4800 +684.40002,4600,4900 +1358,4600,5000 +1217.9,4600,5100 +996.79999,4600,5300 +1087.5,4600,5400 +518.20001,4600,5500 +378.20001,4600,5600 +217.34731,4700,100 +3319.6003,4700,200 +1429.3254,4700,400 +349.72833,4700,500 +1836.2632,4700,600 +1071.228,4700,800 +831.70001,4700,900 +635.70001,4700,1000 +553.59998,4700,1100 +560.79999,4700,1200 +278.79999,4700,1300 +4282.3999,4700,1500 +1593.4,4700,1600 +330.60001,4700,1700 +279.5,4700,1800 +575.29999,4700,1900 +686.59998,4700,2000 +129.7,4700,2100 +467.20001,4700,2200 +1133.8,4700,2300 +571.90002,4700,2400 +907,4700,2500 +621.70001,4700,2600 +829.20001,4700,2700 +289.29999,4700,2800 +377.89999,4700,2900 +1424.1,4700,3000 +824.09998,4700,3100 +1666.4,4700,3200 +949.90002,4700,3300 +700,4700,3400 +1119.7,4700,3500 +752.20001,4700,3600 +405.60001,4700,3700 +1079.7,4700,3800 +371.70001,4700,3900 +624.70001,4700,4000 +1885.5,4700,4100 +578,4700,4200 +890.40002,4700,4400 +340.29999,4700,4500 +941.20001,4700,4600 +0,4700,4700 +803.40002,4700,4800 +1401.4,4700,4900 +917.59998,4700,5000 +440.70001,4700,5100 +1920.6,4700,5300 +369.29999,4700,5400 +631.20001,4700,5500 +1229.4,4700,5600 +733.29602,4800,100 +3184.9512,4800,200 +740.12372,4800,400 +461.9743,4800,500 +1219.1371,4800,600 +629.16016,4800,800 +1623.1,4800,900 +1438,4800,1000 +1354.9,4800,1100 +1029.8,4800,1200 +931.70001,4800,1300 +3555.8999,4800,1500 +1215.9,4800,1600 +824.20001,4800,1700 +933.5,4800,1800 +801.20001,4800,1900 +490.70001,4800,2000 +900,4800,2100 +444,4800,2200 +1901.1,4800,2300 +1373.4,4800,2400 +1696.5,4800,2500 +1182.4,4800,2600 +1063.4,4800,2700 +570.09998,4800,2800 +614.40002,4800,2900 +1210.5,4800,3000 +699.09998,4800,3100 +1115.8,4800,3200 +1725.2,4800,3300 +1498.1,4800,3400 +447.79999,4800,3500 +1518.3,4800,3600 +1193.1,4800,3700 +1108.9,4800,3800 +1112.7,4800,3900 +305.10001,4800,4000 +1445.8,4800,4100 +1358.6,4800,4200 +1684,4800,4400 +1083.7,4800,4500 +900.59998,4800,4600 +803.40002,4800,4700 +0,4800,4800 +884,4800,4900 +1684.1,4800,5000 +1244,4800,5100 +1575.6,4800,5300 +1164.6,4800,5400 +1048.2,4800,5500 +917.79999,4800,5600 +1453.8961,4900,100 +2359.2075,4900,200 +354.13834,4900,400 +1099.6786,4900,500 +453.57294,4900,600 +330.18384,4900,800 +2034.2,4900,900 +1926.8,4900,1000 +1845.7,4900,1100 +1827.4,4900,1200 +1637.6,4900,1300 +2923.1001,4900,1500 +375,4900,1600 +1194,4900,1700 +1346.5,4900,1800 +968.09998,4900,1900 +715.29999,4900,2000 +1427.3,4900,2100 +1262.1,4900,2200 +2207.7,4900,2300 +1863.1,4900,2400 +2092.3,4900,2500 +1379.6,4900,2600 +1004.5,4900,2700 +1308.5,4900,2800 +1033.6,4900,2900 +542.79999,4900,3000 +640.70001,4900,3100 +265,4900,3200 +2074.8999,4900,3300 +1952.4,4900,3400 +455.70001,4900,3500 +1878.6,4900,3600 +1793.8,4900,3700 +792.70001,4900,3800 +1529.8,4900,3900 +818,4900,4000 +562.40002,4900,4100 +1779.6,4900,4200 +2096.3999,4900,4400 +1740.8,4900,4500 +684.40002,4900,4600 +1401.4,4900,4700 +884,4900,4800 +0,4900,4900 
+2018.3,4900,5000 +1780.1,4900,5100 +715.59998,4900,5300 +1659.4,4900,5400 +1180.8,4900,5500 +331.10001,4900,5600 +1092.705,5000,100 +3276.7676,5000,200 +2171.6936,5000,400 +1225.2714,5000,500 +2470.3586,5000,600 +1722.2137,5000,800 +171.5,5000,900 +378.29999,5000,1000 +421.20001,5000,1100 +1204.8,5000,1200 +979.59998,5000,1300 +4929.1001,5000,1500 +2054.3999,5000,1600 +886.5,5000,1700 +754.70001,5000,1800 +1057.1,5000,1900 +1382.9,5000,2000 +798,5000,2100 +1384.2,5000,2200 +217.5,5000,2300 +409.39999,5000,2400 +144.5,5000,2500 +642.09998,5000,2600 +1056.5,5000,2700 +1205.7,5000,2800 +1098.1,5000,2900 +1783.4,5000,3000 +1380.5,5000,3100 +2263.3,5000,3200 +60.700001,5000,3300 +291.60001,5000,3400 +1893.8,5000,3500 +166.60001,5000,3600 +680.79999,5000,3700 +1352.5,5000,3800 +571.5,5000,3900 +1433.7,5000,4000 +2348,5000,4100 +341.29999,5000,4200 +179.8,5000,4400 +828.79999,5000,4500 +1358,5000,4600 +917.59998,5000,4700 +1684.1,5000,4800 +2018.3,5000,4900 +0,5000,5000 +549.59998,5000,5100 +2286.1001,5000,5300 +556,5000,5400 +841.29999,5000,5500 +1735.4,5000,5600 +567.64008,5100,100 +3462.0112,5100,200 +1848.8412,5100,400 +788.42499,5100,500 +2227.6006,5100,600 +1454.5239,5100,800 +421.89999,5100,900 +202.2,5100,1000 +131,5100,1100 +662.59998,5100,1200 +431.79999,5100,1300 +4691.2998,5100,1500 +1917.1,5100,1600 +591.59998,5100,1700 +442.20001,5100,1800 +846,5100,1900 +1073.4,5100,2000 +363,5100,2100 +879.40002,5100,2200 +750.09998,5100,2300 +146.3,5100,2400 +497.29999,5100,2500 +622.29999,5100,2600 +1000.4,5100,2700 +704.59998,5100,2800 +752.90002,5100,2900 +1700.3,5100,3000 +1157.4,5100,3100 +2042.8,5100,3200 +564.09998,5100,3300 +279.89999,5100,3400 +1544.4,5100,3500 +406.20001,5100,3600 +139.39999,5100,3700 +1298.7,5100,3800 +295.10001,5100,3900 +1051.2,5100,4000 +2216.2,5100,4100 +238,5100,4200 +473.10001,5100,4400 +280.29999,5100,4500 +1217.9,5100,4600 +440.70001,5100,4700 +1244,5100,4800 +1780.1,5100,4900 +549.59998,5100,5000 +0,5100,5100 +2213.7,5100,5300 +131.5,5100,5400 +755.09998,5100,5500 +1560.4,5100,5600 +2029.8469,5300,100 +1646.0514,5300,200 +1025.0396,5300,400 +1687.2906,5300,500 +711.51343,5300,600 +954.52271,5300,800 +2354.5,5300,900 +2312.6001,5300,1000 +2242.3,5300,1100 +2423.5,5300,1200 +2188.3999,5300,1300 +2729.3999,5300,1500 +360.10001,5300,1600 +1635.8,5300,1700 +1775.9,5300,1800 +1372.8,5300,1900 +1274.9,5300,2000 +1904.8,5300,2100 +1902.9,5300,2200 +2432.3999,5300,2300 +2256.8999,5300,2400 +2395,5300,2500 +1668.4,5300,2600 +1243.5,5300,2700 +1908.1,5300,2800 +1545.4,5300,2900 +514.59998,5300,3000 +1098,5300,3100 +593.20001,5300,3200 +2346.7,5300,3300 +2312,5300,3400 +1168.6,5300,3500 +2183.3999,5300,3600 +2266.3999,5300,3700 +938.59998,5300,3800 +1926.3,5300,3900 +1439.7,5300,4000 +238,5300,4100 +2139.1001,5300,4200 +2410.3999,5300,4400 +2253.3,5300,4500 +996.79999,5300,4600 +1920.6,5300,4700 +1575.6,5300,4800 +715.59998,5300,4900 +2286.1001,5300,5000 +2213.7,5300,5100 +0,5300,5300 +2082.8,5300,5400 +1484.5,5300,5500 +704.20001,5300,5600 +536.98462,5400,100 +3337.8293,5400,200 +1741.1154,5400,400 +703.36243,5400,500 +2108.811,5400,600 +1335.9457,5400,800 +462.39999,5400,900 +279.39999,5400,1000 +194.8,5400,1100 +711.59998,5400,1200 +444.60001,5400,1300 +4575.7002,5400,1500 +1788.3,5400,1600 +466.60001,5400,1700 +315.20001,5400,1800 +717.09998,5400,1900 +958.29999,5400,2000 +265.60001,5400,2100 +831.29999,5400,2200 +769.40002,5400,2300 +213.39999,5400,2400 +537.70001,5400,2500 +500.5,5400,2600 +869.59998,5400,2700 +652.90002,5400,2800 +640.09998,5400,2900 
+1569.2,5400,3000 +1031.8,5400,3100 +1921,5400,3200 +583.79999,5400,3300 +333.5,5400,3400 +1440.6,5400,3500 +394.70001,5400,3600 +229.8,5400,3700 +1167.3,5400,3800 +165.39999,5400,3900 +951.59998,5400,4000 +2087.7,5400,4100 +215.10001,5400,4200 +521.59998,5400,4400 +330.10001,5400,4500 +1087.5,5400,4600 +369.29999,5400,4700 +1164.6,5400,4800 +1659.4,5400,4900 +556,5400,5000 +131.5,5400,5100 +2082.8,5400,5300 +0,5400,5400 +624.90002,5400,5500 +1433,5400,5600 +835.7821,5500,100 +2725.9456,5500,200 +1365.8253,5500,400 +687.89459,5500,500 +1630.9467,5500,600 +899.84467,5500,800 +879.59998,5500,900 +829.79999,5500,1000 +763.20001,5500,1100 +1187.4,5500,1200 +895.20001,5500,1300 +4088,5500,1500 +1222.5,5500,1600 +314,5500,1700 +370.5,5500,1800 +259.5,5500,1900 +616.09998,5500,2000 +541.40002,5500,2100 +958.29999,5500,2200 +1026.9,5500,2300 +776.70001,5500,2400 +928.70001,5500,2500 +199.39999,5500,2600 +246.7,5500,2700 +825.20001,5500,2800 +456.39999,5500,2900 +970.79999,5500,3000 +552,5500,3100 +1422.4,5500,3200 +899.5,5500,3300 +827.70001,5500,3400 +1119.2,5500,3500 +713.29999,5500,3600 +839.59998,5500,3700 +550.79999,5500,3800 +460,5500,3900 +746.79999,5500,4000 +1520.1,5500,4100 +654.59998,5500,4200 +938.79999,5500,4400 +880.09998,5500,4500 +518.20001,5500,4600 +631.20001,5500,4700 +1048.2,5500,4800 +1180.8,5500,4900 +841.29999,5500,5000 +755.09998,5500,5100 +1484.5,5500,5300 +624.90002,5500,5400 +0,5500,5500 +894.40002,5500,5600 +1327.3319,5600,100 +2268.6812,5600,200 +644.98596,5600,400 +983.10938,5600,500 +750.90369,5600,600 +295.91144,5600,800 +1768.6,5600,900 +1686,5600,1000 +1608.8,5600,1100 +1719.9,5600,1200 +1491,5600,1300 +3193.7,5600,1500 +364,5600,1600 +969.59998,5600,1700 +1118.3,5600,1800 +717.29999,5600,1900 +570.70001,5600,2000 +1228,5600,2100 +1203.9,5600,2200 +1914.8,5600,2300 +1625.1,5600,2400 +1821.2,5600,2500 +1093.3,5600,2600 +693.90002,5600,2700 +1203.9,5600,2800 +851.59998,5600,2900 +298.5,5600,3000 +407.5,5600,3100 +535.09998,5600,3200 +1793.8,5600,3300 +1700.8,5600,3400 +595.90002,5600,3500 +1605.8,5600,3600 +1595.7,5600,3700 +462.20001,5600,3800 +1288.4,5600,3900 +741.09998,5600,4000 +658.59998,5600,4100 +1525.8,5600,4200 +1829.2,5600,4400 +1567.2,5600,4500 +378.20001,5600,4600 +1229.4,5600,4700 +917.79999,5600,4800 +331.10001,5600,4900 +1735.4,5600,5000 +1560.4,5600,5100 +704.20001,5600,5300 +1433,5600,5400 +894.40002,5600,5500 +0,5600,5600 diff --git a/hlink/tests/input_data/street_abbrevs.csv b/hlink/tests/input_data/street_abbrevs.csv new file mode 100644 index 0000000..9c6c82c --- /dev/null +++ b/hlink/tests/input_data/street_abbrevs.csv @@ -0,0 +1,544 @@ +ALLEY,ALLEE +ALLEY,ALLEY +ALLEY,ALLY +ALLEY,ALY +ANEX,ANEX +ANEX,ANNEX +ANEX,ANNX +ANEX,ANX +ARCADE,ARCADE +ARCADE,ARC +AVENUE,AV +AVENUE,AVE +AVENUE,AVEN +AVENUE,AVENU +AVENUE,AVENUE +AVENUE,AVN +AVENUE,AVNUE +BAYOU,BAYOU +BAYOU,BYU +BAYOU,BAYOO +BEACH,BCH +BEACH,BEACH +BEND,BEND +BEND,BND +BLUFF,BLF +BLUFF,BLUF +BLUFF,BLUFF +BLUFFS,BLFS +BLUFFS,BLUFFS +BOTTOM,BOT +BOTTOM,BTM +BOTTOM,BOTTOM +BOTTOM,BOTTM +BOULEVARD,BLVD +BOULEVARD,BOUL +BOULEVARD,BOULEVARD +BOULEVARD,BOULV +BRANCH,BR +BRANCH,BRNCH +BRANCH,BRANCH +BRIDGE,BRIDGE +BRIDGE,BRG +BRIDGE,BRDGE +BROOK,BRK +BROOK,BROOK +BROOKS,BROOKS +BROOKS,BRKS +BURG,BURG +BURG,BG +BURGS,BURGS +BURGS,BGS +BYPASS,BYPA +BYPASS,BYPS +BYPASS,BYPASS +BYPASS,BYPAS +BYPASS,BYP +CAMP,CP +CAMP,CMP +CAMP,CAMP +CANYON,CANYN +CANYON,CANYON +CANYON,CNYN +CANYON,CYN +CAPE,CAPE +CAPE,CPE +CAUSEWAY,CAUSEWAY +CAUSEWAY,CAUSWA +CAUSEWAY,CSWY +CENTER,CENTRE +CENTER,CEN +CENTER,CENTER 
+CENTER,CENTR +CENTER,CENT +CENTER,CTR +CENTER,CNTR +CENTER,CNTER +CENTERS,CENTERS +CENTERS,CTRS +CIRCLE,CIRC +CIRCLE,CIR +CIRCLE,CIRCL +CIRCLE,CRCLE +CIRCLE,CRCL +CIRCLE,CIRCLE +CIRCLES,CIRS +CIRCLES,CIRCLES +CLIFF,CLIFF +CLIFF,CLF +CLIFFS,CLFS +CLIFFS,CLIFFS +CLUB,CLB +CLUB,CLUB +COMMON,CMN +COMMON,COMMON +COMMONS,CMNS +COMMONS,COMMONS +CORNER,CORNER +CORNER,COR +CORNERS,CORNERS +CORNERS,CORS +COURSE,COURSE +COURSE,CRSE +COURT,CT +COURT,COURT +COURTS,CTS +COURTS,COURTS +COVE,CV +COVE,COVE +COVES,CVS +COVES,COVES +CREEK,CREEK +CREEK,CRK +CRESCENT,CRESCENT +CRESCENT,CRES +CRESCENT,CRSENT +CRESCENT,CRSNT +CREST,CREST +CREST,CRST +CROSSING,CROSSING +CROSSING,CRSSNG +CROSSING,XING +CROSSROAD,XRD +CROSSROAD,CROSSROAD +CROSSROADS,XRDS +CROSSROADS,CROSSROADS +CURVE,CURVE +CURVE,CURV +DALE,DL +DALE,DALE +DAM,DM +DAM,DAM +DIVIDE,DV +DIVIDE,DIVIDE +DIVIDE,DIV +DIVIDE,DVD +DRIVE,DRV +DRIVE,DRIVE +DRIVE,DRIV +DRIVE,DR +DRIVES,DRS +DRIVES,DRIVES +ESTATE,EST +ESTATE,ESTATE +ESTATES,ESTATES +ESTATES,ESTS +EXPRESSWAY,EXPY +EXPRESSWAY,EXP +EXPRESSWAY,EXPR +EXPRESSWAY,EXPRESSWAY +EXPRESSWAY,EXPW +EXPRESSWAY,EXPRESS +EXTENSION,EXT +EXTENSION,EXTENSION +EXTENSION,EXTN +EXTENSION,EXTNSN +EXTENSIONS,EXTS +FALL,FALL +FALLS,FLS +FALLS,FALLS +FERRY,FRY +FERRY,FRRY +FERRY,FERRY +FIELD,FIELD +FIELD,FLD +FIELDS,FIELDS +FIELDS,FLDS +FLAT,FLAT +FLAT,FLT +FLATS,FLATS +FLATS,FLTS +FORD,FRD +FORD,FORD +FORDS,FORDS +FORDS,FRDS +FOREST,FOREST +FOREST,FORESTS +FOREST,FRST +FORGE,FORG +FORGE,FORGE +FORGE,FRG +FORGES,FRGS +FORGES,FORGES +FORK,FORK +FORK,FRK +FORKS,FORKS +FORKS,FRKS +FORT,FORT +FORT,FRT +FORT,FT +FREEWAY,FWY +FREEWAY,FRWY +FREEWAY,FRWAY +FREEWAY,FREEWY +FREEWAY,FREEWAY +GARDEN,GRDEN +GARDEN,GARDEN +GARDEN,GARDN +GARDEN,GDN +GARDEN,GRDN +GARDENS,GRDNS +GARDENS,GDNS +GARDENS,GARDENS +GATEWAY,GATEWAY +GATEWAY,GATEWY +GATEWAY,GATWAY +GATEWAY,GTWAY +GATEWAY,GTWY +GLEN,GLEN +GLEN,GLN +GLENS,GLNS +GLENS,GLENS +GREEN,GREEN +GREEN,GRN +GREENS,GRNS +GREENS,GREENS +GROVE,GROV +GROVE,GROVE +GROVE,GRV +GROVES,GRVS +GROVES,GROVES +HARBOR,HARB +HARBOR,HARBOR +HARBOR,HARBR +HARBOR,HBR +HARBOR,HRBOR +HARBORS,HBRS +HARBORS,HARBORS +HAVEN,HAVEN +HAVEN,HVN +HEIGHTS,HT +HEIGHTS,HTS +HIGHWAY,HWY +HIGHWAY,HIGHWAY +HIGHWAY,HIGHWY +HIGHWAY,HIWAY +HIGHWAY,HIWY +HIGHWAY,HWAY +HILL,HL +HILL,HILL +HILLS,HILLS +HILLS,HLS +HOLLOW,HLLW +HOLLOW,HOLLOW +HOLLOW,HOLLOWS +HOLLOW,HOLW +HOLLOW,HOLWS +INLET,INLT +ISLAND,IS +ISLAND,ISLAND +ISLAND,ISLND +ISLANDS,ISLANDS +ISLANDS,ISLNDS +ISLANDS,ISS +ISLE,ISLE +ISLE,ISLES +JUNCTION,JUNCTON +JUNCTION,JUNCTION +JUNCTION,JCTN +JUNCTION,JUNCTN +JUNCTION,JCT +JUNCTION,JCTION +JUNCTIONS,JCTNS +JUNCTIONS,JCTS +JUNCTIONS,JUNCTIONS +KEY,KEY +KEY,KY +KEYS,KEYS +KEYS,KYS +KNOLL,KNOL +KNOLL,KNL +KNOLL,KNOLL +KNOLLS,KNLS +KNOLLS,KNOLLS +LAKE,LK +LAKE,LAKE +LAKES,LKS +LAKES,LAKES +LAND,LAND +LANDING,LANDING +LANDING,LNDG +LANDING,LNDNG +LANE,LN +LANE,LANE +LIGHT,LGT +LIGHT,LIGHT +LIGHTS,LGTS +LIGHTS,LIGHTS +LOAF,LF +LOAF,LOAF +LOCK,LCK +LOCK,LOCK +LOCKS,LOCKS +LOCKS,LCKS +LODGE,LODG +LODGE,LDG +LODGE,LDGE +LODGE,LODGE +LOOP,LOOP +LOOP,LOOPS +MALL,MALL +MANOR,MNR +MANOR,MANOR +MANORS,MANORS +MANORS,MNRS +MEADOW,MEADOW +MEADOWS,MDW +MEADOWS,MEADOWS +MEADOWS,MEDOWS +MEADOWS,MDWS +MEWS,MEWS +MILL,MILL +MILL,ML +MILLS,MLS +MILLS,MILLS +MISSION,MISSN +MISSION,MSSN +MISSION,MSN +MOTORWAY,MTWY +MOTORWAY,MOTORWAY +MOUNT,MT +MOUNT,MOUNT +MOUNT,MNT +MOUNTAIN,MNTAIN +MOUNTAIN,MOUNTIN +MOUNTAIN,MTIN +MOUNTAIN,MNTN +MOUNTAIN,MOUNTAIN +MOUNTAIN,MTN +MOUNTAINS,MNTNS +MOUNTAINS,MTNS +MOUNTAINS,MOUNTAINS +NECK,NCK +NECK,NECK 
+ORCHARD,ORCH +ORCHARD,ORCHARD +ORCHARD,ORCHRD +OVAL,OVAL +OVAL,OVL +OVERPASS,OPAS +OVERPASS,OVERPASS +PARK,PRK +PARK,PARK +PARKS,PARKS +PARKWAY,PARKWAY +PARKWAY,PARKWY +PARKWAY,PKWAY +PARKWAY,PKWY +PARKWAY,PKY +PARKWAYS,PARKWAYS +PARKWAYS,PKWYS +PASS,PASS +PASSAGE,PASSAGE +PASSAGE,PSGE +PATH,PATHS +PATH,PATH +PIKE,PIKE +PIKE,PIKES +PINE,PNE +PINE,PINE +PINES,PINES +PINES,PNES +PLACE,PL +PLAIN,PLAIN +PLAIN,PLN +PLAINS,PLNS +PLAINS,PLAINS +PLAZA,PLZA +PLAZA,PLZ +PLAZA,PLAZA +POINT,PT +POINT,POINT +POINTS,POINTS +POINTS,PTS +PORT,PORT +PORT,PRT +PORTS,PORTS +PORTS,PRTS +PRAIRIE,PR +PRAIRIE,PRR +PRAIRIE,PRAIRIE +RADIAL,RAD +RADIAL,RADIAL +RADIAL,RADL +RADIAL,RADIEL +RAMP,RAMP +RANCH,RANCH +RANCH,RANCHES +RANCH,RNCH +RANCH,RNCHS +RAPID,RPD +RAPID,RAPID +RAPIDS,RPDS +RAPIDS,RAPIDS +REST,RST +REST,REST +RIDGE,RIDGE +RIDGE,RDGE +RIDGE,RDG +RIDGES,RIDGES +RIDGES,RDGS +RIVER,RVR +RIVER,RIV +RIVER,RIVER +RIVER,RIVR +ROAD,RD +ROAD,ROAD +ROADS,ROADS +ROADS,RDS +ROUTE,ROUTE +ROUTE,RTE +ROW,ROW +RUE,RUE +RUN,RUN +SHOAL,SHL +SHOAL,SHOAL +SHOALS,SHLS +SHOALS,SHOALS +SHORE,SHORE +SHORE,SHR +SHORE,SHOAR +SHORES,SHRS +SHORES,SHOARS +SHORES,SHORES +SKYWAY,SKWY +SKYWAY,SKYWAY +SPRING,SPG +SPRING,SPNG +SPRING,SPRING +SPRING,SPRNG +SPRINGS,SPGS +SPRINGS,SPNGS +SPRINGS,SPRINGS +SPRINGS,SPRNGS +SPUR,SPUR +SPURS,SPURS +SQUARE,SQR +SQUARE,SQRE +SQUARE,SQU +SQUARE,SQUARE +SQUARE,SQ +SQUARES,SQRS +SQUARES,SQS +SQUARES,SQUARES +STATION,STA +STATION,STATION +STATION,STN +STATION,STATN +STRAVENUE,STRVNUE +STRAVENUE,STRAV +STRAVENUE,STRA +STRAVENUE,STRVN +STRAVENUE,STRAVEN +STRAVENUE,STRAVENUE +STRAVENUE,STRAVN +STREAM,STREAM +STREAM,STREME +STREAM,STRM +STREET,STREET +STREET,STRT +STREET,ST +STREET,STR +STREETS,STREETS +STREETS,STS +SUMMIT,SMT +SUMMIT,SUMIT +SUMMIT,SUMITT +SUMMIT,SUMMIT +TERRACE,TERRACE +TERRACE,TER +TERRACE,TERR +THROUGHWAY,THROUGHWAY +THROUGHWAY,TRWY +TRACE,TRACE +TRACE,TRACES +TRACE,TRCE +TRACK,TRACK +TRACK,TRACKS +TRACK,TRAK +TRACK,TRK +TRACK,TRKS +TRAFFICWAY,TRFY +TRAFFICWAY,TRAFFICWAY +TRAIL,TRAILS +TRAIL,TRAIL +TRAIL,TRL +TRAIL,TRLS +TRAILER,TRAILER +TRAILER,TRLR +TRAILER,TRLRS +TUNNEL,TUNNL +TUNNEL,TUNNEL +TUNNEL,TUNLS +TUNNEL,TUNNELS +TUNNEL,TUNEL +TUNNEL,TUNL +TURNPIKE,TURNPIKE +TURNPIKE,TRNPK +TURNPIKE,TURNPK +TURNPIKE,TPKE +UNDERPASS,UNDERPASS +UNDERPASS,UPAS +UNION,UN +UNION,UNION +UNIONS,UNIONS +UNIONS,UNS +VALLEY,VALLEY +VALLEY,VALLY +VALLEY,VLLY +VALLEY,VLY +VALLEYS,VLYS +VALLEYS,VALLEYS +VIADUCT,VDCT +VIADUCT,VIADCT +VIADUCT,VIADUCT +VIADUCT,VIA +VIEW,VIEW +VIEW,VW +VIEWS,VIEWS +VIEWS,VWS +VILLAGE,VILL +VILLAGE,VILLAG +VILLAGE,VILLAGE +VILLAGE,VILLG +VILLAGE,VILLIAGE +VILLAGE,VLG +VILLAGES,VILLAGES +VILLAGES,VLGS +VILLE,VL +VILLE,VILLE +VISTA,VSTA +VISTA,VISTA +VISTA,VIST +VISTA,VIS +VISTA,VST +WALK,WALK +WALKS,WALKS +WALL,WALL +WAY,WAY +WAY,WY +WAYS,WAYS +WELL,WELL +WELL,WL +WELLS,WELLS +WELLS,WLS diff --git a/hlink/tests/input_data/street_abbrevs_most_common.csv b/hlink/tests/input_data/street_abbrevs_most_common.csv new file mode 100644 index 0000000..a920365 --- /dev/null +++ b/hlink/tests/input_data/street_abbrevs_most_common.csv @@ -0,0 +1,72 @@ +ALLEY,ALLY +ALLEY,ALY +ANEX,ANNEX +AVENUE,AV +AVENUE,AVE +AVENUE,AVEN +AVENUE,AVENU +AVENUE,AVN +AVENUE,AVNUE +BOULEVARD,BLVD +BOULEVARD,BOUL +BRIDGE,BRG +BURG,BG +CAMP,CP +CENTER,CENTRE +CENTER,CEN +CENTER,CENTR +CENTER,CENT +CORNER,COR +CORNERS,CORS +COURT,CT +COURTS,CTS +CRESCENT,CRES +DRIVE,DRIV +DRIVE,DR +ESTATE,EST +EXPRESSWAY,EXPRESS +EXTENSION,EXT +FERRY,FRY +FORT,FRT +FORT,FT +HEIGHTS,HT +HEIGHTS,HTS +ISLAND,IS +KEY,KY +LODGE,LDG 
+MILL,ML
+MILLS,MLS
+MOUNT,MT
+MOUNTAIN,MOUNTIN
+MOUNTAIN,MTN
+PARK,PRK
+PIKE,PIKES
+PLACE,PL
+POINT,PT
+POINTS,PTS
+PRAIRIE,PR
+RADIAL,RAD
+REST,RST
+RIVER,RIV
+ROAD,RD
+ROADS,RDS
+SPRINGS,SPGS
+SQUARE,SQ
+STATION,STA
+STREET,STRT
+STREET,ST
+STREET,STR
+STREETS,STS
+SUMMIT,SUMIT
+SUMMIT,SUMITT
+TERRACE,TER
+TRACK,TRACKS
+TRAIL,TRAILS
+TURNPIKE,TRNPK
+UNION,UN
+VALLEY,VALLY
+VIADUCT,VIA
+VILLAGE,VILL
+VILLAGE,VILLAG
+VILLAGE,VILLIAGE
+WELL,WL
diff --git a/hlink/tests/input_data/test_csv_data_a.csv b/hlink/tests/input_data/test_csv_data_a.csv
new file mode 100644
index 0000000..5e281ee
--- /dev/null
+++ b/hlink/tests/input_data/test_csv_data_a.csv
@@ -0,0 +1,4 @@
+id,serialp,namelast,namefrst,bpl
+10,A,,John,120
+20,B,Last,Marc,240
+30,B,L.T.,Jon,360
\ No newline at end of file
diff --git a/hlink/tests/input_data/test_csv_data_b.csv b/hlink/tests/input_data/test_csv_data_b.csv
new file mode 100644
index 0000000..6972a79
--- /dev/null
+++ b/hlink/tests/input_data/test_csv_data_b.csv
@@ -0,0 +1,4 @@
+id,serialp,namelast,namefrst,bpl
+10,C,Name,J,460
+30,D,None,None,540
+50,E,Jean,,710
\ No newline at end of file
diff --git a/hlink/tests/input_data/test_parquet_data_a.parquet/._SUCCESS.crc b/hlink/tests/input_data/test_parquet_data_a.parquet/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_a.parquet/._SUCCESS.crc differ
diff --git a/hlink/tests/input_data/test_parquet_data_a.parquet/.part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet.crc b/hlink/tests/input_data/test_parquet_data_a.parquet/.part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet.crc
new file mode 100644
index 0000000..364ec29
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_a.parquet/.part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet.crc differ
diff --git a/hlink/tests/input_data/test_parquet_data_a.parquet/_SUCCESS b/hlink/tests/input_data/test_parquet_data_a.parquet/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git a/hlink/tests/input_data/test_parquet_data_a.parquet/part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet b/hlink/tests/input_data/test_parquet_data_a.parquet/part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet
new file mode 100644
index 0000000..e6e9f63
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_a.parquet/part-00000-8465f06e-9bb0-4817-81eb-813ccf07429a-c000.snappy.parquet differ
diff --git a/hlink/tests/input_data/test_parquet_data_b.parquet/._SUCCESS.crc b/hlink/tests/input_data/test_parquet_data_b.parquet/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_b.parquet/._SUCCESS.crc differ
diff --git a/hlink/tests/input_data/test_parquet_data_b.parquet/.part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet.crc b/hlink/tests/input_data/test_parquet_data_b.parquet/.part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet.crc
new file mode 100644
index 0000000..1bc931b
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_b.parquet/.part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet.crc differ
diff --git a/hlink/tests/input_data/test_parquet_data_b.parquet/_SUCCESS b/hlink/tests/input_data/test_parquet_data_b.parquet/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git a/hlink/tests/input_data/test_parquet_data_b.parquet/part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet b/hlink/tests/input_data/test_parquet_data_b.parquet/part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet
new file mode 100644
index 0000000..4e66f2c
Binary files /dev/null and b/hlink/tests/input_data/test_parquet_data_b.parquet/part-00000-bb515275-04b4-4a16-80f1-5cc21450b93f-c000.snappy.parquet differ
diff --git a/hlink/tests/input_data/test_prepped_data_a.csv b/hlink/tests/input_data/test_prepped_data_a.csv
new file mode 100644
index 0000000..5c5ef0c
--- /dev/null
+++ b/hlink/tests/input_data/test_prepped_data_a.csv
@@ -0,0 +1,4 @@
+id,serialp,namelast,bpl,sex,region
+10,A,Name,100,1,1
+20,B,Last,200,2,2
+30,B,Lest,300,2,2
\ No newline at end of file
diff --git a/hlink/tests/input_data/test_prepped_data_b.csv b/hlink/tests/input_data/test_prepped_data_b.csv
new file mode 100644
index 0000000..4129de6
--- /dev/null
+++ b/hlink/tests/input_data/test_prepped_data_b.csv
@@ -0,0 +1,4 @@
+id,serialp,namelast,bpl,sex,region
+10,C,Nameish,400,1,1
+30,D,Last,500,2,2
+50,E,List,700,2,2
\ No newline at end of file
diff --git a/hlink/tests/input_data/test_street_names_data.csv b/hlink/tests/input_data/test_street_names_data.csv
new file mode 100644
index 0000000..402b508
--- /dev/null
+++ b/hlink/tests/input_data/test_street_names_data.csv
@@ -0,0 +1,10 @@
+histid,street
+a01,TRNPK 35
+b02,4TH TER AVN
+c03,4th STATE
+d04,OLD BLVD
+e05,OLD MOTORWAY
+f06,MIAMI BCH RD
+g07,CENTRE STR
+g08,CTR STREET
+i09,STRSTREET
\ No newline at end of file
diff --git a/hlink/tests/input_data/threshold_ratio_test.csv b/hlink/tests/input_data/threshold_ratio_test.csv
new file mode 100644
index 0000000..b5a7955
--- /dev/null
+++ b/hlink/tests/input_data/threshold_ratio_test.csv
@@ -0,0 +1,17 @@
+histid_a,histid_b,probability,prediction
+0,0,0.98,1
+0,1,0.97,1
+0,2,0.76,1
+1,0,0.74,1
+1,1,0.94,1
+1,2,0.34,0
+2,2,0.8,1
+2,4,0.44,0
+4,5,0.22,0
+4,6,0.65,1
+4,7,0.75,1
+4,8,0.88,1
+5,5,0.05,0
+5,7,0.95,1
+6,6,0.99,1
+7,7,0.09,0
\ No newline at end of file
diff --git a/hlink/tests/input_data/threshold_ratio_test_data_2.csv b/hlink/tests/input_data/threshold_ratio_test_data_2.csv
new file mode 100644
index 0000000..a6047fa
--- /dev/null
+++ b/hlink/tests/input_data/threshold_ratio_test_data_2.csv
@@ -0,0 +1,14 @@
+histid_a,histid_b,probability
+0a,0b,0.95
+0a,1b,0.95
+0a,2b,0.42
+1a,3b,0.75
+2a,4b,0.8
+3a,4b,0.85
+4a,5b,0.99
+5a,6b,0.53
+5a,7b,0.93
+6a,8b,0.23
+6a,9b,0.33
+7a,10b,0.44
+8a,10b,0.41
diff --git a/hlink/tests/input_data/training_data.csv b/hlink/tests/input_data/training_data.csv
new file mode 100644
index 0000000..d61f60d
--- /dev/null
+++ b/hlink/tests/input_data/training_data.csv
@@ -0,0 +1,10 @@
+id_a,id_b,match
+10,10,1
+10,30,0
+10,50,0
+20,10,0
+20,30,1
+20,50,0
+30,10,0
+30,30,0
+30,50,1
\ No newline at end of file
diff --git a/hlink/tests/input_data/training_data_doubled.csv b/hlink/tests/input_data/training_data_doubled.csv
new file mode 100644
index 0000000..12b928f
--- /dev/null
+++ b/hlink/tests/input_data/training_data_doubled.csv
@@ -0,0 +1,19 @@
+id_a,id_b,match
+10,10,1
+10,30,0
+10,50,0
+20,10,0
+20,30,1
+20,50,0
+30,10,0
+30,30,0
+30,50,1
+10,10,1
+10,30,0
+10,50,0
+20,10,0
+20,30,1
+20,50,0
+30,10,0
+30,30,0
+30,50,1
\ No newline at end of file
diff --git a/hlink/tests/input_data/training_data_households.csv b/hlink/tests/input_data/training_data_households.csv
new file mode 100644
index 0000000..11982da
--- /dev/null
+++
b/hlink/tests/input_data/training_data_households.csv @@ -0,0 +1,59 @@ +id,SERIAL,NAMEFRST,NAMELAST,AGE,BIRTHYR,SEX,BPL,ENUMDIST,PERNUM,SPLOC,POPLOC,MOMLOC,RELATE +b5689d06-edd3-498e-8b5b-e04f2fa2f2a9,1062582,Catherine,Beebe,44,1866,2,10,2345,01,00,00,00,01 +a7118f06-949d-4d02-be0a-db33a6f8f3a8,1061605,Frances E,Bird,40,1870,2,10,2345,01,00,00,00,01 +85d089c0-b907-4d9c-95ab-c5fa4a3dd2bb,1029271,J S,Luff,49,1861,1,10,2345,01,00,00,00,01 +cddd9455-48e0-4b48-89a5-9ee315e00087,1237122,John,Smith,26,1884,1,10,2345,01,00,00,00,01 +8cb74256-6dfa-4d17-913a-59fa646c388a,1022156,Saml H,Russell,77,1833,1,10,2345,01,00,00,00,01 +1f8e1a74-d486-44ad-8d5c-51aedf86208e,1025771,Charles,Robertson,26,1884,1,10,2345,01,00,00,00,01 +61a1590f-1d3a-4666-8406-3d4aaf0770b4,1027559,John,Dickinson,42,1868,1,10,2345,01,00,00,00,01 +92277f0b-1476-41f5-9dc8-bf83672616d0,1028383,Joseph,Shissler,36,1874,1,10,2345,01,00,00,00,01 +322291a1-de91-439d-bba0-45fc2f47a2eb,1029335,David,Hall,71,1839,1,10,2345,01,00,00,00,01 +136f7105-ff59-4eac-9d95-44b002cbb448,1031229,John,Decame,52,1858,1,10,2345,01,02,00,00,01 +1138ab41-e234-4c72-b812-eaaf0fc5f76c,1031229,Nancy,Decame,53,1857,2,10,2345,02,01,00,00,02 +066ea4e1-f340-4231-b505-ec7bb9a07103,1031229,Peter N,Decame,15,1895,1,10,2345,03,00,01,02,03 +b7d96336-404e-490c-8c45-61f2287b52ff,1031229,Annam,Decame,13,1897,2,10,2345,04,00,01,02,03 +24bdff6a-5590-4494-8e8a-ac4a549c8890,1031229,Sarah,Decame,10,1900,2,10,2345,05,00,01,02,03 +c1fedaab-f026-4aa4-9320-e10f2432d539,1031230,James,Carney,22,1888,1,10,2345,01,00,00,00,01 +43a6ebe5-752b-4054-818d-6f6f75cc89e7,1031235,Alfred,Dell,27,1883,1,10,2345,01,00,00,00,01 +0d693015-2349-4363-9667-45036af7d0db,1031760,Chas,Syaex,40,1870,1,10,2345,01,00,00,00,01 +1d586e26-aac1-49df-a2ad-fe0a385a26bf,1031767,Sarah,Russell,13,1897,2,10,2345,01,00,00,00,01 +93b7ac89-f9db-49b2-a1f2-c189fecc14ae,1034579,Wm H,Hazard,29,1881,1,10,2345,01,02,00,00,01 +e51c36c9-570c-466d-aac1-bf380c9c20f1,1034579,Martha,Hazard,30,1880,2,10,2345,02,01,00,00,02 +9250341a-8336-494a-bc84-2b803efe64c6,1034579,Willie May,Hazard,8,1902,2,10,2345,03,00,01,02,03 +a70679f0-9313-4ef3-bf87-5dfe81beed5d,1034579,Samuel,Hazard,4,1906,2,10,2345,04,00,01,02,03 +4715bbf6-d3e2-4260-9ddd-6aece147e5c1,1034579,Samuel,Morgan,32,1878,1,10,2345,05,00,00,00,12 +77378570-5214-4ac5-8258-c5156e8b99b3,1034648,J Clauson,Mcfarland,20,1890,1,10,2345,01,00,00,00,01 +6542b541-6e10-411f-9b2a-7c0b93b0aa68,1034648,Eugene,Mcfarland,18,1892,1,10,2345,02,00,00,00,07 +396c4077-6a70-4a17-97fb-f8a0c06fdafe,1037015,Anna,Preston,39,1871,2,10,2345,01,00,00,00,01 +7e9dde5e-3fad-4b2e-b367-643c0dc8cabb,1038208,Rebecca N,Alexander,49,1861,2,10,2345,01,00,00,00,01 +f7d9e25f-c390-4222-ac24-4e93d72daa05,1038222,Martha,Ellis,37,1873,2,10,2345,01,00,00,00,01 +24b7afa1-8c49-4833-8292-c545c85d3b89,1039117,Otillia,Zeider,34,1876,2,10,2345,01,00,00,00,01 +4b416874-0c5c-4233-81ec-39223bc66f4f,1048673,Mary,Doyle,64,1846,2,10,2345,01,00,00,00,01 +a499b0dc-7ac0-4d61-b493-91a3036c712e ,2484121,ANNIE ,FAUBLE ,26,1884,2,10,2222,01,00,00,00,01 +ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,2485245,MARY ,REESE ,35,1875,2,10,2222,01,00,00,00,01 +ad6442b5-42bc-4c2e-a517-5a951d989a92 ,2485245,MARY ,REESE ,11,1899,2,10,2222,02,00,00,01,03 +b0b6695f-dfa5-4e4d-bc75-798c27195fff ,2485245,SALLY ,REESE ,9,1901,2,10,2222,03,00,00,01,03 +9e807937-de09-414c-bfb2-ac821e112929 ,2485411,JOHN ,SHIELDS ,21,1889,1,10,2222,01,00,00,00,01 +426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,2485411,ANNE ,FAUBLE ,26,1884,2,10,2222,02,00,00,00,11 +a76697d9-b0c8-4774-bc3e-12a7e403c7e6 
,2485601,JOHN ,COLLINS ,17,1893,1,10,2222,03,00,00,00,10 +3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,2485601,MAGGIE ,COLLINS ,16,1894,2,10,2222,04,00,00,00,10 +49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,2485601,MARY ,COLLINS ,12,1898,2,10,2222,05,00,00,00,10 +50b33ef6-259d-43af-8cdc-56a61f881169 ,2486481,WILLIAM H. ,SEWARD ,54,1856,1,10,2222,01,00,00,00,01 +952754a5-48b4-462a-ac57-e4a059a9ef98 ,2486951,ESTHER ,BIERHAHN ,40,1870,2,10,2222,01,00,00,00,01 +ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,2488461,CHARLES ,CLEVELAND ,45,1865,1,10,2222,01,00,00,00,01 +60a5052e-6d67-455a-a3aa-bb79560c7d8d ,2489211,SUSAN ,WILSON ,60,1850,2,10,2222,01,00,00,00,01 +0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,2489301,ARCHER ,HARVEY ,20,1890,1,10,2222,01,00,00,00,01 +65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,2489831,ELIZABETH ,MC LEAN ,42,1868,2,10,2222,01,00,00,00,01 +72cbe5fa-f558-4393-8423-1842fadf7f11 ,2490611,MARY A. ,FLEMMING ,73,1837,2,10,2222,01,00,00,00,01 +44693008-fd6f-48fe-9c52-e6c07baff361 ,2490671,BESSIE ,CHAMBERS ,2,1908,2,10,2222,01,00,00,00,01 +bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,2491501,THOMAS ,GRAHAM ,64,1846,1,10,2222,01,00,00,00,01 +a7b10530-b7c9-44d5-9125-c603f392d6d3 ,2491801,EDWARD ,DEKAY ,35,1875,1,10,2222,01,00,00,00,01 +1e635c1c-7faa-4270-acf3-a22635884b90 ,2492069,NATHEN ,THORPE ,74,1836,1,10,2222,01,00,00,00,01 +d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,2492741,JOB ,FOSTER ,26,1884,1,10,2222,01,02,00,00,01 +2a35bae5-3120-4e2c-87da-694d4419c9ce ,2492741,JEZEBEL ,FOSTER ,22,1888,2,10,2222,02,01,00,03,02 +94460fc2-954b-469d-9726-f7126c30e5e2 ,2492741,ELIZA ,GOODWIN ,39,1871,2,10,2222,03,00,00,00,06 +620b6ebb-82e6-42db-8aae-300ca2be0c00 ,2492741,MARY ,GOODWIN ,17,1893,2,10,2222,04,00,00,03,08 +bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,2492741,JO ,GOODWIN ,15,1895,1,10,2222,05,00,00,03,08 +7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,2493841,PHINEAS ,TAYLOR ,39,1871,1,10,2222,01,00,00,00,01 +a0f33b36-cef7-4949-a031-22b90f1055d4 ,2494397,MARY A. ,LORD ,54,1856,2,10,2222,01,00,00,00,01 +1a76745c-acf8-48a0-9992-7fb10c11710b ,2494421,E.B. 
,ALLEN ,21,1889,1,10,2222,01,00,00,00,01 diff --git a/hlink/tests/input_data/training_data_households.parquet b/hlink/tests/input_data/training_data_households.parquet new file mode 100644 index 0000000..91c979e Binary files /dev/null and b/hlink/tests/input_data/training_data_households.parquet differ diff --git a/hlink/tests/input_data/training_data_long.csv b/hlink/tests/input_data/training_data_long.csv new file mode 100644 index 0000000..e40e304 --- /dev/null +++ b/hlink/tests/input_data/training_data_long.csv @@ -0,0 +1,25 @@ +id_a,id_b,match +0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,1 +1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,1 +095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,1 +6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,1 +EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,1 +AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,1 +8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1 +F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,1 +D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,1 +CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,1 +4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,1 +CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1 +2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,0 +195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,1 +74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,0 +F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,0 +6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,0 +EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,0 +47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,0 +7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,0 +A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,0 +E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,0 +671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,0 +81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,0 \ No newline at end of file diff --git a/hlink/tests/input_data/training_data_long_a.csv b/hlink/tests/input_data/training_data_long_a.csv new file mode 100644 index 0000000..f900956 --- /dev/null +++ b/hlink/tests/input_data/training_data_long_a.csv @@ -0,0 +1,25 @@ +namelast,id,bpl,namefrst_unstd,sex,region +cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,3100,gerald,1,4 +symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,3600,horace,1,2 +abrahams,095AD921-9B08-468E-817A-44879FBCADDE,60094,isiah,1,99 +eilbatt,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,4700,reginald,1,6 +knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,2100,andrew,1,6 +caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,15010,daisy,2,99 +sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,1700,max,1,3 +gibson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,5500,dwight,1,3 +hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,5600,karl,1,8 +king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,3800,virgel,1,4 +looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,4700,sadie,2,6 +rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,1700,hubert,1,3 +mugrdickian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,3600,misak,1,2 
+brightman,195EA695-D047-4045-8757-E7A22F12E148,3900,austin,1,3 +harman,74941094-9737-40F0-BF3C-0C2380B08040,5400,eston,1,5 +oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,4000,stephen,1,7 +kassik,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,5600,james,1,8 +wood,EE22ED8E-9544-4C77-A689-75895376E3EB,1700,dudley,1,3 +foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,4200,s,1,2 +huges,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,100,keneth,1,6 +caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,5000,nathan,1,1 +platta,E19E5381-C68D-4E03-A688-597DF13311CE,1200,norman,1,5 +lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,1300,roy,1,5 +woodburne,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,200,walter,1,9 diff --git a/hlink/tests/input_data/training_data_long_b.csv b/hlink/tests/input_data/training_data_long_b.csv new file mode 100644 index 0000000..2ea4d33 --- /dev/null +++ b/hlink/tests/input_data/training_data_long_b.csv @@ -0,0 +1,25 @@ +namelast,id,bpl,namefrst_unstd,sex +cridlebaugh,001B8A74-3795-4997-BC5B-2A07257668F9,3100,gerald,1 +symonds,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,horace,1 +abrahams,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,isniah,1 +eilbott,00669345-C937-4405-A0F0-1FCA5204DF64,4700,reginald,1 +knopke,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,andrew,1 +caldwell,00849961-E52F-42F2-9B70-052606223052,15010,daisy,2 +sonnenschein,00C4291F-7064-4A81-8589-5854C367EEC4,1700,max,1 +gebson,010F244F-94D0-4295-82DB-0E172724358A,5500,dwight,1 +hegewald,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,karl,1 +king,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,virgil,1 +looney,016EF43B-E70F-440E-882E-E447663F682F,4700,sadye,2 +rydstrom,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,hubert,1 +mugrdichian,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,misak,1 +brightman,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,anstin,1 +harman,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,estan,1 +oglesby,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,stephen,1 +kassek,033FD0FA-C523-42B5-976A-751E830F7021,5600,james,1 +wood,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,dudley,1 +foulkrod,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,s,1 +hughes,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,kenneth,1 +caldwell,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,nathan,1 +platts,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,norman,1 +lipscomb,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,roy,1 +woodburn,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,walter,1 diff --git a/hlink/tests/integration_score_with_trained_models_test.py b/hlink/tests/integration_score_with_trained_models_test.py new file mode 100644 index 0000000..1626e74 --- /dev/null +++ b/hlink/tests/integration_score_with_trained_models_test.py @@ -0,0 +1,1229 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +def test_apply_chosen_model_RF( + spark, + training_conf, + training, + state_dist_path, + datasource_training_input, + potential_matches_path, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + td_path, pa_path, pb_path = datasource_training_input + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = td_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = { + "type": "random_forest", + "maxDepth": 6, + "numTrees": 100, + "featureSubsetStrategy": "sqrt", + } + + # training_conf["training"]["use_potential_matches_features"] = True + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = False + + training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + training.spark.read.csv( + potential_matches_path, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + + pm_df = training.spark.table("scored_potential_matches").toPandas() + + assert ( + pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[ + "probability" + ].iloc()[0] + > 0.5 + ) + assert ( + pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[ + "prediction" + ].iloc()[0] + == 1 + ) + assert ( + pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[ + "probability" + ].iloc()[0] + < 0.5 + ) + assert ( + pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[ + "prediction" + ].iloc()[0] + == 0 + ) + assert ( + pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[ + "probability" + ].iloc()[0] + < 0.5 + ) + assert ( + pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[ + "prediction" + ].iloc()[0] + == 0 + ) + assert ( + pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[ + "probability" + ].iloc()[0] + > 0.5 + ) + assert ( + pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[ + "prediction" + ].iloc()[0] + == 1 + ) + + +def test_apply_chosen_model_RF_pm_IDs_only( + spark, + training_conf, + training, + state_dist_path, + training_data_path, + potential_matches_path_ids_only, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, 
+ }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = { + "type": "random_forest", + "maxDepth": 6, + "numTrees": 100, + "featureSubsetStrategy": "sqrt", + } + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + potential_matches = training.spark.read.csv( + potential_matches_path_ids_only, header=True, inferSchema=True + ) + potential_matches.write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + + pm_df = training.spark.table("scored_potential_matches").toPandas() + + assert pm_df.shape == (9, 5) + assert pm_df.query("id_a == 10 and id_b == 10")["prediction"].iloc()[0] == 1 + assert pm_df.query("id_a == 20 and id_b == 30")["prediction"].iloc()[0] == 1 + assert ( + round( + list( + pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc()[0] + )[0], + 0, + ) + == 1 + ) + assert ( + round( + list( + pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc()[0] + )[1], + 0, + ) + == 0 + ) + + +def test_apply_chosen_model_probit_pm_IDs_only( + spark, + training_conf, + training, + state_dist_path, + training_data_path, + potential_matches_path_ids_only, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5} + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + potential_matches = training.spark.read.csv( + potential_matches_path_ids_only, header=True, inferSchema=True + ) + potential_matches.write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + + pm_df = training.spark.table("scored_potential_matches").toPandas() + + assert pm_df.shape == (9, 4) + assert pm_df.query("id_a == 10 and id_b == 10")["prediction"].iloc()[0] == 1 + assert pm_df.query("id_a == 20 and id_b == 
30")["prediction"].iloc()[0] == 1 + assert ( + round(pm_df.query("id_a == 10 and id_b == 50")["probability"].iloc()[0], 0) == 0 + ) + assert ( + round(pm_df.query("id_a == 10 and id_b == 10")["probability"].iloc()[0], 0) == 1 + ) + + +def test_apply_chosen_model_RF_pm_IDs_only_full_data_out( + spark, + training_conf, + training, + state_dist_path, + training_data_path, + potential_matches_path_ids_only, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = { + "type": "random_forest", + "maxDepth": 6, + "numTrees": 100, + "featureSubsetStrategy": "sqrt", + } + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = False + + potential_matches = training.spark.read.csv( + potential_matches_path_ids_only, header=True, inferSchema=True + ) + potential_matches.write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + + pm_df = training.spark.table("scored_potential_matches").toPandas() + + assert pm_df.shape == (9, 13) + assert pm_df.query("id_a == 10 and id_b == 10")["prediction"].iloc()[0] == 1 + assert pm_df.query("id_a == 20 and id_b == 30")["prediction"].iloc()[0] == 1 + assert ( + round( + list( + pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc()[0] + )[0], + 0, + ) + == 1 + ) + assert ( + round( + list( + pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc()[0] + )[1], + 0, + ) + == 0 + ) + + +def test_apply_chosen_model_probit( + spark, + training_conf, + training, + state_dist_path, + datasource_training_input, + potential_matches_path, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + td_path, pa_path, pb_path = datasource_training_input + + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = td_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 
+def test_apply_chosen_model_RF_pm_IDs_only_full_data_out(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    training_data_path,
+    potential_matches_path_ids_only,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen random forest model on an IDs-only potential matches dataset, keeping the full data columns in the output."""
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "column_name": "bpl",
+            "key_count": 1,
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = training_data_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {
+        "type": "random_forest",
+        "maxDepth": 6,
+        "numTrees": 100,
+        "featureSubsetStrategy": "sqrt",
+    }
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = False
+
+    potential_matches = training.spark.read.csv(
+        potential_matches_path_ids_only, header=True, inferSchema=True
+    )
+    potential_matches.write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert pm_df.shape == (9, 13)
+    assert pm_df.query("id_a == 10 and id_b == 10")["prediction"].iloc[0] == 1
+    assert pm_df.query("id_a == 20 and id_b == 30")["prediction"].iloc[0] == 1
+    assert (
+        round(
+            list(
+                pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc[0]
+            )[0],
+            0,
+        )
+        == 1
+    )
+    assert (
+        round(
+            list(
+                pm_df.query("id_a == 10 and id_b == 50")["probability_array"].iloc[0]
+            )[1],
+            0,
+        )
+        == 0
+    )
+
+
+def test_apply_chosen_model_probit(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen probit model on the potential matches dataset."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
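+# The chosen_model "threshold" is presumably the probability cutoff used to
+# derive prediction. With threshold = 0.8, prediction == 1 implies
+# probability > 0.8, so the looser > 0.5 bounds asserted below still hold.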
+def test_step_3_apply_chosen_model_logistic_regression(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen logistic regression model on the potential matches dataset."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {
+        "type": "logistic_regression",
+        "threshold": 0.8,
+    }
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
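+# The tree hyperparameters in the next two tests are written as floats
+# (maxDepth = 6.0, etc.). Spark's tree estimators take integers for these
+# params, so hlink presumably coerces them before constructing the model.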
+def test_step_3_apply_chosen_model_decision_tree(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen decision tree model on the potential matches dataset."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {
+        "type": "decision_tree",
+        "maxDepth": 6.0,
+        "minInstancesPerNode": 2.0,
+        "maxBins": 4.0,
+    }
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = td_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio" + training_conf["training"]["threshold_ratio"] = 1.3 + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5} + # training_conf["training"]["use_potential_matches_features"] = True + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + training.spark.read.csv( + potential_matches_path, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + + pm_df = training.spark.table("scored_potential_matches").toPandas() + + assert ( + pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[ + "probability" + ].iloc()[0] + > 0.5 + ) + assert ( + pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[ + "prediction" + ].iloc()[0] + == 1 + ) + assert ( + pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[ + "probability" + ].iloc()[0] + < 0.5 + ) + assert ( + pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[ + "prediction" + ].iloc()[0] + == 0 + ) + assert ( + pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[ + "probability" + ].iloc()[0] + < 0.5 + ) + assert ( + pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[ + "prediction" + ].iloc()[0] + == 0 + ) + assert ( + pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[ + "probability" + ].iloc()[0] + > 0.5 + ) + assert ( + pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[ + "prediction" + ].iloc()[0] + == 1 + ) + + +def test_step_3_apply_chosen_model_boosted_trees_threshold( + spark, + training_conf, + training, + state_dist_path, + datasource_training_input, + potential_matches_path, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + td_path, pa_path, pb_path = datasource_training_input + + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": 
+def test_step_3_apply_chosen_model_RF_threshold(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen random forest model with threshold-ratio de-duplication."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio"
+    training_conf["training"]["threshold_ratio"] = 1.3
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {
+        "type": "random_forest",
+        "maxDepth": 6,
+        "numTrees": 100,
+        "featureSubsetStrategy": "sqrt",
+    }
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
+def test_step_3_apply_chosen_model_probit_threshold(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen probit model with threshold-ratio de-duplication."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio"
+    training_conf["training"]["threshold_ratio"] = 1.3
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
+def test_step_3_apply_chosen_model_boosted_trees_threshold(
+    spark,
+    training_conf,
+    training,
+    state_dist_path,
+    datasource_training_input,
+    potential_matches_path,
+    spark_test_tmp_dir_path,
+    matching,
+):
+    """Test running the chosen gradient boosted trees model with threshold-ratio de-duplication."""
+    td_path, pa_path, pb_path = datasource_training_input
+
+    training_conf["comparison_features"] = [
+        {
+            "alias": "regionf",
+            "column_name": "region",
+            "comparison_type": "fetch_a",
+            "categorical": True,
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "state_distance",
+            "key_count": 1,
+            "column_name": "bpl",
+            "comparison_type": "geo_distance",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+            "table_name": "state_distances_lookup",
+            "distances_file": state_dist_path,
+        },
+    ]
+
+    training_conf["training"]["dataset"] = td_path
+    training_conf["training"]["dependent_var"] = "match"
+    training_conf["training"]["independent_vars"] = [
+        "namelast_jw",
+        "regionf",
+        "state_distance",
+    ]
+    training_conf["training"]["chosen_model"] = {
+        "type": "gradient_boosted_trees",
+        "maxDepth": 4.0,
+        "minInstancesPerNode": 1.0,
+        "maxBins": 6.0,
+    }
+    training_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio"
+    training_conf["training"]["threshold_ratio"] = 1.3
+    # training_conf["training"]["use_potential_matches_features"] = True
+    training_conf["training"]["score_with_model"] = True
+    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+    training_conf["drop_data_from_scored_matches"] = True
+
+    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_a")
+    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("prepped_df_b")
+    training.spark.read.csv(
+        potential_matches_path, header=True, inferSchema=True
+    ).write.mode("overwrite").saveAsTable("potential_matches")
+
+    training.run_step(0)
+    training.run_step(1)
+    training.run_step(2)
+    matching.run_step(2)
+
+    pm_df = training.spark.table("scored_potential_matches").toPandas()
+
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_a == '81E992C0-3796-4BE7-B02E-9CAD0289C6EC'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "probability"
+        ].iloc[0]
+        < 0.5
+    )
+    assert (
+        pm_df.query("id_b == '033FD0FA-C523-42B5-976A-751E830F7021'")[
+            "prediction"
+        ].iloc[0]
+        == 0
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "probability"
+        ].iloc[0]
+        > 0.5
+    )
+    assert (
+        pm_df.query("id_b == '00849961-E52F-42F2-9B70-052606223052'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
diff --git a/hlink/tests/integration_test.py b/hlink/tests/integration_test.py
new file mode 100644
index 0000000..df988d7
--- /dev/null
+++ b/hlink/tests/integration_test.py
@@ -0,0 +1,51 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +def test_input_args_preprocessing(spark, main, integration_conf): + main.link_run.config = integration_conf + main.do_run_all_steps("preprocessing training matching") + + scored_matches = main.spark.table("scored_potential_matches").toPandas() + row = scored_matches.query("id_a == 10 and id_b == 10").iloc[0] + + assert all( + elem not in list(scored_matches.columns) + for elem in [ + "region_a", + "region_b", + "age_a", + "age_b", + "serialp_a", + "serialp_b", + "bpl_a", + "bpl_b", + ] + ) + assert all( + elem in list(scored_matches.columns) + for elem in [ + "id_a", + "id_b", + "namelast_jw", + "regionf", + "hits", + "sex_equals", + "namelast_jw_imp", + "sex_equals_imp", + "hits_imp", + "regionf_onehotencoded", + "sex_regionf_interaction", + "features_vector", + "rawPrediction", + "probability_array", + "probability", + "second_best_prob", + "ratio", + "prediction", + ] + ) + assert row.probability.round(2) > 0.6 + assert row.prediction == 1 diff --git a/hlink/tests/logging_test.py b/hlink/tests/logging_test.py new file mode 100644 index 0000000..e0dd63e --- /dev/null +++ b/hlink/tests/logging_test.py @@ -0,0 +1,16 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pytest +import hlink.scripts.main +import logging + + +def test_step_log(spark, preprocessing_conf, main, caplog): + caplog.set_level(logging.INFO) + main.do_set_link_task("preprocessing") + main.do_run_step("0") + print(caplog.records[0]) + assert "Finished Preprocessing - step 0: register raw dataframes in" in caplog.text diff --git a/hlink/tests/main_loop_test.py b/hlink/tests/main_loop_test.py new file mode 100755 index 0000000..35a82c7 --- /dev/null +++ b/hlink/tests/main_loop_test.py @@ -0,0 +1,155 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import os +import pandas as pd +import pytest +from argparse import Namespace +from pyspark.ml.feature import VectorAssembler, OneHotEncoderEstimator +from hlink.linking.link_run import link_task_choices + + +def test_do_get_steps(capsys, main, spark): + for task in link_task_choices: + task_inst = getattr(main.link_run, task) + steps = task_inst.get_steps() + main.do_set_link_task(task) + main.do_get_steps("") + output = capsys.readouterr().out + for step in steps: + if str(step) not in output: + print(type(step)) + print(step) + print(output) + assert str(step) in output + + +def test_do_set_link_task(capsys, main): + main.current_link_task = main.link_run.matching + main.do_set_link_task("preprocessing") + assert main.current_link_task is main.link_run.preprocessing + output = capsys.readouterr().out + assert "preprocessing" in output.lower() + + +def test_output_csv_array_and_vector_data( + main, preprocessing, spark, preprocessing_conf_household_data +): + """ Test if csv output works for array and vector data. 
""" + preprocessing_conf_household_data["feature_selections"] = [ + { + "output_col": "namefrst_related", + "input_col": "namefrst_clean", + "transform": "related_individuals", + "family_id": "serial", + "relate_col": "relate", + "top_code": 10, + "bottom_code": 3, + }, + { + "input_column": "namelast_clean", + "output_column": "namelast_clean_bigrams", + "transform": "bigrams", + }, + { + "input_column": "namelast_clean", + "output_column": "namelast_clean_soundex", + "transform": "soundex", + }, + { + "input_column": "namefrst_orig", + "output_column": "namefrst_orig_soundex", + "transform": "soundex", + }, + { + "input_columns": ["namelast_clean_soundex", "namefrst_orig_soundex"], + "output_column": "namelast_frst_soundex", + "transform": "array", + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + data_a = preprocessing.spark.table("prepped_df_a") + # data_a.withColumn('dense_vector_ex', Vectors.dense([0.0, 0.5, 0.6, 0.8])) + + encoder = OneHotEncoderEstimator( + inputCols=["pernum"], outputCols=["pernum_onehotencoded"] + ) + model = encoder.fit(data_a) + data_e = model.transform(data_a) + + assembler = VectorAssembler( + inputCols=["bpl", "sex", "pernum_onehotencoded"], outputCol="feature_vector" + ) + data_v = assembler.transform(data_e) + + preprocessing.run_register_python("prepped_df_v", lambda: data_v) + + current_dir = os.getcwd() + output_path_v = os.path.join(current_dir, "output_data/array_vector_test.csv") + + main.do_csv(args=f"prepped_df_v {output_path_v}") + + assert os.path.isfile(output_path_v) + + prepped_v = pd.read_csv(output_path_v) + assert prepped_v.shape == (58, 24) + + assert ( + prepped_v.query("namelast_orig == 'Beebe'")["feature_vector"].iloc[0] + == "(7,[0,1,3],[10.0,2.0,1.0])" + ) + assert ( + prepped_v.query("namelast_orig == 'Morgan'")["feature_vector"].iloc[0] + == "(7,[0,1],[10.0,1.0])" + ) + main.do_drop_all("") + os.remove(output_path_v) + + +def test_crosswalk_reporting( + main, + capsys, + spark, + crosswalk_input_paths, + crosswalk_validation_path, + crosswalk_with_round_validation_path, + tmp_path, +): + ( + raw_df_a_path, + raw_df_b_path, + predicted_matches_path, + hh_predicted_matches_path, + ) = crosswalk_input_paths + + spark.read.csv( + raw_df_a_path, header=True, inferSchema=True + ).createOrReplaceTempView("raw_df_a") + spark.read.csv( + raw_df_b_path, header=True, inferSchema=True + ).createOrReplaceTempView("raw_df_b") + spark.read.csv( + predicted_matches_path, header=True, inferSchema=True + ).createOrReplaceTempView("predicted_matches") + spark.read.csv( + hh_predicted_matches_path, header=True, inferSchema=True + ).createOrReplaceTempView("hh_predicted_matches") + + output_path = os.path.join(tmp_path, "crosswalk.csv") + main.do_x_crosswalk(args=f"{output_path} histid,age") + assert [row for row in open(output_path)] == [ + row for row in open(crosswalk_validation_path) + ] + + output_path = os.path.join(tmp_path, "crosswalk_with_round.csv") + print(f"Testing {output_path} with round column") + main.do_x_crosswalk(args=f"{output_path} --include-rounds histid,age") + + assert [row for row in open(output_path)] == [ + row for row in open(crosswalk_with_round_validation_path) + ] diff --git a/hlink/tests/main_test.py b/hlink/tests/main_test.py new file mode 100644 index 0000000..c2d39e1 --- /dev/null +++ b/hlink/tests/main_test.py @@ -0,0 +1,309 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import sys +import pytest +import json +import toml +from pathlib import Path + +from hlink.scripts.main import load_conf +from hlink.errors import UsageError + +users = ("jesse", "woody") + + +@pytest.fixture() +def global_conf(tmp_path): + """The contents of the test global config as a dictionary.""" + global_conf = {} + global_conf["users_dir"] = str(tmp_path / "users_dir") + global_conf["users_dir_fast"] = str(tmp_path / "users_dir_fast") + global_conf["python"] = "python" + + return global_conf + + +@pytest.fixture() +def set_up_global_conf_file(monkeypatch, tmp_path, global_conf): + """Create the global config file and set the HLINK_CONF environment variable. + + The contents of the global config file are the same as the `global_conf` fixture + dictionary. + """ + file = tmp_path / "global_config_file.json" + + with open(file, "w") as f: + json.dump(global_conf, f) + + monkeypatch.setenv("HLINK_CONF", str(file)) + + +def get_conf_dir(global_conf, user): + """Given the global config and user, return the path to the user's config directory.""" + return Path(global_conf["users_dir"]) / user / "confs" + + +@pytest.mark.parametrize("conf_file", ("my_conf", "my_conf.toml", "my_conf.json")) +@pytest.mark.parametrize("user", users) +def test_load_conf_does_not_exist_no_env(monkeypatch, tmp_path, conf_file, user): + monkeypatch.delenv("HLINK_CONF", raising=False) + + filename = str(tmp_path / conf_file) + toml_filename = filename + ".toml" + json_filename = filename + ".json" + + error_msg = f"Couldn't find any of these three files: {filename}, {toml_filename}, {json_filename}" + with pytest.raises(FileNotFoundError, match=error_msg): + load_conf(filename, user) + + +@pytest.mark.parametrize("conf_file", ("my_conf.json",)) +@pytest.mark.parametrize("user", users) +def test_load_conf_json_exists_no_env(monkeypatch, tmp_path, conf_file, user): + monkeypatch.delenv("HLINK_CONF", raising=False) + monkeypatch.chdir(tmp_path) + filename = str(tmp_path / conf_file) + + contents = {} + with open(filename, "w") as f: + json.dump(contents, f) + + conf = load_conf(filename, user) + assert conf["conf_path"] == filename + + +@pytest.mark.parametrize("conf_name", ("my_conf", "my_conf.json", "my_conf.toml")) +@pytest.mark.parametrize("user", users) +def test_load_conf_json_exists_ext_added_no_env(monkeypatch, tmp_path, conf_name, user): + monkeypatch.delenv("HLINK_CONF", raising=False) + monkeypatch.chdir(tmp_path) + filename = str(tmp_path / conf_name) + ".json" + + contents = {} + with open(filename, "w") as f: + json.dump(contents, f) + + conf = load_conf(str(tmp_path / conf_name), user) + assert conf["conf_path"] == filename + + +@pytest.mark.parametrize("conf_file", ("my_conf.toml",)) +@pytest.mark.parametrize("user", users) +def test_load_conf_toml_exists_no_env(monkeypatch, tmp_path, conf_file, user): + monkeypatch.delenv("HLINK_CONF", raising=False) + monkeypatch.chdir(tmp_path) + filename = str(tmp_path / conf_file) + + contents = {} + with open(filename, "w") as f: + toml.dump(contents, f) + + conf = load_conf(filename, user) + assert conf["conf_path"] == filename + + +@pytest.mark.parametrize("conf_name", ("my_conf", "my_conf.json", "my_conf.toml")) +@pytest.mark.parametrize("user", users) +def test_load_conf_toml_exists_ext_added_no_env(monkeypatch, tmp_path, conf_name, user): + monkeypatch.delenv("HLINK_CONF", 
raising=False) + monkeypatch.chdir(tmp_path) + filename = str(tmp_path / conf_name) + ".toml" + + contents = {} + with open(filename, "w") as f: + toml.dump(contents, f) + + conf = load_conf(str(tmp_path / conf_name), user) + assert conf["conf_path"] == filename + + +@pytest.mark.parametrize("conf_name", ("my_conf", "testing.txt", "what.yaml")) +@pytest.mark.parametrize("user", users) +def test_load_conf_unrecognized_ext_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user +): + monkeypatch.chdir(tmp_path) + + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + file = conf_dir / conf_name + file.touch() + + error_msg = ( + f"The file {file} exists, but it doesn't have a '.toml' or '.json' extension." + ) + with pytest.raises(UsageError, match=error_msg): + load_conf(str(file), user) + + +def test_load_conf_keys_set_no_env(monkeypatch, tmp_path): + monkeypatch.delenv("HLINK_CONF", raising=False) + monkeypatch.chdir(tmp_path) + filename = str(tmp_path / "keys_test.json") + contents = {"key1": "value1", "rock": "stone", "how": "about that"} + + with open(filename, "w") as f: + json.dump(contents, f) + + conf = load_conf(filename, "test") + + for (key, value) in contents.items(): + assert conf[key] == value + + # Check for extra keys added by load_conf() + assert "conf_path" in conf + assert "derby_dir" in conf + assert "warehouse_dir" in conf + assert "spark_tmp_dir" in conf + assert "log_file" in conf + assert "python" in conf + + +@pytest.mark.parametrize("global_conf", ("my_global_conf.json", "test.json")) +def test_load_conf_global_conf_does_not_exist_env(monkeypatch, tmp_path, global_conf): + global_path = str(tmp_path / global_conf) + monkeypatch.setenv("HLINK_CONF", global_path) + + with pytest.raises(FileNotFoundError): + load_conf("notthere.toml", "test") + + +@pytest.mark.parametrize("conf_file", ("my_conf", "my_conf.json", "my_conf.toml")) +@pytest.mark.parametrize("user", users) +def test_load_conf_does_not_exist_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user +): + monkeypatch.chdir(tmp_path) + + conf_dir = get_conf_dir(global_conf, user) + filename = str(conf_dir / conf_file) + toml_filename = filename + ".toml" + json_filename = filename + ".json" + + error_msg = f"Couldn't find any of these three files: {filename}, {toml_filename}, {json_filename}" + with pytest.raises(FileNotFoundError, match=error_msg): + load_conf(conf_file, user) + + +@pytest.mark.parametrize("conf_file", ("my_conf.json",)) +@pytest.mark.parametrize("user", users) +def test_load_conf_json_exists_in_conf_dir_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user +): + monkeypatch.chdir(tmp_path) + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + + file = conf_dir / conf_file + contents = {} + + with open(file, "w") as f: + json.dump(contents, f) + + conf = load_conf(conf_file, user) + assert conf["conf_path"] == str(file) + + +@pytest.mark.parametrize("conf_file", ("my_conf.toml",)) +@pytest.mark.parametrize("user", users) +def test_load_conf_toml_exists_in_conf_dir_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user +): + monkeypatch.chdir(tmp_path) + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + + file = conf_dir / conf_file + contents = {} + + with open(file, "w") as f: + toml.dump(contents, f) + + conf = load_conf(conf_file, user) + assert conf["conf_path"] == str(file) + + 
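+# Taken together, these tests pin down the assumed search order of
+# load_conf: try the path as given, then with ".toml" and ".json" appended;
+# when HLINK_CONF is set, repeat that search under <users_dir>/<user>/confs.
+# A file that exists but lacks a .toml/.json extension raises UsageError.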
+@pytest.mark.parametrize("conf_name", ("my_conf", "test", "testingtesting123.txt")) +@pytest.mark.parametrize("user", users) +def test_load_conf_json_exists_in_conf_dir_ext_added_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user +): + monkeypatch.chdir(tmp_path) + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + + conf_file = conf_name + ".json" + file = conf_dir / conf_file + contents = {} + + with open(file, "w") as f: + json.dump(contents, f) + + conf = load_conf(conf_name, user) + assert conf["conf_path"] == str(file) + + +@pytest.mark.parametrize("conf_name", ("my_conf", "test", "testingtesting123.txt")) +@pytest.mark.parametrize("user", users) +def test_load_conf_toml_exists_in_conf_dir_ext_added_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user +): + monkeypatch.chdir(tmp_path) + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + + conf_file = conf_name + ".toml" + file = conf_dir / conf_file + contents = {} + + with open(file, "w") as f: + toml.dump(contents, f) + + conf = load_conf(conf_name, user) + assert conf["conf_path"] == str(file) + + +@pytest.mark.parametrize("conf_name", ("my_conf", "testing.txt", "what.yaml")) +@pytest.mark.parametrize("user", users) +def test_load_conf_unrecognized_ext_no_env(monkeypatch, tmp_path, conf_name, user): + monkeypatch.delenv("HLINK_CONF", raising=False) + monkeypatch.chdir(tmp_path) + + file = tmp_path / conf_name + file.touch() + + error_msg = f"The file {conf_name} exists, but it doesn't have a '.toml' or '.json' extension." + with pytest.raises(UsageError, match=error_msg): + load_conf(conf_name, user) + + +def test_load_conf_keys_set_env( + monkeypatch, tmp_path, set_up_global_conf_file, global_conf +): + monkeypatch.chdir(tmp_path) + user = "test" + conf_dir = get_conf_dir(global_conf, user) + conf_dir.mkdir(parents=True) + file = conf_dir / "keys_test.json" + filename = str(file) + + contents = {"key1": "value1", "rock": "stone", "how": "about that"} + + with open(file, "w") as f: + json.dump(contents, f) + + conf = load_conf(filename, user) + + for (key, value) in contents.items(): + assert conf[key] == value + + # Check for extra keys added by load_conf() + assert "conf_path" in conf + assert "derby_dir" in conf + assert "warehouse_dir" in conf + assert "spark_tmp_dir" in conf + assert "log_file" in conf + assert "python" in conf diff --git a/hlink/tests/matching_blocking_explode_test.py b/hlink/tests/matching_blocking_explode_test.py new file mode 100755 index 0000000..2ab6bc8 --- /dev/null +++ b/hlink/tests/matching_blocking_explode_test.py @@ -0,0 +1,127 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pandas as pd +from hlink.linking.matching.link_step_score import LinkStepScore + + +def test_steps_1_2_matching( + spark, blocking_explode_conf, matching_test_input, matching, main +): + """Test explode step with blocking columns""" + table_a, table_b = matching_test_input + table_a.createOrReplaceTempView("prepped_df_a") + table_b.createOrReplaceTempView("prepped_df_b") + + matching.run_step(0) + + expl_a = spark.table("exploded_df_a").toPandas() + expl_b = spark.table("exploded_df_b").toPandas() + + assert all( + elem in list(expl_a.columns) + for elem in ["namefrst", "namelast", "sex", "birthyr_3"] + ) + assert all( + elem in list(expl_b.columns) + for elem in ["namefrst", "namelast", "sex", "birthyr_3"] + ) + + matching.run_step(1) + + potential_matches_df = spark.table("potential_matches").toPandas() + + assert all( + elem not in list(potential_matches_df.columns) for elem in ["birthyr_3", "ssex"] + ) + + blocking_explode_conf["streamline_potential_match_generation"] = True + main.do_drop("potential_matches") + matching.run_step(1) + + pm_small = spark.table("potential_matches").toPandas() + + assert "ssex" not in list(pm_small.columns) + assert all( + elem in list(pm_small.columns) + for elem in ["id_a", "id_b", "namefrst_jw", "namelast_jw"] + ) + + LinkStepScore(matching)._create_features(matching.link_run.config) + pmp = spark.table("potential_matches_prepped").toPandas() + + assert all( + elem in list(pmp.columns) + for elem in ["id_a", "id_b", "namefrst_jw", "namelast_jw", "ssex"] + ) + + +def test_blocking_multi_layer_comparison( + matching_conf_namefrst_std_and_unstd, spark, preprocessing, matching +): + """ Test a blocking criteria comparison which contains an 'and' clause and a nested 'or' clause """ + + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + + pms = matching.spark.table("potential_matches").toPandas() + + assert len(pms["histid_a"]) == 4 + assert "ginny" not in list(pms["namefrst_unstd_a"]) + assert "jupiter" not in list(pms["namelast_clean_a"]) + for index, row in pms.iterrows(): + assert (row["namefrst_unstd_jw"] > 0.7) or (row["namefrst_std_jw"] > 0.7) + assert row["namelast_jw"] > 0.7 + + matching_conf_namefrst_std_and_unstd["comparisons"] = { + "operator": "AND", + "comp_a": { + "operator": "OR", + "comp_a": { + "feature_name": "namefrst_unstd_jw", + "threshold": 0.0, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namefrst_std_jw", + "threshold": 0.0, + "comparison_type": "threshold", + }, + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.0, + "comparison_type": "threshold", + }, + } + + spark.sql("drop table potential_matches") + matching.run_step(1) + + pm_no_clause = matching.spark.table("potential_matches").toPandas() + + assert len(pm_no_clause["histid_a"]) == 6 + assert "ginny" in list(pm_no_clause["namefrst_unstd_a"]) + assert "jupiter" in list(pm_no_clause["namelast_clean_a"]) + + only_test_df = pd.merge( + pm_no_clause, pms, on=["histid_a", "histid_b"], how="outer", indicator=True + ) + only_test_df = only_test_df[only_test_df["_merge"] == "left_only"] + assert len(only_test_df["histid_a"]) == 2 + for index, row in only_test_df.iterrows(): + assert ( + (row["namefrst_unstd_jw_x"] < 0.7) or (row["namefrst_std_jw_x"] < 0.7) + ) or (row["namelast_jw_x"] < 0.7) + + +# TODO: 
test_step_2_length_b
+
+# TODO: test_step_2_has_matching_element
+
+# TODO: test_step_2_error_no_comp_type
diff --git a/hlink/tests/matching_comparison_features_test.py b/hlink/tests/matching_comparison_features_test.py
new file mode 100755
index 0000000..b8b8308
--- /dev/null
+++ b/hlink/tests/matching_comparison_features_test.py
@@ -0,0 +1,1204 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import pandas as pd
+from hlink.linking.matching.link_step_score import LinkStepScore
+
+
+# TODO: add documentation
+def test_step_2_equals_and_equals_as_int(
+    spark, matching_household_conf, matching, preprocessing
+):
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "namelast_equal",
+            "column_name": "namelast_clean",
+            "comparison_type": "equals",
+        },
+        {
+            "alias": "namelast_equal_as_int",
+            "column_name": "namelast_clean",
+            "comparison_type": "equals_as_int",
+        },
+    ]
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_household_conf)
+
+    # Create pandas DFs of the step_1 potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 37
+    assert len(potential_matches_df.namelast_equal) == 37
+    assert len(potential_matches_df.namelast_equal_as_int) == 37
+    assert not potential_matches_df.query(
+        "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '"
+    )["namelast_equal"].iloc[0]
+    assert (
+        potential_matches_df.query(
+            "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '"
+        )["namelast_equal_as_int"].iloc[0]
+        == 0
+    )
+    assert potential_matches_df.query(
+        "id_a == '49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ' and id_b == '3575c9ba-1527-4ca2-aff0-d7c2d1efb421 '"
+    )["namelast_equal"].iloc[0]
+    assert (
+        potential_matches_df.query(
+            "id_a == '49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ' and id_b == '3575c9ba-1527-4ca2-aff0-d7c2d1efb421 '"
+        )["namelast_equal_as_int"].iloc[0]
+        == 1
+    )
+
+
+# TODO: add documentation
+def test_step_2_all_equals(spark, matching_household_conf, matching, preprocessing):
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "exact_all",
+            "column_names": ["namefrst_std", "namelast_clean", "age"],
+            "comparison_type": "all_equals",
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+    ]
+    matching_household_conf["training"]["dependent_var"] = "match"
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_household_conf)
+
+    # Create pandas DFs of the step_2 potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 37
+    assert len(potential_matches_df.exact_all) == 37
+    assert (
+        potential_matches_df.query(
+            "id_a == 'a499b0dc-7ac0-4d61-b493-91a3036c712e ' and id_b == '426f2cbe-32e1-45eb-9f86-89a2b9116b7e '"
+        )["exact_all"].iloc[0]
+        == 0
+    )
+    assert (
+        potential_matches_df.query(
+            "id_a == 'bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ' and id_b == 'bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 '"
+        )["exact_all"].iloc[0]
+        == 1
+    )
+
+
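+# either_are_1 and either_are_0 presumably flag pairs where either side's
+# value equals 1 (or 0), while fetch_b simply carries the b-side value
+# through, so "imm" below should be truthy for the foreign-born (nativity 5)
+# pairs.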
+def test_step_2_fetch_either_length(
+    spark, preprocessing, matching, matching_conf_nativity
+):
+    """Test the fetch_b, either_are_1, and either_are_0 comparison types."""
+    matching_conf_nativity["id_column"] = "histid"
+    matching_conf_nativity["column_mappings"] = [
+        {"column_name": "pair_no"},
+        {"column_name": "nativity"},
+        {"column_name": "county"},
+        {"column_name": "state"},
+        {"column_name": "street"},
+    ]
+    matching_conf_nativity["blocking"] = [{"column_name": "pair_no"}]
+    matching_conf_nativity["comparisons"] = {}
+    matching_conf_nativity["comparison_features"] = [
+        {
+            "alias": "imm",
+            "comparison_type": "fetch_b",
+            "column_name": "nativity",
+            "threshold": 5,
+            "categorical": True,
+        },
+        {
+            "alias": "either_1",
+            "column_name": "nativity",
+            "comparison_type": "either_are_1",
+            "categorical": True,
+        },
+        {
+            "alias": "either_0",
+            "column_name": "nativity",
+            "comparison_type": "either_are_0",
+            "categorical": True,
+        },
+    ]
+
+    matching_conf_nativity["training"]["independent_vars"] = [
+        "imm",
+        "sgen",
+        "street_jw",
+    ]
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_conf_nativity)
+
+    # Create pandas DFs of the step_1 potential matches table
+    matches = spark.table("potential_matches_prepped").toPandas()
+
+    assert matches.query("pair_no_a == 5")["either_1"].iloc[0]
+    assert not matches.query("pair_no_a == 5")["either_0"].iloc[0]
+    assert matches.query("pair_no_a == 2")["imm"].iloc[0]
+    assert matches.query("pair_no_a == 4")["imm"].iloc[0]
+
+
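+# The nativity codes here are assumed to follow IPUMS coding: 1-4 are
+# native-born (2-4 with at least one foreign-born parent, which is what the
+# sgen second-generation-immigrant comparison checks) and 5 is foreign-born,
+# which is what the imm fetch_a comparison checks against its threshold.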
+def test_step_2_nativity(spark, preprocessing, matching, matching_conf_nativity):
+    """Test nativity, imm, and sgen (second generation immigrant) code transforms, as well as nested comps and fetch_a."""
+    matching_conf_nativity["id_column"] = "histid"
+    matching_conf_nativity["column_mappings"] = [
+        {"column_name": "pair_no"},
+        {"column_name": "nativity"},
+        {"column_name": "county"},
+        {"column_name": "state"},
+        {"column_name": "street"},
+    ]
+    matching_conf_nativity["blocking"] = [{"column_name": "pair_no"}]
+    matching_conf_nativity["comparisons"] = {}
+    matching_conf_nativity["comparison_features"] = [
+        {
+            "alias": "imm",
+            "comparison_type": "fetch_a",
+            "column_name": "nativity",
+            "threshold": 5,
+            "categorical": True,
+        },
+        {
+            "alias": "sgen",
+            "column_name": "nativity",
+            "comparison_type": "second_gen_imm",
+            "categorical": True,
+        },
+        {
+            "alias": "street_jw",
+            "column_names": ["street", "county", "state"],
+            "comparison_type": "times",
+            "comp_a": {
+                "column_name": "street",
+                "comparison_type": "jaro_winkler",
+                "lower_threshold": 0.9,
+            },
+            "comp_b": {
+                "comparison_type": "and",
+                "comp_a": {"column_name": "county", "comparison_type": "equals"},
+                "comp_b": {"column_name": "state", "comparison_type": "equals"},
+            },
+        },
+    ]
+
+    matching_conf_nativity["training"]["independent_vars"] = [
+        "imm",
+        "sgen",
+        "street_jw",
+    ]
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_conf_nativity)
+
+    matches = spark.table("potential_matches_prepped").toPandas()
+
+    assert matches.query("pair_no_a == 4")["imm"].iloc[0]
+    assert matches.query("pair_no_a == 2")["sgen"].iloc[0]
+    assert not matches.query("pair_no_a == 5")["sgen"].iloc[0]
+    assert matches.query("pair_no_a == 4")["street_jw"].iloc[0] == 1
+    assert matches.query("pair_no_a == 1")["street_jw"].iloc[0] == 0
+
+
+def test_step_2_JW_only(spark, matching_conf, matching):
+    """Test that a regular comparison feature (Jaro-Winkler here) is still generated when there is no distance-lookup feature."""
+    matching_conf["comparison_features"] = [
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        }
+    ]
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    # Create pandas DFs of the step_2 potential matches table
+    potential_matches_df = spark.table("potential_matches").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 5
+    assert len(potential_matches_df.namelast_jw) == 5
+    assert (
+        potential_matches_df.query("id_a == 20 and id_b == 30")["namelast_jw"].iloc[0]
+        == 1
+    )
+    assert (
+        potential_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0]
+        > 0.87
+    )
+
+
+def test_step_2_JW_street(spark, matching_conf, matching):
+    """Test a comparison feature with an IF requirement (jw_street) alongside a regular Jaro-Winkler feature."""
+    matching_conf["comparison_features"] = [
+        {
+            "alias": "jw_street",
+            "column_name": "street",
+            "boundary": "enum_dist",
+            "comparison_type": "jaro_winkler_street",
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+    ]
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_conf)
+
+    # Create pandas DFs of the step_1 potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 5
+    assert len(potential_matches_df.jw_street) == 5
+    assert (
+        potential_matches_df.query("id_a == 20 and id_b == 50")["jw_street"].iloc[0]
+        == 0
+    )
+    assert (
+        potential_matches_df.query("id_a == 10 and id_b == 10")["jw_street"].iloc[0]
+        > 0.97
+    )
+    assert (
+        potential_matches_df.query("id_a == 20 and id_b == 30")["namelast_jw"].iloc[0]
+        == 1
+    )
+    assert (
+        0.48
+        < potential_matches_df.query("id_a == 30 and id_b == 50")["jw_street"].iloc[0]
+        < 0.5
+    )
+
+
+def test_step_2_maximum_jaro_winkler(spark, matching_conf, matching):
+    """Test creation of the maximum_jaro_winkler comparison feature."""
+    matching_conf["comparison_features"] = [
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "namefrst_jw",
+            "column_name": "namefrst",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "maximum_jw",
+            "column_names": ["namelast", "namefrst"],
+            "comparison_type": "maximum_jaro_winkler",
+        },
+    ]
+
+    matching_conf["training"]["dependent_var"] = "matching"
+    matching_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_conf)
+
+    # Create pandas DFs of the step_1 potential
matches table + potential_matches_prepped_df = spark.table("potential_matches_prepped").toPandas() + + # Create pandas DFs of the step_2 potential matches table + # potential_matches_prepped_df = spark.table("potential_matches").toPandas() + + # Make assertions on the data + assert len(potential_matches_prepped_df.id_a) == 5 + assert len(potential_matches_prepped_df.maximum_jw) == 5 + assert ( + potential_matches_prepped_df.query("id_a == 20 and id_b == 30")[ + "maximum_jw" + ].iloc[0] + == 1 + ) + assert ( + 0.98 + > potential_matches_prepped_df.query("id_a == 10 and id_b == 10")[ + "maximum_jw" + ].iloc[0] + > 0.97 + ) + assert ( + 0.855 + > potential_matches_prepped_df.query("id_a == 30 and id_b == 30")[ + "maximum_jw" + ].iloc[0] + > 0.84 + ) + assert ( + 0.80 + < potential_matches_prepped_df.query("id_a == 30 and id_b == 30")[ + "namefrst_jw" + ].iloc[0] + < 0.81 + ) + + +def test_step_2_max_jaro_winkler( + spark, matching_household_conf, matching, preprocessing +): + """ Test creation of max_jaro_winkler comparison feature""" + + matching_household_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast_clean", + "comparison_type": "jaro_winkler", + }, + { + "alias": "related_individual_max_jw", + "column_name": "namefrst_related", + "comparison_type": "max_jaro_winkler", + }, + ] + + matching_household_conf["feature_selections"] = [ + { + "output_col": "namefrst_related", + "input_col": "namefrst_std", + "transform": "related_individuals", + "family_id": "serialp", + "relate_col": "relate", + "top_code": 10, + "bottom_code": 3, + } + ] + matching_household_conf["training"]["dependent_var"] = "matching" + matching_household_conf["training"]["independent_vars"] = [ + "neighbor_namelast_jw_rate", + "neighbor_namelast_jw_rate_threshold", + "namelast_jw", + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(matching_household_conf) + + # Create pandas DFs of the step_1 potential matches table + potential_matches_prepped_df = spark.table("potential_matches_prepped").toPandas() + + # Make assertions on the data + assert len(potential_matches_prepped_df.id_a) == 37 + assert len(potential_matches_prepped_df.related_individual_max_jw) == 37 + assert ( + potential_matches_prepped_df.query( + "id_a == '3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ' and id_b == '49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 '" + )["related_individual_max_jw"].iloc[0] + == 1 + ) + assert potential_matches_prepped_df.query( + "id_a == '3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ' and id_b == '49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 '" + )["namefrst_related_a"].iloc[0] == ["john", "mary"] + assert potential_matches_prepped_df.query( + "id_a == '3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ' and id_b == '49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 '" + )["namefrst_related_b"].iloc[0] == ["john", "maggie"] + assert ( + round( + potential_matches_prepped_df.query( + "id_a == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 ' and id_b == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 '" + )["related_individual_max_jw"].iloc[0], + 2, + ) + == 0.63 + ) + assert potential_matches_prepped_df.query( + "id_a == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 ' and id_b == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 '" + )["namefrst_related_a"].iloc[0] == ["sally"] + assert potential_matches_prepped_df.query( + "id_a == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 ' and id_b == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 '" + )["namefrst_related_b"].iloc[0] == ["mary"] + + +def 
test_step_2_rel_jaro_winkler(
+    spark, matching_household_conf, matching, preprocessing
+):
+    """Test creation of the rel_jaro_winkler comparison feature, with and
+    without a lower_threshold."""
+
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "rel",
+            "column_name": "namefrst_related_rows",
+            "comparison_type": "rel_jaro_winkler",
+            "jw_threshold": 0.9,
+            "age_threshold": 5,
+        },
+        {
+            "alias": "rel_threshold",
+            "column_name": "namefrst_related_rows",
+            "comparison_type": "rel_jaro_winkler",
+            "jw_threshold": 0.9,
+            "age_threshold": 5,
+            "lower_threshold": 1,
+        },
+    ]
+
+    matching_household_conf["feature_selections"] = [
+        {
+            "family_id": "serialp",
+            "input_cols": ["namefrst_std", "birthyr", "sex"],
+            "output_col": "namefrst_related_rows",
+            "transform": "related_individual_rows",
+            "filters": [
+                {"column": "relate", "max": 10, "min": 3},
+                {"column": "age", "max": 99, "min": 8, "dataset": "b"},
+            ],
+        }
+    ]
+
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_household_conf)
+
+    # Create a pandas DF of the prepped potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 37
+    assert len(potential_matches_df.namefrst_related_rows_a) == 37
+    assert len(potential_matches_df.rel) == 37
+    assert len(potential_matches_df.rel_threshold) == 37
+
+    assert (
+        len(
+            potential_matches_df.query(
+                "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '"
+            )["namefrst_related_rows_a"].iloc[0]
+        )
+        == 3
+    )
+    assert (
+        len(
+            potential_matches_df.query(
+                "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '"
+            )["namefrst_related_rows_b"].iloc[0]
+        )
+        == 1
+    )
+    assert (
+        potential_matches_df.query(
+            "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '"
+        )["rel"].iloc[0]
+        == 1
+    )
+    assert potential_matches_df.query(
+        "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '"
+    )["rel_threshold"].iloc[0]
+
+
+def test_step_2_jaro_winkler_rate(
+    spark, matching_household_conf, matching, preprocessing
+):
+    """Test creation of the jaro_winkler_rate comparison feature."""
+
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "neighbor_namelast_jw_rate",
+            "column_name": "namelast_neighbors",
+            "comparison_type": "jaro_winkler_rate",
+            "jw_threshold": 0.95,
+        },
+        {
+            "alias": "neighbor_namelast_jw_rate_threshold",
+            "column_name": "namelast_neighbors",
+            "comparison_type": "jaro_winkler_rate",
+            "jw_threshold": 0.95,
+            "threshold": "0.8",
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+    ]
+
+    matching_household_conf["feature_selections"] = [
+        {
+            "output_column": "namelast_neighbors",
+            "input_column": "namelast_clean",
+            "transform": "neighbor_aggregate",
+            "neighborhood_column": "enumdist",
+            "sort_column": "serialp",
+            "range": 10,
+        }
+    ]
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
"namelast_jw", + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(matching_household_conf) + + # Create pandas DFs of the step_1 potential matches table + potential_matches_df = spark.table("potential_matches_prepped").toPandas() + + # Make assertions on the data + assert len(potential_matches_df.id_a) == 37 + assert len(potential_matches_df.neighbor_namelast_jw_rate) == 37 + assert len(potential_matches_df.neighbor_namelast_jw_rate_threshold) == 37 + assert ( + round( + potential_matches_df.query( + "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '" + )["neighbor_namelast_jw_rate"].iloc[0], + 2, + ) + == 0.92 + ) + assert potential_matches_df.query( + "id_a == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 ' and id_b == 'd3217545-3453-4d96-86c0-d6a3e60fb2f8 '" + )["neighbor_namelast_jw_rate_threshold"].iloc[0] + assert ( + potential_matches_df.query( + "id_a == '426f2cbe-32e1-45eb-9f86-89a2b9116b7e ' and id_b == 'a499b0dc-7ac0-4d61-b493-91a3036c712e '" + )["neighbor_namelast_jw_rate"].iloc[0] + == 0.75 + ) + assert not potential_matches_df.query( + "id_a == '426f2cbe-32e1-45eb-9f86-89a2b9116b7e ' and id_b == 'a499b0dc-7ac0-4d61-b493-91a3036c712e '" + )["neighbor_namelast_jw_rate_threshold"].iloc[0] + assert ( + potential_matches_df.query( + "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '" + )["neighbor_namelast_jw_rate"].iloc[0] + == 0 + ) + assert not potential_matches_df.query( + "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '" + )["neighbor_namelast_jw_rate_threshold"].iloc[0] + assert ( + potential_matches_df.query( + "id_a == 'a499b0dc-7ac0-4d61-b493-91a3036c712e ' and id_b == '426f2cbe-32e1-45eb-9f86-89a2b9116b7e '" + )["neighbor_namelast_jw_rate"].iloc[0] + == 0.9 + ) + assert potential_matches_df.query( + "id_a == 'a499b0dc-7ac0-4d61-b493-91a3036c712e ' and id_b == '426f2cbe-32e1-45eb-9f86-89a2b9116b7e '" + )["neighbor_namelast_jw_rate_threshold"].iloc[0] + + +def test_step_2_JW_double_array_blocking_conf(spark, matching_conf, matching, capsys): + """ Test matching step 2 to ensure that comparison features are generated (can a regular comparison (as represented by J/W) still run if there's NOT a distance lookup feature)""" + matching_conf["blocking_steps"] = [[{"column_name": "sex"}]] + matching_conf.pop("blocking") + + matching_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + } + ] + + matching.run_step(0) + matching.run_step(1) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches").toPandas() + + # Make assertions on the data + assert len(potential_matches_df.id_a) == 5 + assert len(potential_matches_df.namelast_jw) == 5 + assert ( + potential_matches_df.query("id_a == 20 and id_b == 30")["namelast_jw"].iloc[0] + == 1 + ) + assert ( + potential_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] + > 0.87 + ) + + captured = capsys.readouterr() + assert ( + "DEPRECATION WARNING: The config value 'blocking_steps' has been renamed to 'blocking' and is now just a single array of objects." 
+ in captured.out + ) + + +def test_step_2_comparison_features_comp_c_and_caution( + spark, matching_comparison_conf, matching +): + """ Test a comparison feature with comp_a, comp_b, and comp_c using spouse caution feature as example """ + matching_comparison_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "sp_caution", + "column_names": ["spouse_bpl", "spouse_birthyr", "durmarr"], + "comparison_type": "or", + "comp_a": {"column_name": "spouse_bpl", "comparison_type": "not_equals"}, + "comp_b": { + "column_name": "spouse_birthyr", + "comparison_type": "abs_diff", + "lower_threshold": 5, + }, + "comp_c": { + "column_name": "durmarr", + "comparison_type": "new_marr", + "upper_threshold": 7, + }, + }, + { + "alias": "m_caution", + "column_names": ["mbpl", "mother_birthyr", "stepmom"], + "comparison_type": "or", + "comp_a": {"column_name": "mbpl", "comparison_type": "not_equals"}, + "comp_b": { + "column_name": "mother_birthyr", + "comparison_type": "abs_diff", + "lower_threshold": 5, + }, + "comp_c": { + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + }, + { + "alias": "new_marr", + "column_name": "durmarr", + "comparison_type": "new_marr", + "upper_threshold": 7, + }, + { + "alias": "existing_marr", + "column_name": "durmarr", + "comparison_type": "existing_marr", + "lower_threshold": 8, + }, + { + "alias": "mom_step_change", + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + ] + matching_comparison_conf["streamline_potential_match_generation"] = True + + matching_comparison_conf["training"] = { + "independent_vars": [ + "namelast_jw", + "sp_caution", + "m_caution", + "new_marr", + "existing_marr", + "mom_step_change", + ], + "dependent_var": "match", + "use_training_data_features": False, + # "use_potential_matches_features": False, + # "check_for_null_columns": False, + } + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(matching.link_run.config) + + # Create pandas DFs of the step_2 potential matches table + assert sorted(spark.table("potential_matches").columns) == [ + "id_a", + "id_b", + "namelast_jw", + ] + + potential_matches_prepped = spark.table("potential_matches_prepped").toPandas() + assert not potential_matches_prepped.query("id_a == 10 and id_b == 10")[ + "sp_caution" + ].iloc[0] + assert not potential_matches_prepped.query("id_a == 10 and id_b == 10")[ + "m_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 20 and id_b == 20")[ + "sp_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 20 and id_b == 20")[ + "m_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 30 and id_b == 30")[ + "sp_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 30 and id_b == 30")[ + "m_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 30 and id_b == 40")[ + "sp_caution" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 30 and id_b == 40")[ + "m_caution" + ].iloc[0] + assert not potential_matches_prepped.query("id_a == 10 and id_b == 10")[ + "new_marr" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 10 and id_b == 10")[ + "existing_marr" + ].iloc[0] + assert not potential_matches_prepped.query("id_a == 10 and id_b == 10")[ + "mom_step_change" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 10 and id_b == 40")[ + "new_marr" + ].iloc[0] + assert not 
potential_matches_prepped.query("id_a == 10 and id_b == 40")[ + "existing_marr" + ].iloc[0] + assert potential_matches_prepped.query("id_a == 10 and id_b == 40")[ + "mom_step_change" + ].iloc[0] + + +def test_step_2_comparison_features_comp_d_and_caution( + spark, matching_comparison_conf, matching +): + """ Test a comparison feature with comp_a, comp_b, comp_c, and comp_d using mixed booleans and caution features as example """ + matching.link_run.print_sql = True + matching_comparison_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "new_marr", + "column_name": "durmarr", + "comparison_type": "new_marr", + "upper_threshold": 7, + }, + { + "alias": "existing_marr", + "column_name": "durmarr", + "comparison_type": "existing_marr", + "lower_threshold": 8, + }, + { + "alias": "mom_present_both_years", + "column_name": "momloc", + "comparison_type": "present_both_years", + }, + { + "alias": "spouse_present_both_years", + "column_name": "sploc", + "comparison_type": "present_both_years", + }, + { + "alias": "mom_step_change", + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + { + "alias": "m_caution", + "column_names": ["mbpl", "mother_birthyr", "stepmom", "momloc"], + "comparison_type": "caution_comp_4", + "comp_a": {"column_name": "mbpl", "comparison_type": "not_equals"}, + "comp_b": { + "column_name": "mother_birthyr", + "comparison_type": "abs_diff", + "lower_threshold": 5, + }, + "comp_c": { + "column_name": "stepmom", + "comparison_type": "parent_step_change", + }, + "comp_d": { + "column_name": "momloc", + "comparison_type": "present_both_years", + }, + }, + { + "alias": "sp_caution", + "column_names": ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"], + "comparison_type": "caution_comp_4", + "comp_a": {"column_name": "spouse_bpl", "comparison_type": "not_equals"}, + "comp_b": { + "column_name": "spouse_birthyr", + "comparison_type": "abs_diff", + "lower_threshold": 5, + }, + "comp_c": { + "column_name": "durmarr", + "comparison_type": "new_marr", + "upper_threshold": 7, + }, + "comp_d": {"column_name": "sploc", "comparison_type": "present_both_years"}, + }, + ] + + matching_comparison_conf["training"] = {} + + matching_comparison_conf["training"]["dependent_var"] = "matching" + matching_comparison_conf["training"]["independent_vars"] = [ + "neighbor_namelast_jw_rate", + "neighbor_namelast_jw_rate_threshold", + "namelast_jw", + ] + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(matching_comparison_conf) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches_prepped").toPandas() + + assert potential_matches_df.shape == (12, 34) + assert not potential_matches_df.query("id_a == 10 and id_b == 10")[ + "sp_caution" + ].iloc[0] + assert not potential_matches_df.query("id_a == 10 and id_b == 10")[ + "m_caution" + ].iloc[0] + assert not potential_matches_df.query("id_a == 20 and id_b == 20")[ + "sp_caution" + ].iloc[0] + assert potential_matches_df.query("id_a == 20 and id_b == 20")["m_caution"].iloc[0] + assert potential_matches_df.query("id_a == 10 and id_b == 30")["sp_caution"].iloc[0] + assert not potential_matches_df.query("id_a == 10 and id_b == 30")[ + "m_caution" + ].iloc[0] + assert not potential_matches_df.query("id_a == 30 and id_b == 40")[ + "sp_caution" + ].iloc[0] + assert not potential_matches_df.query("id_a == 30 and id_b == 40")[ + "m_caution" + 
].iloc[0]
+    assert not potential_matches_df.query("id_a == 10 and id_b == 10")["new_marr"].iloc[
+        0
+    ]
+    assert potential_matches_df.query("id_a == 10 and id_b == 10")[
+        "existing_marr"
+    ].iloc[0]
+    assert not potential_matches_df.query("id_a == 10 and id_b == 10")[
+        "mom_step_change"
+    ].iloc[0]
+    assert potential_matches_df.query("id_a == 10 and id_b == 40")["new_marr"].iloc[0]
+    assert not potential_matches_df.query("id_a == 10 and id_b == 40")[
+        "existing_marr"
+    ].iloc[0]
+    assert potential_matches_df.query("id_a == 10 and id_b == 40")[
+        "mom_step_change"
+    ].iloc[0]
+
+
+def test_step_2_neither_are_null(
+    spark, matching_household_conf, matching, preprocessing
+):
+    """Test the neither_are_null comparison feature on attached spouse,
+    mother, and father first names."""
+    matching_household_conf["feature_selections"] = [
+        {
+            "output_col": "spouse_namefrst",
+            "transform": "attach_family_col",
+            "other_col": "namefrst_std",
+            "person_pointer": "sploc",
+            "family_id": "serialp",
+            "person_id": "pernum",
+        },
+        {
+            "output_col": "father_namefrst",
+            "transform": "attach_family_col",
+            "other_col": "namefrst_std",
+            "person_pointer": "poploc",
+            "family_id": "serialp",
+            "person_id": "pernum",
+        },
+        {
+            "output_col": "mother_namefrst",
+            "transform": "attach_family_col",
+            "other_col": "namefrst_std",
+            "person_pointer": "momloc",
+            "family_id": "serialp",
+            "person_id": "pernum",
+        },
+    ]
+
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+        {
+            "alias": "sp_present",
+            "column_name": "spouse_namefrst",
+            "comparison_type": "neither_are_null",
+        },
+        {
+            "alias": "m_present",
+            "column_name": "mother_namefrst",
+            "comparison_type": "neither_are_null",
+        },
+        {
+            "alias": "f_present",
+            "column_name": "father_namefrst",
+            "comparison_type": "neither_are_null",
+        },
+    ]
+    matching_household_conf["training"]["dependent_var"] = "match"
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_household_conf)
+
+    # Create a pandas DF of the prepped potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 37
+    assert len(potential_matches_df.sp_present) == 37
+    assert len(potential_matches_df.mother_namefrst_a) == 37
+    assert pd.isnull(
+        potential_matches_df.query(
+            "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '"
+        )["father_namefrst_a"].iloc[0]
+    )
+    assert pd.isnull(
+        potential_matches_df.query(
+            "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '"
+        )["father_namefrst_b"].iloc[0]
+    )
+    assert (
+        potential_matches_df.query(
+            "id_a == '92277f0b-1476-41f5-9dc8-bf83672616d0' and id_b == '9e807937-de09-414c-bfb2-ac821e112929 '"
+        )["f_present"].iloc[0]
+        == 0
+    )
+    assert pd.notnull(
+        potential_matches_df.query(
+            "id_a == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ' and id_b == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 '"
+        )["mother_namefrst_a"].iloc[0]
+    )
+    assert pd.notnull(
+        potential_matches_df.query(
+            "id_a == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ' and id_b == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 '"
+ )["mother_namefrst_b"].iloc[0] + ) + assert ( + potential_matches_df.query( + "id_a == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ' and id_b == 'bfe1080e-2e67-4a8c-a6e1-ed94ea103712 '" + )["m_present"].iloc[0] + == 1 + ) + + +def test_step_2_create_features_sql_condition( + spark, conf, matching, datasource_sql_condition_input +): + """ Test a comparison feature with comp_a, comp_b, and comp_c using spouse caution feature as example """ + conf["comparison_features"] = [ + { + "alias": "marst_warn", + "column_name": "marst", + "comparison_type": "sql_condition", + "condition": """case + when ((a.marst == 6) AND (b.marst > 0) AND (b.marst < 6)) then 1 + when ((a.marst > 0) and (a.marst < 6) AND (b.marst == 6)) then 1 + when (((a.marst == 4) OR (a.marst == 5)) AND ((b.marst == 1) OR (b.marst == 2))) then 1 + else 0 end""", + }, + { + "alias": "key_marst_warn", + "column_name": "key_marst_warn", + "comparison_type": "fetch_a", + }, + ] + conf["training"] = { + "dependent_var": "match", + "independent_vars": ["marst_warn", "key_marst_warn"], + } + + pa_path, pb_path, pm_path = datasource_sql_condition_input + + matching.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + matching.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + matching.spark.read.csv(pm_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("potential_matches") + + LinkStepScore(matching)._create_features(conf) + + pmp = spark.table("potential_matches_prepped").toPandas() + assert pmp["key_marst_warn"].equals(pmp["marst_warn"]) + + +def test_step_1_transform_calc_nativity( + preprocessing, spark, preprocessing_conf_19thc_nativity_conf, matching +): + """ Test attach_family_col transform on data containing households """ + + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(preprocessing_conf_19thc_nativity_conf) + + pmp = spark.table("potential_matches_prepped").toPandas().sort_values(["id_a"]) + + assert pmp["key_mbpl_match"].equals(pmp["mbpl_match"]) + assert pmp["key_fbpl_match"].equals(pmp["fbpl_match"]) + assert pmp["key_mfbpl_match"].equals(pmp["mfbpl_match"]) + assert pmp["key_m_caution_1870_1880"].equals(pmp["m_caution_1870_1880"]) + assert pmp["key_m_caution_1850_1860"].equals(pmp["m_caution_1850_1860"]) + + +def test_step_1_transform_calc_mfbpl_match( + preprocessing, + spark, + preprocessing_conf_19thc_nativity_conf, + matching, + datasource_calc_mfbpl_pm_data, +): + """ Test attach_family_col transform on data containing households """ + path_a, path_b = datasource_calc_mfbpl_pm_data + + spark.read.csv(path_a, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + spark.read.csv(path_b, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(preprocessing_conf_19thc_nativity_conf) + + pmp = spark.table("potential_matches_prepped").toPandas().sort_values(["id_a"]) + + assert pmp["key_mbpl_match"].equals(pmp["mbpl_match"]) + assert pmp["key_fbpl_match"].equals(pmp["fbpl_match"]) + assert pmp["key_mfbpl_match"].equals(pmp["mfbpl_match"]) + assert pmp["key_m_caution_1870_1880"].equals(pmp["m_caution_1870_1880"]) + assert pmp["key_m_caution_1850_1860"].equals(pmp["m_caution_1850_1860"]) + + +def test_caution_comp_012( 
+ preprocessing, + spark, + preprocessing_conf_19thc_caution_conf, + matching, +): + """ Test multiple clause comparison with 0, 1, and 2 outcome values. """ + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(preprocessing_conf_19thc_caution_conf) + + pmp = spark.table("potential_matches_prepped").toPandas().sort_values(["id_a"]) + + assert pmp["key_mbpl_match"].equals(pmp["mbpl_match"]) + assert pmp["key_fbpl_match"].equals(pmp["fbpl_match"]) + assert pmp["key_mfbpl_match"].equals(pmp["mfbpl_match"]) + assert pmp["key_m_caution_1870_1880"].equals(pmp["m_caution_1870_1880"]) + assert pmp["key_m_caution_1850_1860"].equals(pmp["m_caution_1850_1860"]) + + assert pmp["key_intermediate_mbpl_range_not_equals"].equals( + pmp["intermediate_mbpl_range_not_equals"] + ) + assert pmp["key_intermediate_mbpl_range_not_zero_and_not_equals"].equals( + pmp["intermediate_mbpl_range_not_zero_and_not_equals"] + ) + assert pmp["key_intermediate_mother_birthyr_abs_diff_5"].equals( + pmp["intermediate_mother_birthyr_abs_diff_5"] + ) + assert pmp["key_intermediate_stepmom_parent_step_change"].equals( + pmp["intermediate_stepmom_parent_step_change"] + ) + assert pmp["key_intermediate_momloc_present_both_years"].equals( + pmp["intermediate_momloc_present_both_years"] + ) + assert pmp["key_m_caution_cc3_012"].equals(pmp["m_caution_cc3_012"]) + assert pmp["key_m_caution_cc4_012"].equals(pmp["m_caution_cc4_012"]) diff --git a/hlink/tests/matching_geo_distance_test.py b/hlink/tests/matching_geo_distance_test.py new file mode 100755 index 0000000..f4bc3c5 --- /dev/null +++ b/hlink/tests/matching_geo_distance_test.py @@ -0,0 +1,427 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import math +import pandas as pd +import numpy as np +from hlink.linking.matching.link_step_score import LinkStepScore + + +def test_step_2_geo_distance_1_key_jaro_winkler( + spark, matching_conf, matching, state_dist_path +): + """ Test matching step 2 to ensure that comparison features are generated (both regular (represented by J/W) and as requiring a distance lookup file)""" + + matching_conf["comparison_features"] = [ + { + "alias": "state_distance", + "comparison_type": "geo_distance", + "key_count": 1, + "table_name": "state_distance_lookup", + "distances_file": state_dist_path, + "column_name": "bpl", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + }, + { + "alias": "distance_squared", + "comparison_type": "geo_distance", + "key_count": 1, + "table_name": "state_distance_lookup", + "distances_file": state_dist_path, + "column_name": "bpl", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "power": 2, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + matching_conf["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + # This confusing "comp_b" section is here to include the state_distance + # and distance_squared comparison features in the output of potential matches + "comp_b": { + "comp_a": { + "comp_a": { + "feature_name": "state_distance", + "threshold": 0, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "distance_squared", + "threshold": 0, + 
"comparison_type": "threshold", + }, + "operator": "AND", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + }, + "operator": "OR", + } + + matching.run_step(0) + matching.run_step(1) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches").toPandas() + + # Make assertions on the data + assert len(potential_matches_df.id_a) == 5 + assert len(potential_matches_df.state_distance) == 5 + assert potential_matches_df.query("id_a == 10")["state_distance"].iloc[0] == 1427.1 + assert ( + potential_matches_df.query("id_a == 10")["distance_squared"].iloc[0] == 2036329 + ) + assert ( + potential_matches_df.query("id_a == 20 and id_b == 30")["namelast_jw"].iloc[0] + == 1 + ) + assert pd.isnull( + potential_matches_df.query("id_a == 30 and id_b == 30")["state_distance"].iloc[ + 0 + ] + ) + assert ( + potential_matches_df.query("id_a == 30 and id_b == 50")["namelast_jw"] + .round(2) + .iloc[0] + == 0.85 + ) + + +def test_step_2_geo_distance_ids_only(spark, matching_conf, matching, state_dist_path): + """ Test matching step 2 to ensure that comparison features are generated (both regular (represented by J/W) and as requiring a distance lookup file)""" + + matching_conf["comparison_features"] = [ + { + "alias": "state_distance", + "comparison_type": "geo_distance", + "key_count": 1, + "table_name": "state_distance_lookup", + "distances_file": state_dist_path, + "column_name": "bpl", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + matching_conf["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + # This confusing "comp_b" section is here to include the state_distance + # in the output of potential matches + "comp_b": { + "comp_a": { + "feature_name": "state_distance", + "threshold": 0, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + }, + "operator": "OR", + } + matching_conf["streamline_potential_match_generation"] = True + + matching.run_step(0) + matching.run_step(1) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches").toPandas() + + # Make assertions on the data + assert len(potential_matches_df.id_a) == 5 + assert len(potential_matches_df.id_b) == 5 + assert potential_matches_df.shape == (5, 4) + + +def test_step_2_geo_distance_2_keys( + spark, preprocessing, matching, matching_conf_counties, county_dist_path +): + """ Test county distance code transform """ + matching_conf_counties["column_mappings"] = [ + {"column_name": "county_p", "alias": "county"}, + {"column_name": "statefip_p", "alias": "statefip"}, + {"column_name": "namelast"}, + {"column_name": "sex"}, + ] + matching_conf_counties["blocking"] = [{"column_name": "sex"}] + matching_conf_counties["comparison_features"] = [ + { + "alias": "county_distance", + "comparison_type": "geo_distance", + "key_count": 2, + "table_name": "county_distance_lookup", + "distances_file": county_dist_path, + "column_names": ["county", "state"], + "source_column_a": "county", + "source_column_b": "statefip", + "loc_a_0": "county0", + "loc_a_1": "county1", + "loc_b_0": "state0", + "loc_b_1": "state1", + "distance_col": 
"distance", + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + + matching_conf_counties["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + # This confusing "comp_b" section is here to include the state_distance + # in the output of potential matches + "comp_b": { + "comp_a": { + "feature_name": "county_distance", + "threshold": 0, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + }, + "operator": "OR", + } + + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + matches = spark.table("potential_matches").toPandas() + assert ( + math.floor( + matches.query("id_a == 10 and id_b == 40")["county_distance"].iloc[0] + ) + == 1613771 + ) + assert math.isnan( + matches.query("id_a == 30 and id_b == 40")["county_distance"].iloc[0] + ) + + +def test_step_2_geo_distance_secondary_lookup( + spark, + preprocessing, + matching, + matching_conf_counties, + county_dist_path, + state_dist_path, +): + """ Test county distance code transform """ + matching_conf_counties["column_mappings"] = [ + {"column_name": "county_p", "alias": "county"}, + {"column_name": "statefip_p", "alias": "statefip"}, + {"column_name": "namelast"}, + {"column_name": "sex"}, + ] + matching_conf_counties["blocking"] = [{"column_name": "sex"}] + matching_conf_counties["comparison_features"] = [ + { + "alias": "county_distance", + "comparison_type": "geo_distance", + "key_count": 2, + "table_name": "county_distance_lookup", + "distances_file": county_dist_path, + "column_names": ["county", "state"], + "source_column_a": "county", + "source_column_b": "statefip", + "loc_a_0": "county0", + "loc_a_1": "county1", + "loc_b_0": "state0", + "loc_b_1": "state1", + "distance_col": "distance", + "secondary_key_count": 1, + "secondary_table_name": "state_distance_lookup", + "secondary_distances_file": state_dist_path, + "secondary_source_column": "statefip", + "secondary_loc_a": "statecode1", + "secondary_loc_b": "statecode2", + "secondary_distance_col": "dist", + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + + matching_conf_counties["comparisons"] = { + "comp_a": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + # This confusing "comp_b" section is here to include the state_distance + # in the output of potential matches + "comp_b": { + "comp_a": { + "feature_name": "county_distance", + "threshold": 0, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "operator": "AND", + }, + "operator": "OR", + } + + preprocessing.run_step(0) + preprocessing.run_step(1) + + matching.run_step(0) + matching.run_step(1) + matches = spark.table("potential_matches").toPandas() + assert ( + math.floor( + matches.query("id_a == 10 and id_b == 40")["county_distance"].iloc[0] + ) + == 1613771 + ) + assert ( + math.floor( + matches.query("id_a == 30 and id_b == 40")["county_distance"].iloc[0] + ) + == 785 + ) + assert ( + math.floor( + matches.query("id_a == 30 and id_b == 20")["county_distance"].iloc[0] + ) + == 695 + ) + + +def test_step_2_geo_distance_1_and_2_keys( + spark, + preprocessing, + matching, + matching_conf_counties, + county_dist_path, + 
state_dist_path,
+):
+    """Test geo_distance comparison features with a 1-key (state) and a 2-key
+    (county) distance lookup in the same configuration."""
+    matching_conf_counties["column_mappings"] = [
+        {"column_name": "county_p", "alias": "county"},
+        {"column_name": "statefip_p", "alias": "statefip"},
+        {"column_name": "namelast"},
+        {"column_name": "sex"},
+    ]
+    matching_conf_counties["blocking"] = [{"column_name": "sex"}]
+    matching_conf_counties["comparisons"] = {
+        "comp_a": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "comp_b": {
+            "feature_name": "namelast_jw",
+            "threshold": 0.8,
+            "comparison_type": "threshold",
+        },
+        "operator": "AND",
+    }
+
+    matching_conf_counties["comparison_features"] = [
+        {
+            "alias": "county_distance",
+            "comparison_type": "geo_distance",
+            "key_count": 2,
+            "table_name": "county_distance_lookup",
+            "distances_file": county_dist_path,
+            "column_names": ["county", "state"],
+            "source_column_a": "county",
+            "source_column_b": "statefip",
+            "loc_a_0": "county0",
+            "loc_a_1": "county1",
+            "loc_b_0": "state0",
+            "loc_b_1": "state1",
+            "distance_col": "distance",
+        },
+        {
+            "alias": "state_distance",
+            "comparison_type": "geo_distance",
+            "key_count": 1,
+            "table_name": "state_distance_lookup",
+            "distances_file": state_dist_path,
+            "column_name": "statefip",
+            "loc_a": "statecode1",
+            "loc_b": "statecode2",
+            "distance_col": "dist",
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast",
+            "comparison_type": "jaro_winkler",
+        },
+    ]
+    matching_conf_counties["training"] = {}
+    matching_conf_counties["training"]["dependent_var"] = "matching"
+    matching_conf_counties["training"]["independent_vars"] = []
+
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_conf_counties)
+
+    matches = spark.table("potential_matches_prepped").toPandas()
+    assert (
+        math.floor(
+            matches.query("id_a == 10 and id_b == 40")["county_distance"].iloc[0]
+        )
+        == 1613771
+    )
+
+    assert (
+        matches.query("id_a == 10 and id_b == 40")["state_distance"].iloc[0] == 917.5
+    )
+
+    assert np.isnan(
+        matches.query("id_a == 30 and id_b == 40")["county_distance"].iloc[0]
+    )
diff --git a/hlink/tests/matching_potential_matches_test.py b/hlink/tests/matching_potential_matches_test.py
new file mode 100755
index 0000000..5157b7f
--- /dev/null
+++ b/hlink/tests/matching_potential_matches_test.py
@@ -0,0 +1,89 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import pytest
+
+
+@pytest.mark.skip(
+    reason="We still want to test that these aggregate features are being created correctly, but we need to refactor this test to account for the fact that aggregate features are now being created in a different step (step 4 doesn't exist anymore and the functionality was moved in the code)."
+)
+def test_step_4_aggregate_features(
+    spark, matching_conf, matching, potential_matches_agg_path
+):
+    """Test adding aggregate features (hits, hits2, exact_all_mult, etc.)
to potential matches """ + matching_conf["id_column"] = "histid" + matching_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + {"alias": "exact"}, + {"alias": "exact_all"}, + ] + matching_conf["training"] = { + "independent_vars": [ + "namelast_jw", + "exact", + "exact_all", + "hits", + "hits2", + "exact_mult", + "exact_all_mult", + "exact_all_mult2", + ] + } + + potential_matches = matching.spark.read.csv( + potential_matches_agg_path, header=True, inferSchema=True + ) + potential_matches.write.mode("overwrite").saveAsTable("potential_matches") + matching.step_4_aggregate_features() + + pm_df = matching.spark.table("potential_matches").toPandas() + + assert pm_df.shape == (30, 21) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["exact"].iloc[0] + == 1 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["exact_all"].iloc[0] + == 1 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["hits"].iloc[0] + == 3 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["hits2"].iloc[0] + == 9 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["exact_mult"].iloc[0] + == 3 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["exact_all_mult"].iloc[0] + == 3 + ) + assert ( + pm_df.query( + "namelast_clean_a == 'cridlebaugh' and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'" + )["exact_all_mult2"].iloc[0] + == 9 + ) diff --git a/hlink/tests/matching_potential_matches_universe_test.py b/hlink/tests/matching_potential_matches_universe_test.py new file mode 100755 index 0000000..002ab1e --- /dev/null +++ b/hlink/tests/matching_potential_matches_universe_test.py @@ -0,0 +1,70 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + + +def test_men_only(spark, blocking_explode_conf, matching_test_input, matching, main): + """Exclude women from potential matches.""" + table_a, table_b = matching_test_input + table_a.createOrReplaceTempView("prepped_df_a") + table_b.createOrReplaceTempView("prepped_df_b") + + # Get the number of men and women; the numbers in the + # potential matching universe should be <= men - women + prepped_a = spark.table("prepped_df_a").toPandas() + prepped_b = spark.table("prepped_df_b").toPandas() + + men_a = len(prepped_a.query("sex == 1")) + men_b = len(prepped_b.query("sex == 1")) + women_a = len(prepped_a.query("sex == 2")) + women_b = len(prepped_b.query("sex == 2")) + + unknown_a = len(prepped_a) - (men_a + women_a) + unknown_b = len(prepped_b) - (men_b + women_b) + + # There can be unknown SEX values + assert (unknown_a + men_a + women_a) == len(prepped_a) + assert (unknown_b + men_b + women_b) == len(prepped_b) + + # For the test setup to be valid there must be women to + # start with in the fixtures + assert women_b > 0 + assert women_a > 0 + + # Limit the universe to just men. 
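+    # The "potential_matches_universe" entries each carry an "expression"; the
+    # expectation exercised by this test is that each expression is applied as
+    # a SQL WHERE clause to both prepped datasets before blocking/exploding.
+    # A minimal sketch of that assumed filtering (illustrative only, not
+    # hlink's actual implementation):
+    #
+    #   universe_a = spark.sql("SELECT * FROM prepped_df_a WHERE sex == 1")
+    #   universe_b = spark.sql("SELECT * FROM prepped_df_b WHERE sex == 1")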
+ blocking_explode_conf["potential_matches_universe"] = [{"expression": "sex == 1"}] + + matching.run_step(0) + + # The explode step will take the spark versions of these as inputs + univ_a = spark.table("match_universe_df_a").toPandas() + univ_b = spark.table("match_universe_df_b").toPandas() + + # The exploded step will produce these tables + exploded_univ_a = spark.table("exploded_df_a").toPandas() + exploded_univ_b = spark.table("exploded_df_b").toPandas() + + assert all( + elem in list(exploded_univ_a.columns) + for elem in ["namefrst", "namelast", "sex", "birthyr_3"] + ) + assert all( + elem in list(exploded_univ_b.columns) + for elem in ["namefrst", "namelast", "sex", "birthyr_3"] + ) + + # Check there are no women in either universe. + assert len(univ_a.query("sex == 2")) == 0 + assert len(univ_b.query("sex == 2")) == 0 + + # Shows there aren't any unknown values of SEX + assert men_a == len(univ_a) + assert men_b == len(univ_b) + + assert men_a == len(univ_a.query("sex == 1")) + assert men_b == len(univ_b.query("sex == 1")) + + # Check that there are no women in the exploded rows + assert len(exploded_univ_a.query("sex == 2")) == 0 + assert len(exploded_univ_b.query("sex == 2")) == 0 diff --git a/hlink/tests/matching_scoring_test.py b/hlink/tests/matching_scoring_test.py new file mode 100755 index 0000000..3509b11 --- /dev/null +++ b/hlink/tests/matching_scoring_test.py @@ -0,0 +1,172 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.tests +import pandas as pd +import pytest +import hlink.linking.core.threshold as threshold_core +from hlink.linking.matching.link_step_score import LinkStepScore + + +@pytest.mark.skip( + reason="We still want to test that whatever 'secondary_threshold' became is being applied correctly, but we need to refactor this test to account for the fact that this was totally renamed and is now being carried out in a different step (step 3 doesn't exist anymore)." 
+) +def test_step_3_uniq_and_secondary_threshold(spark, matching_conf, matching): + """ Test a secondary threshold with uniqueness """ + matching_conf["comparison_features"] = [ + { + "alias": "namefrst_jw", + "column_name": "namefrst", + "comparison_type": "jaro_winkler", + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + + matching_conf["comparisons"] = { + "comp_a": { + "feature_name": "namefrst_jw", + "threshold": 0.8, + "comparison_type": "threshold", + }, + "comp_b": { + "feature_name": "namelast_jw", + "comparison_type": "threshold", + "threshold": 0.8, + }, + "operator": "AND", + } + + matching_conf["secondary_threshold"] = { + "threshold_a": { + "feature_name": "namefrst_jw", + "comparison_type": "threshold", + "threshold": 0.9, + }, + "threshold_b": { + "feature_name": "namelast_jw", + "comparison_type": "threshold", + "threshold": 0.9, + }, + "unique_true": {"id_a": "id_a", "id_b": "id_b"}, + "operator": "AND", + "secondary": True, + } + + matching.step_0_explode() + matching.step_1_match() + hlink.linking.matching._step_2_score.__create_features(matching, matching_conf) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches_prepped").toPandas() + + # matching.step_3_secondary_threshold() + # unique_matches_df = spark.table("potential_matches").toPandas() + unique_high_matches_df = spark.table("potential_matches_prepped").toPandas() + + assert len(potential_matches_df.id_a) == 5 + # assert (len(unique_matches_df.id_a) == 1) + # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] > 0.8) + # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] < 0.9) + # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.8) + # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.9) + assert unique_high_matches_df.empty + + +# TODO: is there a step 3 anymore? +def test_step_3_skip_on_no_conf(spark, matching_conf, matching, capsys): + """ Test matching step 3 doesn't run if no training config""" + + matching_conf["comparison_features"] = [ + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + } + ] + + matching.run_step(0) + matching.run_step(1) + matching.run_step(2) + + captured = capsys.readouterr() + + assert ( + "WARNING: Skipping step 'score'. Your config file either does not contain a 'training' section or a 'chosen_model' section within the 'training' section." + in captured.out + ) + + +# TODO: is there a step 3 any more? 
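+# The test below exercises the "drop_duplicate_with_threshold_ratio" decision:
+# a pair is predicted as a match only if its probability clears alpha_threshold
+# and sufficiently beats the runner-up candidate for the same record. A minimal
+# sketch of that decision rule (an assumption for illustration; the real logic
+# lives in hlink.linking.core.threshold.predict_using_thresholds):
+#
+#     def passes(prob, second_best_prob, alpha_threshold, threshold_ratio):
+#         if prob < alpha_threshold:
+#             return False
+#         if second_best_prob is None:  # no competing candidate
+#             return True
+#         return prob / second_best_prob >= threshold_ratio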
+def test_step_3_alpha_beta_thresholds( + spark, matching, matching_conf, threshold_ratio_data_path_2 +): + """ Test matching step 3 with both probability and ratio thresholds """ + + matching.spark.read.csv( + threshold_ratio_data_path_2, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("score_tmp") + score_tmp = matching.spark.table("score_tmp") + + matching_conf["id_column"] = "histid" + matching_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio" + matching_conf["drop_data_from_scored_matches"] = True + threshold_ratio = 1.0 + alpha_threshold = 0.5 + + predictions = threshold_core.predict_using_thresholds( + score_tmp, + alpha_threshold, + threshold_ratio, + matching_conf["training"], + matching_conf["id_column"], + ) + predictions.write.mode("overwrite").saveAsTable("predictions") + + link_step_score = LinkStepScore(matching) + link_step_score._save_table_with_requested_columns( + "pm", "pmp", predictions, "histid_a", "histid_b" + ) + link_step_score._save_predicted_matches(matching_conf, "histid_a", "histid_b") + + tp = predictions.toPandas() + pm = matching.spark.table("predicted_matches").toPandas() + + assert sorted(tp.columns) == [ + "histid_a", + "histid_b", + "prediction", + "probability", + "ratio", + "second_best_prob", + ] + assert sorted(pm.columns) == ["histid_a", "histid_b", "prediction", "probability"] + assert tp["prediction"].sum() == 5 + assert pm["prediction"].sum() == 3 + assert pd.isnull(tp.query("histid_a == '0a' and histid_b == '1b'")["ratio"].iloc[0]) + assert pd.notnull( + tp.query("histid_a == '0a' and histid_b == '1b'")["second_best_prob"].iloc[0] + ) + assert tp.query("histid_a == '0a' and histid_b == '1b'")["prediction"].iloc[0] == 0 + assert pd.notnull( + tp.query("histid_a == '0a' and histid_b == '0b'")["second_best_prob"].iloc[0] + ) + assert tp.query("histid_a == '0a' and histid_b == '0b'")["prediction"].iloc[0] == 0 + assert tp.query("histid_a == '1a' and histid_b == '3b'")["prediction"].iloc[0] == 1 + assert tp.query("histid_a == '2a' and histid_b == '4b'")["prediction"].iloc[0] == 1 + assert tp.query("histid_a == '3a' and histid_b == '4b'")["prediction"].iloc[0] == 1 + + assert tp.query("histid_a == '6a' and histid_b == '9b'")["prediction"].iloc[0] == 0 + assert pd.isnull(tp.query("histid_a == '6a' and histid_b == '9b'")["ratio"].iloc[0]) + + assert "4b" not in list(pm["histid_b"]) + assert "10b" not in list(pm["histid_b"]) + assert "0a" not in list(pm["histid_a"]) + + assert tp.query("histid_a == '5a' and histid_b == '7b'")["prediction"].iloc[0] == 1 + assert tp.query("histid_a == '5a' and histid_b == '6b'")["prediction"].iloc[0] == 0 diff --git a/hlink/tests/matching_test.py b/hlink/tests/matching_test.py new file mode 100755 index 0000000..8f42c43 --- /dev/null +++ b/hlink/tests/matching_test.py @@ -0,0 +1,219 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import pandas as pd
+from hlink.linking.matching.link_step_score import LinkStepScore
+
+
+def test_removal_of_duplicate_histid_b(
+    spark, matching, matching_conf, scored_matches_test_data
+):
+    """Test that only one match per histid_b survives when predicted matches
+    are saved from scored potential matches."""
+    path_pms = scored_matches_test_data
+    matching.spark.read.csv(path_pms, header=True, inferSchema=True).write.mode(
+        "overwrite"
+    ).saveAsTable("scored_potential_matches")
+
+    LinkStepScore(matching)._save_predicted_matches(
+        matching_conf, "histid_a", "histid_b"
+    )
+
+    pred_matches = matching.spark.table("predicted_matches").toPandas()
+
+    assert pred_matches.shape == (1, 3)
+    assert (
+        pred_matches.query("histid_a == 'A004' and histid_b == 'B003'")[
+            "prediction"
+        ].iloc[0]
+        == 1
+    )
+
+
+def test_step_2_any_equals(spark, matching_household_conf, matching, preprocessing):
+    """Test the any_equals comparison feature on split first-name columns."""
+
+    matching_household_conf["column_mappings"].append(
+        {
+            "column_name": "namefrst_std",
+            "alias": "namefrst_split",
+            "transforms": [{"type": "split"}],
+        }
+    )
+
+    matching_household_conf["column_mappings"].append(
+        {
+            "column_name": "namefrst_split",
+            "alias": "namefrst_mid_init",
+            "transforms": [
+                {"type": "array_index", "value": 1},
+                {"type": "substring", "values": [0, 1]},
+            ],
+        }
+    )
+
+    matching_household_conf["column_mappings"].append(
+        {
+            "column_name": "namefrst_split",
+            "alias": "namefrst_unstd",
+            "transforms": [{"type": "array_index", "value": 0}],
+        }
+    )
+
+    matching_household_conf["comparison_features"] = [
+        {
+            "alias": "mid_init_match",
+            "column_names": ["namefrst_mid_init", "namefrst_unstd"],
+            "comparison_type": "any_equals",
+        },
+        {
+            "alias": "namelast_jw",
+            "column_name": "namelast_clean",
+            "comparison_type": "jaro_winkler",
+        },
+    ]
+
+    matching_household_conf["training"]["dependent_var"] = "match"
+    matching_household_conf["training"]["independent_vars"] = [
+        "neighbor_namelast_jw_rate",
+        "neighbor_namelast_jw_rate_threshold",
+        "namelast_jw",
+    ]
+    preprocessing.run_step(0)
+    preprocessing.run_step(1)
+    matching.run_step(0)
+    matching.run_step(1)
+
+    LinkStepScore(matching)._create_features(matching_household_conf)
+
+    # Create a pandas DF of the prepped potential matches table
+    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
+
+    # Make assertions on the data
+    assert len(potential_matches_df.id_a) == 37
+    assert len(potential_matches_df.mid_init_match) == 37
+    assert len(potential_matches_df.namefrst_mid_init_a) == 37
+    assert (
+        potential_matches_df.query(
+            "id_a == '50b33ef6-259d-43af-8cdc-56a61f881169 ' and id_b == '50b33ef6-259d-43af-8cdc-56a61f881169 '"
+        )["mid_init_match"].iloc[0]
+        == 1
+    )
+    assert pd.isna(
+        potential_matches_df.query(
+            "id_a == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ' and id_b == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 '"
+        )["mid_init_match"].iloc[0]
+    )
+    assert (
+        potential_matches_df.query(
+            "id_a == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ' and id_b == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 '"
+        )["namefrst_mid_init_a"].iloc[0]
+        is None
+    )
+    assert (
+        potential_matches_df.query(
+            "id_a == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ' and id_b == '7fb55d25-2a7d-486d-9efa-27b9d7e60c24 '"
+        )["namefrst_unstd_a"].iloc[0]
+        == "phineas"
+    )
+
+
+def test_step_2_sum(spark, matching_household_conf, matching, preprocessing):
+
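+    # Assumed semantics for this test: the "popularity" feature selection
+    # counts records sharing the same combination of its input_cols (here sex,
+    # bpl, and namelast_clean), and the "sum" comparison type adds the a-side
+    # and b-side values for each candidate pair; the asserts below check that
+    # 3 (side a) + 2 (side b) == 5.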
matching_household_conf["feature_selections"] = [ + { + "output_col": "namelast_popularity", + "input_cols": ["sex", "bpl", "namelast_clean"], + "transform": "popularity", + } + ] + + matching_household_conf["comparison_features"] = [ + { + "alias": "namelast_popularity_sum", + "column_name": "namelast_popularity", + "comparison_type": "sum", + }, + { + "alias": "namelast_jw", + "column_name": "namelast_clean", + "comparison_type": "jaro_winkler", + }, + ] + + matching_household_conf["training"]["dependent_var"] = "match" + matching_household_conf["training"]["independent_vars"] = [ + "neighbor_namelast_jw_rate", + "neighbor_namelast_jw_rate_threshold", + "namelast_jw", + ] + preprocessing.run_step(0) + preprocessing.run_step(1) + matching.run_step(0) + matching.run_step(1) + + LinkStepScore(matching)._create_features(matching_household_conf) + + # Create pandas DFs of the step_2 potential matches table + potential_matches_df = spark.table("potential_matches_prepped").toPandas() + assert len(potential_matches_df.namelast_popularity_a) == 37 + assert len(potential_matches_df.namelast_popularity_b) == 37 + assert len(potential_matches_df.namelast_popularity_sum) == 37 + assert ( + potential_matches_df.query( + "id_a == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 ' and id_b == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 '" + )["namelast_popularity_a"].iloc[0] + == 3 + ) + assert ( + potential_matches_df.query( + "id_a == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 ' and id_b == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 '" + )["namelast_popularity_b"].iloc[0] + == 2 + ) + assert ( + potential_matches_df.query( + "id_a == 'ae7261c3-7d71-4ea1-997f-5d1a68c18777 ' and id_b == 'ad6442b5-42bc-4c2e-a517-5a951d989a92 '" + )["namelast_popularity_sum"].iloc[0] + == 5 + ) + + +# +## TODO: fix hh compare rate java function +## def test_step_2_hh_compare_rate(spark, matching_household_conf, matching, preprocessing): +## matching_household_conf['feature_selections'] = [ +## { +## "output_col": "namefrst_related_rows", +## "input_cols": ["namefrst_std", "bpl", "sex"], +## "transform": "related_individual_rows", +## "family_id": "serialp", +## "relate_col": "relate", +## "top_code": 10, +## "bottom_code": 3 +## } +## ] +# +## matching_household_conf['comparison_features'] = [ +## { +## "alias": "namelast_jw", +## "column_name": "namelast_clean", +## "comparison_type": "jaro_winkler" +## }, +## { +## "alias": "related_match_rate", +## "column_name": "namefrst_related_rows", +## "comparison_type": "hh_compare_rate" +## } +## ] +# +## preprocessing.step_0_register_raw_dfs() +## preprocessing.step_1_prep_dataframe() +## matching.step_0_explode() +## matching.step_1_match() +# +## Create pandas DFs of the step_2 potential matches table +## potential_matches_df = spark.table("potential_matches").toPandas() +## assert (len(potential_matches_df.namelast_popularity_a) == 41) +# +# diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py new file mode 100644 index 0000000..3ea09c4 --- /dev/null +++ b/hlink/tests/model_exploration_test.py @@ -0,0 +1,714 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pytest +import pandas as pd + +import hlink.linking.core.threshold as threshold_core +import hlink.tests +from hlink.linking.model_exploration.link_step_train_test_models import ( + LinkStepTrainTestModels, +) + + +def test_all( + spark, + main, + training_conf, + model_exploration, + state_dist_path, + training_data_doubled_path, +): + """ Test training step 2 with probit model""" + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = training_data_doubled_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["use_training_data_features"] = False + training_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio" + training_conf["training"]["n_training_iterations"] = 4 + training_conf["training"]["seed"] = 120 + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["model_parameters"] = [ + {"type": "probit", "threshold": 0.8, "threshold_ratio": [1.01, 1.3]}, + { + "type": "random_forest", + "maxDepth": 5.0, + "numTrees": 75.0, + "threshold_ratio": 1.2, + "threshold": 0.2, + }, + ] + training_conf["training"]["get_precision_recall_curve"] = True + + model_exploration.run_step(0) + model_exploration.run_step(1) + model_exploration.run_step(2) + + prc = spark.table("model_eval_precision_recall_curve_probit__").toPandas() + assert all( + elem in list(prc.columns) + for elem in ["params", "precision", "recall", "threshold_gt_eq"] + ) + prc_rf = spark.table( + "model_eval_precision_recall_curve_random_forest__maxdepth___5_0___numtrees___75_0_" + ).toPandas() + assert all( + elem in list(prc_rf.columns) + for elem in ["params", "precision", "recall", "threshold_gt_eq"] + ) + + tr = spark.table("model_eval_training_results").toPandas() + + assert tr.__len__() == 3 + assert tr.query("threshold_ratio == 1.01")["precision_test_mean"].iloc[0] >= 0.5 + assert tr.query("threshold_ratio == 1.3")["alpha_threshold"].iloc[0] == 0.8 + assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 5 + assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 0.8 + assert ( + tr.query("threshold_ratio == 1.01")["pr_auc_mean"].iloc[0] + == tr.query("threshold_ratio == 1.3")["pr_auc_mean"].iloc[0] + ) + + preds = spark.table("model_eval_predictions").toPandas() + assert ( + preds.query("id_a == 20 and id_b == 30")["second_best_prob"].round(2).iloc[0] + < 0.1 + ) + assert ( + preds.query("id_a == 20 and id_b == 30")["probability"].round(2).iloc[0] > 0.5 + ) + assert preds.query("id_a == 30 and id_b == 30")["prediction"].iloc[0] == 0 + assert pd.isnull( + preds.query("id_a == 20 and id_b == 50")["second_best_prob"].iloc[0] + ) + + pred_train = spark.table("model_eval_predict_train").toPandas() + assert pred_train.query("id_a == 20 and id_b == 30")["match"].iloc[0] == 1 + assert 
pd.isnull( + pred_train.query("id_a == 20 and id_b == 50")["second_best_prob"].iloc[0] + ) + assert pred_train.query("id_a == 20 and id_b == 30")["prediction"].iloc[0] == 1 + + main.do_drop_all("") + + +def test_step_2_param_grid(spark, main, training_conf, model_exploration, fake_self): + """ Test matching step 2 training to see if the custom param grid builder is working """ + + training_conf["training"]["model_parameters"] = [ + {"type": "random_forest", "maxDepth": [3, 4, 5], "numTrees": [50, 100]}, + {"type": "probit", "threshold": [0.5, 0.7]}, + ] + + link_step = LinkStepTrainTestModels(model_exploration) + param_grid = link_step._custom_param_grid_builder(training_conf) + + expected = [ + {"maxDepth": 3, "numTrees": 50, "type": "random_forest"}, + {"maxDepth": 3, "numTrees": 100, "type": "random_forest"}, + {"maxDepth": 4, "numTrees": 50, "type": "random_forest"}, + {"maxDepth": 4, "numTrees": 100, "type": "random_forest"}, + {"maxDepth": 5, "numTrees": 50, "type": "random_forest"}, + {"maxDepth": 5, "numTrees": 100, "type": "random_forest"}, + {"type": "probit", "threshold": [0.5, 0.7]}, + ] + + assert len(param_grid) == len(expected) + assert all([m in expected for m in param_grid]) + + main.do_drop_all("") + + +# ------------------------------------- +# Tests that probably should be moved +# ------------------------------------- + + +@pytest.fixture(scope="function") +def feature_conf(training_conf): + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + + training_conf["training"]["independent_vars"] = ["namelast_jw", "regionf"] + + training_conf["training"]["model_parameters"] = [] + training_conf["training"]["n_training_iterations"] = 2 + return training_conf + + +def test_step_2_probability_ratio_threshold( + spark, main, feature_conf, model_exploration, threshold_ratio_data_path +): + """ Test probability threshold ratio decision boundary to remove too close multi-matches """ + feature_conf["id_column"] = "histid" + feature_conf["training"]["dataset"] = threshold_ratio_data_path + feature_conf["training"]["decision"] = "drop_duplicate_with_threshold_ratio" + threshold_ratio = 1.2 + alpha_threshold = 0.5 + + model_exploration.run_step(0) + predictions = spark.table("model_eval_training_data") + threshold_predictions = threshold_core._apply_threshold_ratio( + predictions.drop("prediction"), + alpha_threshold, + threshold_ratio, + feature_conf["id_column"], + ) + tp = threshold_predictions.toPandas() + assert sorted(tp.columns) == [ + "histid_a", + "histid_b", + "prediction", + "probability", + "ratio", + "second_best_prob", + ] + assert tp["prediction"].sum() == 4 + assert pd.isnull( + tp.query("histid_a == 6 and histid_b == 6")["second_best_prob"].iloc[0] + ) + assert tp.query("histid_a == 6 and histid_b == 6")["prediction"].iloc[0] == 1 + assert pd.isnull( + tp.query("histid_a == 7 and histid_b == 7")["second_best_prob"].iloc[0] + ) + assert tp.query("histid_a == 7 and histid_b == 7")["prediction"].iloc[0] == 0 + + assert tp.query("histid_a == 2 and histid_b == 2")["prediction"].iloc[0] == 1 + assert tp.query("histid_a == 1 and histid_b == 1")["prediction"].iloc[0] == 1 + assert tp.query("histid_a == 1 and histid_b == 0")["prediction"].iloc[0] == 0 + assert tp.query("histid_a == 0 and histid_b == 0")["prediction"].iloc[0] == 0 + assert tp.query("histid_a == 0 and 
histid_b == 0")["ratio"].iloc[0] > 1 + assert pd.isnull(tp.query("histid_a == 0 and histid_b == 1")["ratio"].iloc[0]) + + +def test_step_1_OneHotEncoding( + spark, feature_conf, model_exploration, state_dist_path, training_data_path +): + """ Test matching step 2 training to see if the OneHotEncoding is working """ + + model_exploration.run_step(0) + model_exploration.run_step(1) + + training_v = spark.table("model_eval_training_vectorized").toPandas() + columns_expected = [ + "match", + "id_a", + "id_b", + "namelast_jw", + "regionf", + "namelast_jw_imp", + "regionf_onehotencoded", + "features_vector", + ] + assert training_v.shape[0] == 9 + assert all([c in training_v.columns for c in columns_expected]) + assert len(training_v["features_vector"][0]) == 5 + + +def test_step_2_scale_values( + spark, feature_conf, model_exploration, state_dist_path, training_data_path +): + feature_conf["training"]["scale_data"] = True + + model_exploration.run_step(0) + model_exploration.run_step(1) + + training_v = spark.table("model_eval_training_vectorized").toPandas() + + assert training_v.shape == (9, 9) + assert len(training_v["features_vector"][0]) == 5 + assert training_v["features_vector"][0][0].round(2) == 2.85 + + +def test_step_2_train_random_forest_spark( + spark, main, feature_conf, model_exploration, state_dist_path +): + """ Test training step 2 with random forest model""" + feature_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": 3, + "numTrees": 3, + "featureSubsetStrategy": "sqrt", + } + ] + feature_conf["training"]["output_suspicious_TD"] = True + feature_conf["training"]["n_training_iterations"] = 10 + + model_exploration.run_step(0) + model_exploration.run_step(1) + model_exploration.run_step(2) + + tr = spark.table("model_eval_training_results").toPandas() + # assert tr.shape == (1, 18) + assert tr.query("model == 'random_forest'")["precision_test_mean"].iloc[0] > 0.5 + assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3 + + FNs = spark.table("model_eval_repeat_FNs").toPandas() + assert FNs.shape == (3, 4) + assert FNs.query("id_a == 30")["count"].iloc[0] > 5 + + main.do_drop_all("") + + +def test_step_2_train_logistic_regression_spark( + spark, main, feature_conf, model_exploration, state_dist_path, training_data_path +): + """ Test training step 2 with logistic regression model""" + feature_conf["training"]["model_parameters"] = [ + {"type": "logistic_regression", "threshold": 0.7} + ] + + model_exploration.run_step(0) + model_exploration.run_step(1) + model_exploration.run_step(2) + + tr = spark.table("model_eval_training_results").toPandas() + + # assert tr.shape == (1, 16) + assert tr.query("model == 'logistic_regression'")["precision_test_mean"].iloc[0] > 0 + assert ( + round(tr.query("model == 'logistic_regression'")["alpha_threshold"].iloc[0], 1) + == 0.7 + ) + main.do_drop_all("") + + +def test_step_2_train_decision_tree_spark( + spark, main, feature_conf, model_exploration, state_dist_path, training_data_path +): + """ Test training step 2 with decision tree model""" + feature_conf["training"]["model_parameters"] = [ + {"type": "decision_tree", "maxDepth": 3, "minInstancesPerNode": 1, "maxBins": 7} + ] + + model_exploration.run_step(0) + model_exploration.run_step(1) + model_exploration.run_step(2) + + tr = spark.table("model_eval_training_results").toPandas() + + # assert tr.shape == (1, 18) + assert tr.query("model == 'decision_tree'")["precision_test_mean"].iloc[0] > 0 + assert tr.query("model == 
'decision_tree'")["maxDepth"].iloc[0] == 3 + assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1 + assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7 + + main.do_drop_all("") + + +def test_step_2_train_gradient_boosted_trees_spark( + spark, main, feature_conf, model_exploration, state_dist_path, training_data_path +): + """ Test training step 2 with gradient boosted tree model""" + feature_conf["training"]["model_parameters"] = [ + { + "type": "gradient_boosted_trees", + "maxDepth": 5, + "minInstancesPerNode": 1, + "maxBins": 5, + } + ] + + model_exploration.run_step(0) + model_exploration.run_step(1) + model_exploration.run_step(2) + + tr = spark.table("model_eval_training_results").toPandas() + preds = spark.table("model_eval_predictions").toPandas() + + assert "probability_array" in list(preds.columns) + + # assert tr.shape == (1, 18) + assert ( + tr.query("model == 'gradient_boosted_trees'")["precision_test_mean"].iloc[0] > 0 + ) + assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5 + assert ( + tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0] + == 1 + ) + assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5 + + main.do_drop_all("") + + +def test_step_2_interact_categorial_vars( + spark, training_conf, model_exploration, state_dist_path, training_data_path +): + """ Test matching step 2 training to see if the OneHotEncoding is working """ + + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "td_match", + "column_name": "match", + "comparison_type": "fetch_td", + "categorical": True, + }, + ] + + training_conf["pipeline_features"] = [ + { + "input_columns": ["regionf", "td_match"], + "output_column": "regionf_interacted_tdmatch", + "transformer_type": "interaction", + } + ] + + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "regionf_interacted_tdmatch", + ] + + model_exploration.run_step(0) + model_exploration.run_step(1) + + prepped_data = spark.table("model_eval_training_vectorized").toPandas() + + assert prepped_data.shape == (9, 11) + assert list( + prepped_data.query("id_a == 10 and id_b == 10")["regionf_onehotencoded"].iloc[0] + ) == [0, 1, 0, 0] + assert list( + prepped_data.query("id_a == 20 and id_b == 50")["regionf_onehotencoded"].iloc[0] + ) == [0, 0, 1, 0] + assert list( + prepped_data.query("id_a == 10 and id_b == 10")["td_match_onehotencoded"].iloc[ + 0 + ] + ) == [0, 1, 0] + assert list( + prepped_data.query("id_a == 20 and id_b == 50")["td_match_onehotencoded"].iloc[ + 0 + ] + ) == [1, 0, 0] + assert list( + prepped_data.query("id_a == 10 and id_b == 50")[ + "regionf_interacted_tdmatch" + ].iloc[0] + ) == [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0] + assert list( + prepped_data.query("id_a == 10 and id_b == 10")[ + "regionf_interacted_tdmatch" + ].iloc[0] + ) == [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] + assert list( + prepped_data.query("id_a == 30 and id_b == 50")[ + "regionf_interacted_tdmatch" + ].iloc[0] + ) == [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] + assert ( + len( + list( + prepped_data.query("id_a == 10 and id_b == 10")["features_vector"].iloc[ + 0 + ] + ) + ) + == 17 + ) + + +def test_step_2_VectorAssembly( + spark, main, training_conf, 
model_exploration, state_dist_path, training_data_path +): + """ Test model exploration step 1 to see if the vector assembly of features is working """ + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + + training_conf["training"]["model_parameters"] = [] + + model_exploration.run_step(0) + model_exploration.run_step(1) + + vdf = spark.table("model_eval_training_vectorized").toPandas() + + assert len(vdf.query("id_a == 20 and id_b == 30")["features_vector"].iloc[0]) == 6 + assert 3187 in ( + vdf.query("id_a == 20 and id_b == 30")["features_vector"].iloc[0].values.round() + ) + assert sorted( + vdf.query("id_a == 10 and id_b == 50")["features_vector"].iloc[0].values.round() + ) == [1, 1909] + main.do_drop_all("") + + +def test_step_2_split_by_id_a( + spark, + main, + training_conf, + model_exploration, + state_dist_path, + training_data_path, + fake_self, +): + """ Tests train-test-split which keeps all potential_matches of an id_a together in the same split """ + + training_conf["training"]["n_training_iterations"] = 4 + training_conf["training"]["split_by_id_a"] = True + + prepped_data = spark.read.csv(training_data_path, header=True) + id_a = training_conf["id_column"] + "_a" + n_training_iterations = training_conf["training"].get("n_training_iterations", 10) + seed = training_conf["training"].get("seed", 2133) + + link_step = LinkStepTrainTestModels(model_exploration) + splits = link_step._get_splits(prepped_data, id_a, n_training_iterations, seed) + + assert len(splits) == 4 + + assert splits[0][0].toPandas()["id_a"].unique().tolist() == ["10", "20"] + assert splits[0][1].toPandas()["id_a"].unique().tolist() == ["30"] + + assert splits[1][0].toPandas()["id_a"].unique().tolist() == ["10"] + assert splits[1][1].toPandas()["id_a"].unique().tolist() == ["20", "30"] + + main.do_drop_all("") + + +@pytest.mark.skip( + reason="Need to get tests working for new version of feature importances" +) +def test_step_3_get_feature_importances_random_forest( + spark, + training_conf, + training, + state_dist_path, + datasource_training_input, + potential_matches_path, + spark_test_tmp_dir_path, + model_exploration, +): + """ Test running the chosen model on potential matches dataset """ + td_path, pa_path, pb_path = datasource_training_input + + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = td_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + 
"namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = { + "type": "random_forest", + "maxDepth": 6, + "numTrees": 100, + "featureSubsetStrategy": "sqrt", + } + + # training_conf["training"]["use_potential_matches_features"] = True + training_conf["training"]["score_with_model"] = True + training_conf["training"]["feature_importances"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + training.spark.read.csv( + potential_matches_path, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + + model_exploration.run_step(3) + + fi_df = training.spark.table("feature_importances").toPandas() + + assert fi_df.shape == (6, 3) + assert 1 > fi_df.query("idx == 0")["score"].iloc()[0] >= 0 + assert "regionf_onehotencoded_2" in list(fi_df["name"]) + assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"]) + + +@pytest.mark.skip( + reason="Need to get tests working for new version of feature importances" +) +def test_step_3_get_feature_importances_probit( + spark, + training_conf, + training, + state_dist_path, + datasource_training_input, + potential_matches_path, + spark_test_tmp_dir_path, + matching, +): + """ Test running the chosen model on potential matches dataset """ + td_path, pa_path, pb_path = datasource_training_input + + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = td_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + + training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5} + + # training_conf["training"]["use_potential_matches_features"] = True + training_conf["training"]["score_with_model"] = True + training_conf["training"]["feature_importances"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + training.spark.read.csv( + potential_matches_path, header=True, inferSchema=True + ).write.mode("overwrite").saveAsTable("potential_matches") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + matching.run_step(2) + training.run_step(3) + + fi_df = training.spark.table("feature_importances").toPandas() + + assert fi_df.shape == (6, 3) + assert 25 > fi_df.query("idx == 0")["score"].iloc()[0] >= -5 + assert 
"regionf_onehotencoded_2" in list(fi_df["name"]) + assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"]) diff --git a/hlink/tests/plugins/__init__.py b/hlink/tests/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlink/tests/plugins/datasources.py b/hlink/tests/plugins/datasources.py new file mode 100755 index 0000000..897fc73 --- /dev/null +++ b/hlink/tests/plugins/datasources.py @@ -0,0 +1,926 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.tests +import pytest +import os +from pyspark.sql.types import * +from pyspark.sql import Row + + +@pytest.fixture(scope="session") +def base_datasources(spark, tmpdir_factory): + """Create a fixture for conf datasource input. These test data are suitable for use in most of the preprocessing tests, and include really messy names for testing some name cleaning transforms, as well as bpl, age, serialp, and sex data. """ + datasources = tmpdir_factory.mktemp("datasources") + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("serialp", StringType(), True), + StructField("namelast", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("namemiddle", StringType(), True), + StructField("bpl", LongType(), True), + StructField("sex", LongType(), True), + StructField("age", LongType(), True), + ] + ) + data_a = [ + { + "id": 10, + "serialp": "A", + "namelast": "", + "namefrst": " John_M ", + "bpl": 100, + "sex": 1, + "age": 23, + }, + { + "id": 20, + "serialp": "B", + "namelast": "Mc Last", + "namefrst": "J Marc'ell III", + "bpl": 200, + "sex": 2, + "age": 30, + }, + { + "id": 30, + "serialp": "B", + "namelast": "L.T.", + "namefrst": "Mr. Jon Jr.", + "bpl": 300, + "sex": 1, + }, + ] + pathname_a = os.path.join(datasources, "df1.parquet") + spark.createDataFrame(data_a, schema=df_schema).write.parquet(pathname_a) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + { + "id": 10, + "serialp": "C", + "namelast": "Name", + "namefrst": "John?", + "namemiddle": "M", + "bpl": 400, + "sex": 1, + }, + { + "id": 30, + "serialp": "D", + "namelast": None, + "namemiddle": None, + "bpl": 500, + "sex": 0, + }, + { + "id": 50, + "serialp": "E", + "namefrst": "Je-an or Jeanie", + "namemiddle": "Marc", + "bpl": 700, + "sex": 2, + }, + ] + pathname_b = os.path.join(datasources, "df2.parquet") + spark.createDataFrame(data_b, schema=df_schema).write.parquet(pathname_b) + return pathname_a, pathname_b + + +@pytest.fixture(scope="session") +def county_dist_datasources(spark, tmpdir_factory): + """Create a fixture for conf datasource input. These test data are suitable for use in testing county distance code calculation as well as the distance calculation itself. 
""" + datasources = tmpdir_factory.mktemp("datasources") + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("namelast", StringType(), True), + StructField("sex", LongType(), True), + StructField("county_p", LongType(), True), + StructField("statefip_p", LongType(), True), + ] + ) + data_a = [ + {"id": 10, "namelast": "Last", "sex": 0, "statefip_p": 3400, "county_p": 170}, + {"id": 20, "namelast": "Last", "sex": 0, "statefip_p": 5500, "county_p": 1210}, + {"id": 30, "namelast": "Lost", "sex": 0, "statefip_p": 1100, "county_p": 44999}, + ] + pathname_a = os.path.join(datasources, "df1.parquet") + spark.createDataFrame(data_a, schema=df_schema).write.parquet(pathname_a) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + {"id": 20, "namelast": "Last", "sex": 0, "statefip_p": 100, "county_p": 10}, + {"id": 40, "namelast": "Last", "sex": 0, "statefip_p": 1200, "county_p": 570}, + ] + pathname_b = os.path.join(datasources, "df2.parquet") + spark.createDataFrame(data_b, schema=df_schema).write.parquet(pathname_b) + return pathname_a, pathname_b + + +@pytest.fixture(scope="function") +def datasource_preprocessing_simple_names(spark, conf, tmpdir_factory): + """ Synthetic data with name variants and sex data, designed for testing name substitution from gendered name files. """ + datasources = tmpdir_factory.mktemp("datasources") + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("sex", LongType(), True), + ] + ) + data_a = [ + {"id": "10ah", "namefrst": "Cat", "sex": 2}, + {"id": "20bc", "namefrst": "Barney", "sex": 1}, + {"id": "34hi", "namefrst": "Cathy", "sex": 2}, + {"id": "54de", "namefrst": "Kat", "sex": 1}, + ] + pathname_a = os.path.join(datasources, "df1.parquet") + spark.createDataFrame(data_a, schema=df_schema).write.parquet(pathname_a) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + {"id": "c23", "namefrst": "Barry", "sex": 1}, + {"id": "d45", "namefrst": "Katie", "sex": 2}, + {"id": "e77", "namefrst": "Bernard", "sex": 1}, + ] + pathname_b = os.path.join(datasources, "df2.parquet") + spark.createDataFrame(data_b, schema=df_schema).write.parquet(pathname_b) + return pathname_a, pathname_b + + +@pytest.fixture(scope="function") +def datasource_synthetic_households(spark, conf, tmpdir_factory): + """ This configuration includes data synthesized for testing the union feature on simple household/neighbors data. 
""" + datasources = tmpdir_factory.mktemp("datasources") + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", StringType(), True), + StructField("namelast", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("neighbors", ArrayType(StringType()), True), + StructField("nonfamily_household", ArrayType(StringType()), True), + ] + ) + data_a = [ + { + "id": "10ah", + "namefrst": "jane", + "neighbors": ["edie", "gerald"], + "nonfamily_household": ["elmer"], + }, + { + "id": "20bc", + "namefrst": "judy", + "neighbors": ["edie", "elmer"], + "nonfamily_household": [], + }, + { + "id": "34hi", + "namefrst": "janice", + "neighbors": ["edie"], + "nonfamily_household": ["edie"], + }, + ] + pathname_a = os.path.join(datasources, "df1.parquet") + spark.createDataFrame(data_a, schema=df_schema).write.parquet(pathname_a) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + {"id": "c23", "neighbors": [], "nonfamily_household": []}, + { + "id": "d45", + "namefrst": "gary", + "neighbors": [], + "nonfamily_household": ["colleen"], + }, + ] + pathname_b = os.path.join(datasources, "df2.parquet") + spark.createDataFrame(data_b, schema=df_schema).write.parquet(pathname_b) + return pathname_a, pathname_b + + +@pytest.fixture(scope="function") +def datasource_real_households(spark, conf, tmpdir_factory): + """ This configuration includes datasets for testing addition of household and neighbors features. It's pulled from a sample of actual census data. """ + + path_a = "input_data/training_data_households.parquet" + path_b = "input_data/households_b.parquet" + + package_path = os.path.dirname(hlink.tests.__file__) + + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + return full_path_a, full_path_b + + +@pytest.fixture(scope="function") +def datasource_19thc_nativity_households_data(spark, conf): + path_a = "input_data/19thc_nativity_test_hhs_a.csv" + path_b = "input_data/19thc_nativity_test_hhs_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + + return full_path_a, full_path_b + + +@pytest.fixture(scope="function") +def datasource_calc_mfbpl_pm_data(spark, conf): + path_a = "input_data/calc_mfbpl_a.csv" + path_b = "input_data/calc_mfbpl_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + + return full_path_a, full_path_b + + +@pytest.fixture(scope="function") +def datasource_matching(spark, conf, matching): + """Create the prepped_df_(a/b) dataframes for testing matching steps """ + + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("serialp", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("namelast", StringType(), True), + StructField("bpl", LongType(), True), + StructField("sex", LongType(), True), + StructField("street", StringType(), True), + StructField("enum_dist", LongType(), True), + ] + ) + data_a = [ + { + "id": 10, + "serialp": "A", + "namefrst": "Firste", + "namelast": "Named", + "bpl": 100, + "sex": 1, + "street": "First Avenue", + "enum_dist": 0, + }, + { + "id": 20, + "serialp": "B", + "namefrst": "Firt", + "namelast": "Last", + "bpl": 
200, + "sex": 2, + "street": "First Ave", + "enum_dist": 0, + }, + { + "id": 30, + "serialp": "B", + "namefrst": "Frost", + "namelast": "Lest", + "bpl": 300, + "sex": 2, + "street": "Lyndale", + "enum_dist": 2, + }, + ] + matching.run_register_python( + "prepped_df_a", + lambda: spark.createDataFrame(data_a, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + { + "id": 10, + "serialp": "C", + "namefrst": "First", + "namelast": "Nameish", + "bpl": 400, + "sex": 1, + "street": "First Ave", + "enum_dist": 0, + }, + { + "id": 30, + "serialp": "D", + "namefrst": "Firt", + "namelast": "Last", + "bpl": 500, + "sex": 2, + "street": "1st Avenue", + "enum_dist": 1, + }, + { + "id": 50, + "serialp": "E", + "namefrst": "Frst", + "namelast": "List", + "bpl": 700, + "sex": 2, + "street": "Franklin", + "enum_dist": 2, + }, + ] + matching.run_register_python( + "prepped_df_b", + lambda: spark.createDataFrame(data_b, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + +@pytest.fixture(scope="function") +def datasource_matching_comparisons(spark, conf, matching): + """Create the prepped_df_(a/b) dataframes for testing matching comparison steps """ + + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("sex", LongType(), True), + StructField("namelast", StringType(), True), + StructField("mbpl", LongType(), True), + StructField("mother_birthyr", LongType(), True), + StructField("stepmom", LongType(), True), + StructField("spouse_bpl", LongType(), True), + StructField("spouse_birthyr", LongType(), True), + StructField("durmarr", LongType(), True), + StructField("mother_namefrst", StringType(), True), + StructField("spouse_namefrst", StringType(), True), + StructField("momloc", LongType(), True), + StructField("sploc", LongType(), True), + ] + ) + data_a = [ + { + "id": 10, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1925, + "stepmom": 0, + "spouse_bpl": 200, + "spouse_birthyr": 1955, + "durmarr": 2, + "mother_namefrst": "eliza", + "spouse_namefrst": "first", + "momloc": 0, + "sploc": 2, + }, + { + "id": 20, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1925, + "stepmom": 0, + "spouse_bpl": 200, + "spouse_birthyr": 1955, + "durmarr": 2, + "mother_namefrst": "ellie", + "spouse_namefrst": "frst", + "momloc": 1, + "sploc": 2, + }, + { + "id": 30, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1925, + "stepmom": 0, + "spouse_bpl": 200, + "spouse_birthyr": 1955, + "durmarr": 2, + "mother_namefrst": "elizabeth", + "spouse_namefrst": "firsty", + "momloc": 3, + "sploc": 0, + }, + ] + matching.run_register_python( + "prepped_df_a", + lambda: spark.createDataFrame(data_a, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + { + "id": 10, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1925, + "stepmom": 0, + "spouse_bpl": 200, + "spouse_birthyr": 1955, + "durmarr": 12, + "mother_namefrst": "eliza", + "spouse_namefrst": "fast", + "momloc": 2, + "sploc": 0, + }, + { + "id": 20, + "sex": 0, + "namelast": "Last", + "mbpl": 200, + "mother_birthyr": 1925, + "stepmom": 0, + "spouse_bpl": 300, + "spouse_birthyr": 1955, + "durmarr": 12, + "mother_namefrst": "eliza", + "momloc": 2, 
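+ # momloc and sploc are IPUMS-style pointers to the mother's and spouse's position within the household; 0 means no such person was located.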
+ "sploc": 0, + }, + { + "id": 30, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1935, + "stepmom": 0, + "spouse_bpl": 200, + "spouse_birthyr": 1961, + "durmarr": 12, + "spouse_namefrst": "fist", + "momloc": 0, + "sploc": 1, + }, + { + "id": 40, + "sex": 0, + "namelast": "Last", + "mbpl": 100, + "mother_birthyr": 1935, + "stepmom": 3, + "spouse_bpl": 200, + "spouse_birthyr": 1955, + "durmarr": 4, + "momloc": 0, + "sploc": 0, + }, + ] + matching.run_register_python( + "prepped_df_b", + lambda: spark.createDataFrame(data_b, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + +@pytest.fixture(scope="function") +def datasource_training(spark, conf, matching): + """Create the prepped_df_(a/b) dataframes and populate basic config values""" + + # Create the first spark dataframe with test data and save it as table + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("serialp", StringType(), True), + StructField("namelast", StringType(), True), + StructField("bpl", LongType(), True), + StructField("sex", LongType(), True), + StructField("region", LongType(), True), + ] + ) + data_a = [ + { + "id": 10, + "serialp": "A", + "namelast": "Name", + "bpl": 100, + "sex": 1, + "region": 1, + }, + { + "id": 20, + "serialp": "B", + "namelast": "Last", + "bpl": 200, + "sex": 2, + "region": 2, + }, + { + "id": 30, + "serialp": "B", + "namelast": "Lest", + "bpl": 300, + "sex": 2, + "region": 2, + }, + ] + + matching.run_register_python( + "prepped_df_a", + lambda: spark.createDataFrame(data_a, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + # Create the second spark dataframe with test data and save it as table + data_b = [ + { + "id": 10, + "serialp": "C", + "namelast": "Nameish", + "bpl": 400, + "sex": 1, + "region": 1, + }, + { + "id": 30, + "serialp": "D", + "namelast": "Last", + "bpl": 500, + "sex": 2, + "region": 2, + }, + { + "id": 50, + "serialp": "E", + "namelast": "List", + "bpl": 700, + "sex": 2, + "region": 2, + }, + ] + + matching.run_register_python( + "prepped_df_b", + lambda: spark.createDataFrame(data_b, schema=df_schema), + persist=True, + overwrite_preexisting_tables=True, + ) + + +@pytest.fixture(scope="function") +def datasource_training_input(spark, conf, tmpdir_factory): + """ This configuration includes datasets for testing specification of input data for a batch training step. 
""" + training_data = "input_data/training_data_long.csv" + prepped_a_data = "input_data/training_data_long_a.csv" + prepped_b_data = "input_data/training_data_long_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + td_path = os.path.join(package_path, training_data) + pa_path = os.path.join(package_path, prepped_a_data) + pb_path = os.path.join(package_path, prepped_b_data) + + return td_path, pa_path, pb_path + + +@pytest.fixture(scope="function") +def datasource_rel_jw_input(spark): + """ Create tables for testing rel_jaro_winkler""" + + schema = StructType( + [ + StructField("id", StringType(), True), + StructField( + "namefrst_related_rows_birthyr", + ArrayType( + StructType( + [ + StructField("birthyr", LongType(), True), + StructField("namefrst_std", StringType(), True), + StructField("sex", LongType(), True), + ] + ), + True, + ), + True, + ), + StructField( + "namefrst_related_rows", + ArrayType( + StructType( + [ + StructField("namefrst_std", StringType(), True), + StructField("replaced_birthyr", LongType(), True), + StructField("sex", LongType(), True), + ] + ), + True, + ), + True, + ), + ] + ) + + table_a = spark.createDataFrame( + [ + ( + 0, + [ + Row(namefrst_std="martha", birthyr=1855, sex=2), + Row(namefrst_std="minnie", birthyr=1857, sex=2), + Row(namefrst_std="martin", birthyr=1859, sex=1), + ], + [ + Row(namefrst_std="martha", replaced_birthyr=1855, sex=2), + Row(namefrst_std="minnie", replaced_birthyr=1857, sex=2), + Row(namefrst_std="martin", replaced_birthyr=1859, sex=1), + ], + ) + ], + schema, + ) + + table_b = spark.createDataFrame( + [ + ( + 0, + [Row(namefrst_std="martha", birthyr=1855, sex=2)], + [Row(namefrst_std="martha", replaced_birthyr=1855, sex=2)], + ), + ( + 1, + [Row(namefrst_std="tanya", birthyr=1855, sex=2)], + [Row(namefrst_std="tanya", replaced_birthyr=1855, sex=2)], + ), + ], + schema, + ) + + return table_a, table_b + + +@pytest.fixture(scope="function") +def datasource_extra_children_input(spark): + """ Create tables for testing rel_children""" + schema = StructType( + [ + StructField("histid", LongType(), True), + StructField("relate", LongType(), True), + StructField( + "namefrst_related_rows", + ArrayType( + StructType( + [ + StructField("histid", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("birthyr", LongType(), True), + StructField("sex", LongType(), True), + StructField("relate", LongType(), True), + ] + ), + True, + ), + True, + ), + ] + ) + + table_a = spark.createDataFrame( + [ + ( + 0, + 101, # head of household + [ + Row( + histid=1, namefrst="martha", birthyr=1855, sex=2, relate=201 + ), # age 45 + Row( + histid=2, namefrst="minnie", birthyr=1887, sex=2, relate=301 + ), # age 13 + Row( + histid=3, namefrst="martin", birthyr=1897, sex=1, relate=301 + ), # age 3 + ], + ), + ( + 4, + 301, # child in the household + [ + Row( + histid=5, namefrst="george", birthyr=1887, sex=1, relate=301 + ), # age 18 + Row( + histid=6, namefrst="marty", birthyr=1897, sex=1, relate=301 + ), # age 3 + Row( + histid=7, namefrst="jean", birthyr=1835, sex=2, relate=601 + ), # age 65, mother in law + ], + ), + ( + 7, + 601, + [ + Row( + histid=5, namefrst="george", birthyr=1887, sex=1, relate=301 + ), # age 18 + Row( + histid=6, namefrst="marty", birthyr=1897, sex=1, relate=301 + ), # age 3 + Row(histid=4, namefrst="joe", birthyr=1896, sex=1, relate=301), + ], + ), + (17, 101, []), + ], + schema, + ) + + table_b = spark.createDataFrame( + [ + ( + 8, + 101, + [ + Row(histid=9, namefrst="martha", birthyr=1855, 
sex=2, relate=201), + Row(histid=10, namefrst="martin", birthyr=1897, sex=1, relate=301), + ], + ), + ( + 11, + 301, + [ + Row(histid=12, namefrst="marc", birthyr=1888, sex=1, relate=301), + Row(histid=13, namefrst="tanya", birthyr=1899, sex=2, relate=301), + Row(histid=14, namefrst="erik", birthyr=1902, sex=1, relate=301), + ], + ), + ( + 15, + 101, + [Row(histid=16, namefrst="marty", birthyr=1888, sex=1, relate=101)], + ), + ], + schema, + ) + + return table_a, table_b + + +@pytest.fixture(scope="function") +def matching_test_input(spark, conf, tmpdir_factory): + """ This configuration includes datasets for testing matching steps. """ + prepped_a_data = "input_data/matching_test_a.csv" + prepped_b_data = "input_data/matching_test_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + pa_path = os.path.join(package_path, prepped_a_data) + pb_path = os.path.join(package_path, prepped_b_data) + + schema = StructType( + [ + StructField("id", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("namelast", StringType(), True), + StructField("birthyr", LongType(), True), + StructField("sex", LongType(), True), + ] + ) + + pdfa = spark.read.csv(pa_path, schema) + pdfb = spark.read.csv(pb_path, schema) + + return pdfa, pdfb + + +@pytest.fixture(scope="function") +def datasource_mi_comparison(spark, conf): + """Create the prepped_df_(a/b) dataframes and populate basic config values""" + + # Create the first spark dataframe with test data and save it as table + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("namefrst_mid_init", StringType(), True), + ] + ) + data_a = [ + {"id": 10, "namefrst_mid_init": "a"}, + {"id": 20, "namefrst_mid_init": "b"}, + {"id": 30}, + ] + table_a = spark.createDataFrame(data_a, schema=df_schema) + + data_b = [ + {"id": 40, "namefrst_mid_init": "a"}, + {"id": 50, "namefrst_mid_init": ""}, + {"id": 60}, + ] + + table_b = spark.createDataFrame(data_b, schema=df_schema) + + return table_a, table_b + + +@pytest.fixture(scope="session") +def datasource_unrestricted_blank_columns(spark, tmpdir_factory): + """Create a fixture for conf datasource input. These test data are suitable for use in the preprocessing tests which check for all-space columns in unrestricted data file. """ + + datasources = tmpdir_factory.mktemp("datasources") + # Create the first spark dataframe with test data and save it as parquet + df_schema = StructType( + [ + StructField("id", LongType(), True), + StructField("serialp", StringType(), True), + StructField("namelast", StringType(), True), + StructField("namefrst", StringType(), True), + StructField("namemiddle", StringType(), True), + StructField("bpl", LongType(), True), + StructField("sex", LongType(), True), + StructField("age", LongType(), True), + StructField("street", StringType(), True), + ] + ) + data_a = [ + { + "id": 10, + "serialp": "A", + "namelast": " ", + "namefrst": " John_M ", + "bpl": 100, + "sex": 1, + "age": 23, + "street": " ", + }, + { + "id": 20, + "serialp": "B", + "namelast": " ", + "namefrst": "J Marc'ell III", + "bpl": 200, + "sex": 2, + "age": 30, + "street": " ", + }, + { + "id": 30, + "serialp": "B", + "namelast": " ", + "namefrst": "Mr. 
Jon Jr.", + "bpl": 300, + "sex": 1, + "street": " ", + }, + ] + pathname_a = os.path.join(datasources, "df1.parquet") + spark.createDataFrame(data_a, schema=df_schema).write.parquet(pathname_a) + + # Create the second spark dataframe with test data and save it as parquet + data_b = [ + { + "id": 10, + "serialp": "C", + "namelast": "Name", + "namefrst": "John?", + "namemiddle": "M", + "bpl": 400, + "sex": 1, + }, + { + "id": 30, + "serialp": "D", + "namelast": None, + "namemiddle": None, + "bpl": 500, + "sex": 0, + }, + { + "id": 50, + "serialp": "E", + "namefrst": "Je-an or Jeanie", + "namemiddle": "Marc", + "bpl": 700, + "sex": 2, + }, + ] + pathname_b = os.path.join(datasources, "df2.parquet") + spark.createDataFrame(data_b, schema=df_schema).write.parquet(pathname_b) + return pathname_a, pathname_b + + +@pytest.fixture(scope="function") +def datasource_sql_condition_input(spark, conf, tmpdir_factory): + """ This configuration includes datasets for testing specification of input data for a batch training step. """ + prepped_a_data = "input_data/sql_condition_marst_warn_a.csv" + prepped_b_data = "input_data/sql_condition_marst_warn_b.csv" + potential_matches = "input_data/potential_matches_sql_condition_marst_warn.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + pa_path = os.path.join(package_path, prepped_a_data) + pb_path = os.path.join(package_path, prepped_b_data) + pm_path = os.path.join(package_path, potential_matches) + + return pa_path, pb_path, pm_path diff --git a/hlink/tests/plugins/external_data_paths.py b/hlink/tests/plugins/external_data_paths.py new file mode 100755 index 0000000..a33af3a --- /dev/null +++ b/hlink/tests/plugins/external_data_paths.py @@ -0,0 +1,406 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import hlink.tests +import pytest +import os + + +@pytest.fixture(scope="module") +def handle_null_path(spark): + """Create a fixture with the path to the region codes file""" + + path = "input_data/handle_null.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def region_code_path(spark): + """Create a fixture with the path to the region codes file""" + + path = "input_data/regioncode.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def substitutions_womens_names_path(spark): + """ Create a fixture with the path to Jonas's file for name substitutions for sex = 2""" + path = "input_data/female.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def birthyr_replace_path(spark): + """ Create a fixture with the path to Jonas's file for name substitutions for sex = 2""" + path = "input_data/birthyr_replace.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def substitutions_mens_names_path(spark): + """ Create a fixture with the path to Jonas's file for name substitutions for sex = 1""" + path = "input_data/male.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + 
return full_path + + +@pytest.fixture(scope="module") +def substitutions_street_abbrevs_path(spark): + """ Create a fixture with the path to Jonas's file of common street abbreviation substitutions""" + path = "input_data/street_abbrevs_most_common.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def test_street_names_data_path(spark): + """ Create a fixture with the path to the street names test data file""" + path = "input_data/test_street_names_data.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def state_dist_path(spark): + """Create a fixture with the path to the distances lookup file""" + + path = "input_data/statedist.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def county_dist_path(spark): + """Create a fixture with the path to the distances lookup file""" + + path = "input_data/county_distances.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def ext_path_preprocessing_popularity(spark): + """ Create a fixture with the path to the test popularity csv file """ + + path = "input_data/popularity.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def potential_matches_path(spark): + """ Create a fixture with the path to the test potential_matches csv file """ + + path = "input_data/potential_matches.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def potential_matches_path_ids_only(spark): + """ Create a fixture with the path to the test potential_matches csv file """ + + path = "input_data/potential_matches_ids_only.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def potential_matches_agg_path(spark): + """ Create a fixture with the path to the test potential_matches csv file """ + + path = "input_data/potential_matches_agg.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def nativity_datasources(spark): + """Create a fixture with the paths to the nativity test data files""" + + path_a = "input_data/nativity_test_data_a.csv" + path_b = "input_data/nativity_test_data_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + + return full_path_a, full_path_b + + +@pytest.fixture(scope="module") +def training_data_path(spark): + """Create a fixture with the path to the test training data file""" + + path = "input_data/training_data.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def training_data_doubled_path(spark): + """Create a fixture with the path to the doubled test training data file""" + + path = 
"input_data/training_data_doubled.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def threshold_ratio_data_path(spark): + """Create a fixture with the path to the test training data file""" + + path = "input_data/threshold_ratio_test.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def threshold_ratio_data_path_2(spark): + """Create a fixture with the path to the test training data file""" + + path = "input_data/threshold_ratio_test_data_2.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path = os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def hh_matching_stubs(spark): + """Create a fixture with the path to the test training data file""" + + path_a = "input_data/hh_year_a.csv" + path_b = "input_data/hh_year_b.csv" + path_matches = "input_data/scored_matches_household_test.csv" + path_pred_matches = "input_data/predicted_matches_test.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + full_path_matches = os.path.join(package_path, path_matches) + full_path_pred_matches = os.path.join(package_path, path_pred_matches) + + return full_path_a, full_path_b, full_path_matches, full_path_pred_matches + + +@pytest.fixture(scope="module") +def hh_integration_test_data(spark): + """Create a fixture with the path to the test training data file""" + + path_a = "input_data/hh_real_a.csv" + path_b = "input_data/hh_real_b.csv" + path_matches = "input_data/matched_men.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + full_path_matches = os.path.join(package_path, path_matches) + + return full_path_a, full_path_b, full_path_matches + + +@pytest.fixture(scope="module") +def scored_matches_test_data(spark): + """Create a fixture with the path to the test training data file""" + + path_matches = "input_data/scored_matches_test_data.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_matches = os.path.join(package_path, path_matches) + + return full_path_matches + + +@pytest.fixture(scope="module") +def hh_agg_features_test_data(spark): + """Create a fixture with the path to the test training data file""" + + path_a = "input_data/ha_source.csv" + path_b = "input_data/hb_source.csv" + path_pms = "input_data/hhpm_agg_test.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_a = os.path.join(package_path, path_a) + full_path_b = os.path.join(package_path, path_b) + full_path_pms = os.path.join(package_path, path_pms) + + return full_path_a, full_path_b, full_path_pms + + +@pytest.fixture(scope="module") +def hh_training_data_path(spark): + """ Create a fixture with the path to the test HH training data file """ + + td_path = "input_data/new_hh_test_td.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path_td = os.path.join(package_path, td_path) + + return full_path_td + + +@pytest.fixture(scope="module") +def training_validation_path(spark): + """Create a fixture with the path to the test training data file""" + + path = "validation_data/training_all.parquet" + + package_path = os.path.dirname(hlink.tests.__file__) + full_path 
= os.path.join(package_path, path) + + return full_path + + +@pytest.fixture(scope="module") +def reporting_test_data_r2_pct(spark): + """Create a fixture with the paths to the reporting test data files""" + + pdfa_path = "input_data/reporting_prepped_df_a.csv" + pm_path = "input_data/reporting_predicted_matches.csv" + hhpm_path = "input_data/reporting_hh_predicted_matches.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_pdfa_path = os.path.join(package_path, pdfa_path) + full_pm_path = os.path.join(package_path, pm_path) + full_hhpm_path = os.path.join(package_path, hhpm_path) + + return full_pdfa_path, full_pm_path, full_hhpm_path + + +@pytest.fixture(scope="module") +def reporting_test_data_representivity(spark): + """Create a fixture with the paths to the representivity reporting test data files""" + + rdf_path = "input_data/raw_df_reporting.csv" + pdf_path = "input_data/prepped_df_reporting.csv" + pm_path = "input_data/predicted_matches_reporting.csv" + hhpm_path = "input_data/hh_predicted_matches_reporting.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_rdf_path = os.path.join(package_path, rdf_path) + full_pdf_path = os.path.join(package_path, pdf_path) + full_pm_path = os.path.join(package_path, pm_path) + full_hhpm_path = os.path.join(package_path, hhpm_path) + + return full_rdf_path, full_pdf_path, full_pm_path, full_hhpm_path + + +@pytest.fixture(scope="module") +def test_data_rel_rows_age(spark): + """Create a fixture with the paths to the related rows age test data files""" + + raw_a = "input_data/rel_rows_test_a.csv" + raw_b = "input_data/rel_rows_test_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_a_path = os.path.join(package_path, raw_a) + full_b_path = os.path.join(package_path, raw_b) + + return full_a_path, full_b_path + + +@pytest.fixture(scope="module") +def test_data_blocking_double_comparison(spark): + """Create a fixture with the paths to the blocking double comparison test data files""" + + raw_a = "input_data/jw_blocking_test_a.csv" + raw_b = "input_data/jw_blocking_test_b.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_a_path = os.path.join(package_path, raw_a) + full_b_path = os.path.join(package_path, raw_b) + + return full_a_path, full_b_path + + +@pytest.fixture(scope="module") +def crosswalk_input_paths(spark): + """Create a fixture with the paths to the crosswalk test data files""" + + raw_df_a = "input_data/crosswalk/raw_df_a.csv" + raw_df_b = "input_data/crosswalk/raw_df_b.csv" + predicted_matches = "input_data/crosswalk/predicted_matches.csv" + hh_predicted_matches = "input_data/crosswalk/hh_predicted_matches.csv" + + package_path = os.path.dirname(hlink.tests.__file__) + full_raw_df_a_path = os.path.join(package_path, raw_df_a) + full_raw_df_b_path = os.path.join(package_path, raw_df_b) + full_predicted_matches_path = os.path.join(package_path, predicted_matches) + full_hh_predicted_matches_path = os.path.join(package_path, hh_predicted_matches) + + return ( + full_raw_df_a_path, + full_raw_df_b_path, + full_predicted_matches_path, + full_hh_predicted_matches_path, + ) + + +@pytest.fixture(scope="module") +def crosswalk_validation_path(spark): + package_path = os.path.dirname(hlink.tests.__file__) + return os.path.join(package_path, "validation_data/crosswalks/crosswalk.csv") + + +@pytest.fixture(scope="module") +def crosswalk_with_round_validation_path(spark): + package_path = os.path.dirname(hlink.tests.__file__) + return os.path.join( + package_path, "validation_data/crosswalks/crosswalk_with_round.csv" + ) diff --git 
a/hlink/tests/preprocessing_test.py b/hlink/tests/preprocessing_test.py new file mode 100755 index 0000000..5c35508 --- /dev/null +++ b/hlink/tests/preprocessing_test.py @@ -0,0 +1,1845 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import os +import pandas as pd +import pytest +from pyspark.sql.types import StructType, StructField, LongType +from hlink.errors import DataError + + +def test_step_0(preprocessing, spark, preprocessing_conf): + """ Test preprocessing step 0 to ensure that temporary raw_df_unpartitioned_(a/b) tables are created (exact copies of datasources from config). Also test that the persistent raw_df_(a/b) tables are created. Should be same as raw datasources with filters applied""" + + # Run the preprocessing step (this will use the test data) + preprocessing.run_step(0) + + # Create pandas DFs of the step_0 preprocessed test data + up_pdf_a = spark.table("raw_df_unpartitioned_a").toPandas() + up_pdf_b = spark.table("raw_df_unpartitioned_b").toPandas() + + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # Make assertions on the data + assert pdf_a.query("id == 10")["serialp"].iloc[0] == "A" + assert pdf_a.query("id == 20")["serialp"].iloc[0] == "B" + + assert pdf_b.query("id == 10")["serialp"].iloc[0] == "C" + assert pdf_b.query("id == 30")["serialp"].iloc[0] == "D" + + assert up_pdf_a.query("id == 10")["serialp"].iloc[0] == "A" + assert up_pdf_a.query("id == 30")["serialp"].iloc[0] == "B" + + assert up_pdf_b.query("id == 30")["serialp"].iloc[0] == "D" + assert up_pdf_b.query("id == 50")["serialp"].iloc[0] == "E" + + +def test_step_0_datasource_parquet_file( + preprocessing, spark, preprocessing_conf, input_data_dir_path +): + """ Test preprocessing step 0 to ensure that temporary raw_df_unpartitioned_(a/b) tables are created (exact copies of datasources from config). Also test that the persistent raw_df_(a/b) tables are created. Should be same as raw datasources with filters applied""" + + preprocessing_conf["datasource_a"] = { + "parquet_file": os.path.join( + input_data_dir_path, "test_parquet_data_a.parquet" + ), + "alias": "parquet_file_test_a", + } + + # Run the preprocessing step (this will use the test data) + preprocessing.run_step(0) + + # Create pandas DFs of the step_0 preprocessed test data + up_pdf_a = spark.table("raw_df_unpartitioned_a").toPandas() + + pdf_a = spark.table("raw_df_a").toPandas() + + # Make assertions on the data + assert pdf_a.query("id == 10")["bpl"].iloc[0] == 120 + assert pdf_a.query("id == 20")["bpl"].iloc[0] == 240 + + assert up_pdf_a.query("id == 30")["bpl"].iloc[0] == 360 + assert up_pdf_a.query("id == 20")["bpl"].iloc[0] == 240 + + +def test_step_0_datasource_file_parquet( + preprocessing, spark, preprocessing_conf, input_data_dir_path +): + """ Test preprocessing step 0 to ensure that temporary raw_df_unpartitioned_(a/b) tables are created (exact copies of datasources from config). Also test that the persistent raw_df_(a/b) tables are created. 
Should be same as raw datasources with filters applied""" + + preprocessing_conf["datasource_a"] = { + "file": os.path.join(input_data_dir_path, "test_parquet_data_b.parquet"), + "alias": "parquet_file_test_b", + } + + # Run the preprocessing step (this will use the test data) + preprocessing.run_step(0) + + # Create pandas DFs of the step_0 preprocessed test data + up_pdf_a = spark.table("raw_df_unpartitioned_a").toPandas() + + pdf_a = spark.table("raw_df_a").toPandas() + + # Make assertions on the data + assert pdf_a.query("id == 10")["bpl"].iloc[0] == 460 + assert pdf_a.query("id == 30")["bpl"].iloc[0] == 540 + + assert up_pdf_a.query("id == 10")["bpl"].iloc[0] == 460 + assert up_pdf_a.query("id == 50")["bpl"].iloc[0] == 710 + + +def test_step_0_datasource_file_csv( + preprocessing, spark, preprocessing_conf, input_data_dir_path +): + """ Test preprocessing step 0 to ensure that temporary raw_df_unpartitioned_(a/b) tables are created (exact copies of datasources from config). Also test that the persistent raw_df_(a/b) tables are created. Should be same as raw datasources with filters applied""" + + preprocessing_conf["datasource_a"] = { + "file": os.path.join(input_data_dir_path, "test_csv_data_a.csv"), + "alias": "csv_file_test_a", + } + preprocessing_conf["datasource_b"] = { + "file": os.path.join(input_data_dir_path, "test_csv_data_b.csv"), + "alias": "csv_file_test_b", + } + + # Run the preprocessing step (this will use the test data) + preprocessing.run_step(0) + + # Create pandas DFs of the step_0 preprocessed test data + up_pdf_a = spark.table("raw_df_unpartitioned_a").toPandas() + up_pdf_b = spark.table("raw_df_unpartitioned_b").toPandas() + + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # Make assertions on the data + assert pdf_a.query("id == 10")["bpl"].iloc[0] == 120 + assert pdf_a.query("id == 30")["bpl"].iloc[0] == 360 + + assert pdf_b.query("id == 30")["bpl"].iloc[0] == 540 + assert pdf_b.query("id == 50")["bpl"].iloc[0] == 710 + + assert up_pdf_a.query("id == 20")["bpl"].iloc[0] == 240 + assert up_pdf_a.query("id == 10")["bpl"].iloc[0] == 120 + + assert up_pdf_b.query("id == 10")["bpl"].iloc[0] == 460 + assert up_pdf_b.query("id == 30")["bpl"].iloc[0] == 540 + + +def test_step_0_filters_training_data(preprocessing, spark, preprocessing_conf): + """ Test filter run in preprocessing step 0 which selects any person in a household which has a person who is also represented in the test data""" + + # create some training data we can use for testing + td_schema = StructType( + [StructField("id_a", LongType(), True), StructField("id_b", LongType(), True)] + ) + data_training = [{"id_a": 10, "id_b": 30}, {"id_a": 20, "id_b": 40}] + td = spark.createDataFrame(data_training, schema=td_schema) + + # create the training_data table + preprocessing.run_register_python( + name="training_data", func=lambda: td, persist=True + ) + + # add a filter to the config to filter only for households in the training data set + preprocessing_conf["filter"] = [{"training_data_subset": True}] + assert preprocessing_conf["filter"] != [] + + # run the preprocessing step which includes filtering as a function + preprocessing.run_step(0) + + # Create pandas DFs of the step_0 preprocessed test data + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # Make assertions on the data + assert len(pdf_a.id) == 3 + assert len(pdf_b.id) == 1 + assert pdf_a.query("id == 20")["namelast"].iloc[0] == "Mc Last" + assert 
pd.isnull(pdf_b.query("id == 30")["namelast"].iloc[0]) + + +def test_step_0_filters_expression(preprocessing, spark, preprocessing_conf): + """ Test a filter run in preprocessing step 0 which selects rows from the raw data according to an expression""" + + # overwrite the config filter value to include only an expression type filter + preprocessing_conf["filter"] = [ + {"expression": "namelast is not null and namelast != ''"} + ] + assert preprocessing_conf["filter"] != [] + + # run the preprocessing step which includes filtering as a function + preprocessing.run_step(0) + + # create pandas DFs of the step_0 preprocessed test data + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # make assertions on the data + assert len(pdf_a.id) == 2 + assert len(pdf_b.id) == 1 + assert not pdf_a.namelast.isnull().values.any() + assert not pdf_b.namelast.isnull().values.any() + + +def test_step_0_filters_expression_and_household( + preprocessing, spark, preprocessing_conf +): + """ Test a filter run in preprocessing step 0 which selects rows from the raw data according to an expression AND includes any other entries with the same household ID (serialp)""" + + # overwrite the config filter value to include an expression filter which includes a household:true argument. + # Note: in a household filter, the variables for household ID in each input file must be specified in the filter. (This is the "serial_a": "serialp" bit below.) + + preprocessing_conf["filter"] = [ + { + "expression": "namelast == 'Mc Last' or namelast == 'Name'", + "household": True, + "serial_a": "serialp", + "serial_b": "serialp", + } + ] + + # run the preprocessing step which includes filtering as a function + preprocessing.run_step(0) + + # create pandas DFs of the step_0 preprocessed test data + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # make assertions on the data + assert len(pdf_a.id) == 2 + assert len(pdf_b.id) == 1 + assert not pdf_a.namelast.isnull().values.any() + assert not pdf_b.namelast.isnull().values.any() + + +def test_step_0_filters_datasource(preprocessing, spark, preprocessing_conf): + """Test a filter run in preprocessing step 0 which selects rows from the raw data according + to an expression AND only applies the expression to a specified datasource (a or b)""" + + # overwrite the config filter value to include an expression filter which includes a datasource argument. + preprocessing_conf["filter"] = [ + {"expression": "id == 30", "datasource": "a"}, + {"expression": "id == 10", "datasource": "b"}, + ] + assert preprocessing_conf["filter"] != [] + + # run the preprocessing step which includes filtering as a function + preprocessing.run_step(0) + + # create pandas DFs of the step_0 preprocessed test data + pdf_a = spark.table("raw_df_a").toPandas() + pdf_b = spark.table("raw_df_b").toPandas() + + # make assertions on the data + assert len(pdf_a.id) == 1 + assert len(pdf_b.id) == 1 + assert pdf_a.id[0] == 30 + assert pdf_b.id[0] == 10 + + +def test_step_0_check_for_all_spaces_unrestricted_data( + preprocessing, spark, preprocessing_conf_all_space_columns, capsys +): + """ Tests the check in preprocessing that looks for all-space columns, as found in unrestricted data files. 
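
To make the filter semantics concrete: an "expression" filter is just a SQL predicate applied to the raw tables, and a "datasource" key restricts it to raw_df_a or raw_df_b. A minimal sketch in plain PySpark with invented toy data (an illustration only, not hlink's implementation):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
raw_a = spark.createDataFrame(
    [(10, "Name"), (20, "Mc Last"), (30, None)], ["id", "namelast"]
)
# The config's "expression" string is usable directly as a filter predicate.
filtered_a = raw_a.filter("namelast is not null and namelast != ''")
assert filtered_a.count() == 2
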
""" + with pytest.raises(DataError, match=r"\: namelast, street\."): + preprocessing.run_step(0) + + +def test_step_1_transform_attach_variable( + preprocessing, spark, preprocessing_conf, region_code_path +): + """Test the transform "attach_variable" -- used to add a feature column from CSV data""" + preprocessing_conf["column_mappings"] = [{"column_name": "bpl"}] + preprocessing_conf["feature_selections"] = [ + { + "input_column": "bpl", + "output_column": "region", + "transform": "attach_variable", + "region_dict": region_code_path, + "col_to_join_on": "bpl", + "col_to_add": "region", + "null_filler": 99, + "col_type": "int", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert len(pdf_a.region) == 3 + assert len(pdf_b.region) == 3 + assert pdf_a.query("id == 10")["region"].iloc[0] == 6 + assert pdf_a.query("id == 30")["region"].iloc[0] == 99 + assert pdf_b.query("id == 10")["region"].iloc[0] == 8 + assert pdf_b.query("id == 50")["region"].iloc[0] == 99 + + +def test_step_1_transform_hash(preprocessing, spark, preprocessing_conf): + """Test the transform "attach_variable" -- used to add a feature column from CSV data""" + preprocessing_conf["column_mappings"].append( + { + "column_name": "namefrst", + "alias": "namefrst_clean", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + } + ) + + preprocessing_conf["feature_selections"] = [ + { + "input_column": "namefrst_clean", + "output_column": "namefrst_bigrams", + "transform": "bigrams", + "no_first_pad": True, + }, + { + "input_column": "namefrst_bigrams", + "output_column": "namefrst_bigrams_hash", + "transform": "hash", + "number": 5, + }, + ] + + preprocessing_conf["filter"] = [ + {"expression": "namefrst is not null and namefrst != ''"} + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert len(pdf_a.namefrst_bigrams_hash_count) == 3 + assert len(pdf_a.namefrst_bigrams_hash) == 3 + + assert len(pdf_b.namefrst_bigrams_hash_count) == 2 + assert len(pdf_b.namefrst_bigrams_hash) == 2 + + assert len(pdf_a.query("id == 10")["namefrst_bigrams_hash_count"].iloc[0]) == 17 + assert len(pdf_a.query("id == 10")["namefrst_bigrams_hash"].iloc[0]) == 5 + + assert len(pdf_b.query("id == 10")["namefrst_bigrams_hash_count"].iloc[0]) == 6 + assert len(pdf_b.query("id == 10")["namefrst_bigrams_hash"].iloc[0]) == 5 + + +def test_step_1_transform_override( + preprocessing, spark, preprocessing_conf, region_code_path +): + """Test a column mapping with transform and OVERRIDE column for a specified datasource -- used to generate a feature from a column from one datasource and use a pre-existing column from the other datasource""" + preprocessing_conf["column_mappings"] = [ + { + "column_name": "namefrst", + "alias": "namefrst_mid_init", + "override_column_b": "namemiddle", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", 
"viii"], + }, + {"type": "remove_prefixes", "values": ["mr"]}, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + {"type": "split"}, + {"type": "array_index", "value": 1}, + ], + "override_transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 10")["namefrst_mid_init"].iloc[0] == "m" + assert pdf_a.query("id == 20")["namefrst_mid_init"].iloc[0] == "marc" + assert pd.isnull(pdf_a.query("id == 30")["namefrst_mid_init"].iloc[0]) + assert pdf_b.query("id == 10")["namefrst_mid_init"].iloc[0] == "m" + assert pdf_b.query("id == 50")["namefrst_mid_init"].iloc[0] == "marc" + assert pd.isnull(pdf_b.query("id == 30")["namefrst_mid_init"].iloc[0]) + + +def test_step_1_transforms_namefrst_soundex( + preprocessing, spark, preprocessing_conf, region_code_path +): + """Test a column mapping with string-based transforms on firstname removing middle name """ + preprocessing_conf["column_mappings"] = [ + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_prefixes", "values": ["mr"]}, + {"type": "remove_alternate_names"}, + {"type": "remove_one_letter_names"}, + {"type": "condense_strip_whitespace"}, + {"type": "split"}, + {"type": "array_index", "value": 0}, + ], + } + ] + + preprocessing_conf["feature_selections"] = [ + { + "input_column": "namefrst_std", + "output_column": "namefrst_soundex", + "transform": "soundex", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 10")["namefrst_std"].iloc[0] == "john" + assert pdf_a.query("id == 20")["namefrst_std"].iloc[0] == "marc" + assert pdf_a.query("id == 30")["namefrst_std"].iloc[0] == "jon" + assert pdf_a.query("namefrst_std == 'john'")["namefrst_soundex"].iloc[0] == "J500" + assert pdf_a.query("namefrst_std == 'marc'")["namefrst_soundex"].iloc[0] == "M620" + + assert pdf_b.query("id == 10")["namefrst_std"].iloc[0] == "john" + assert pd.isnull(pdf_b.query("id == 30")["namefrst_std"].iloc[0]) + assert pdf_b.query("id == 50")["namefrst_std"].iloc[0] == "jean" + assert pdf_b.query("namefrst_std == 'jean'")["namefrst_soundex"].iloc[0] == "J500" + + +def test_step_1_transforms_prefix_suffix( + preprocessing, spark, preprocessing_conf, region_code_path +): + """Test a column mapping with different string transforms """ + preprocessing_conf["column_mappings"] = [ + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "rationalize_name_words"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + { + "type": "remove_suffixes", + "values": ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"], + }, + {"type": "remove_prefixes", "values": ["mr"]}, + 
{"type": "remove_alternate_names"}, + {"type": "condense_strip_whitespace"}, + ], + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 10")["namefrst_std"].iloc[0] == "john m" + assert pdf_a.query("id == 20")["namefrst_std"].iloc[0] == "j marc ell" + assert pdf_a.query("id == 30")["namefrst_std"].iloc[0] == "jon" + + assert pdf_b.query("id == 10")["namefrst_std"].iloc[0] == "john" + assert pd.isnull(pdf_b.query("id == 30")["namefrst_std"].iloc[0]) + assert pdf_b.query("id == 50")["namefrst_std"].iloc[0] == "jean" + + +def test_step_1_transform_adv_str(preprocessing, spark, preprocessing_conf): + """ Test a column mapping with remaining transforms """ + preprocessing_conf["column_mappings"] = [ + { + "column_name": "namelast", + "alias": "namelast_std", + "transforms": [ + {"type": "lowercase_strip"}, + { + "type": "condense_prefixes", + "values": ["mc", "mac", "o", "de", "van", "di"], + }, + ], + }, + {"column_name": "sex"}, + { + "column_name": "sex", + "alias": "sex_int", + "transforms": [{"type": "cast_as_int"}], + }, + { + "column_name": "namelast_std", + "alias": "namelast_std_len", + "transforms": [{"type": "length"}], + }, + { + "alias": "namelast_init", + "column_name": "namelast_std", + "transforms": [{"type": "substring", "values": [0, 1]}], + }, + { + "column_name": "sex_int", + "alias": "sex_mapped", + "transforms": [{"type": "mapping", "mappings": {"1": "M", "2": "F"}}], + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("id == 20")["namelast_std"].iloc[0] == "mclast" + assert pdf_a.query("id == 10")["sex_int"].iloc[0] == 1 + assert pdf_a.query("id == 20")["namelast_std_len"].iloc[0] == 6 + assert pdf_a.query("id == 20")["namelast_init"].iloc[0] == "m" + assert pdf_a.query("id == 30")["namelast_init"].iloc[0] == "l" + assert pdf_a.query("id == 10")["sex_mapped"].iloc[0] == "M" + assert pdf_a.query("id == 20")["sex_mapped"].iloc[0] == "F" + + +def test_step_1_transform_neighbor_agg( + preprocessing, spark, preprocessing_conf_household_data +): + """ Test neighbor_aggregate transform on data containing households """ + preprocessing_conf_household_data["feature_selections"] = [ + { + "output_column": "namelast_neighbors", + "input_column": "namelast_clean", + "transform": "neighbor_aggregate", + "neighborhood_column": "enumdist", + "sort_column": "serial", + "range": 5, + } + ] + + preprocessing.run_step(0) + + rda = spark.table("raw_df_a") + rda = ( + rda.withColumn("enumdist_tmp", rda["enumdist"].cast("bigint")) + .drop("enumdist") + .withColumnRenamed("enumdist_tmp", "enumdist") + ) + rda = ( + rda.withColumn("serial_tmp", rda["serial"].cast("bigint")) + .drop("serial") + .withColumnRenamed("serial_tmp", "serial") + ) + rda = ( + rda.withColumn("pernum_tmp", rda["pernum"].cast("bigint")) + .drop("pernum") + .withColumnRenamed("pernum_tmp", "pernum") + ) + rda.write.mode("overwrite").saveAsTable("raw_df_a_tmp") + spark.sql("drop table raw_df_a") + spark.sql("alter table raw_df_a_tmp rename to raw_df_a") + + rdb = spark.table("raw_df_b") + rdb = ( + rdb.withColumn("enumdist_tmp", rdb["enumdist"].cast("bigint")) + .drop("enumdist") + .withColumnRenamed("enumdist_tmp", "enumdist") + ) + rdb = ( + rdb.withColumn("serial_tmp", rdb["serial"].cast("bigint")) + .drop("serial") + .withColumnRenamed("serial_tmp", "serial") + ) + rdb = ( + 
rdb.withColumn("pernum_tmp", rdb["pernum"].cast("bigint")) + .drop("pernum") + .withColumnRenamed("pernum_tmp", "pernum") + ) + rdb.write.mode("overwrite").saveAsTable("raw_df_b_tmp") + spark.sql("drop table raw_df_b") + spark.sql("alter table raw_df_b_tmp rename to raw_df_b") + + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + wilson_a_nbrs = sorted( + pdf_a.query("namelast_clean == 'wilson'")["namelast_neighbors"].iloc[0] + ) + wilson_b_nbrs = sorted( + pdf_b.query("namelast_clean == 'wilson'")["namelast_neighbors"].iloc[0] + ) + lord_a_nbrs = sorted( + pdf_a.query("namelast_clean == 'lord'")["namelast_neighbors"].iloc[0] + ) + + assert lord_a_nbrs == ["allen", "dekay", "foster", "graham", "taylor", "thorpe"] + + assert wilson_a_nbrs == [ + "bierhahn", + "chambers", + "cleveland", + "collins", + "flemming", + "graham", + "harvey", + "mclean", + "seward", + "shields", + ] + assert wilson_b_nbrs == [ + "bierhahn", + "cleveland", + "collins", + "dekay", + "flemming", + "graham", + "harvey", + "mclean", + "seward", + "shields", + ] + + +def test_step_1_transform_attach_family_col( + preprocessing, spark, preprocessing_conf_household_data +): + """ Test attach_family_col transform on data containing households """ + preprocessing_conf_household_data["feature_selections"] = [ + { + "output_col": "spouse_namefrst", + "transform": "attach_family_col", + "other_col": "namefrst_clean", + "person_pointer": "sploc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "spouse_bpl", + "transform": "attach_family_col", + "other_col": "bpl", + "person_pointer": "sploc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "father_namefrst", + "transform": "attach_family_col", + "other_col": "namefrst_clean", + "person_pointer": "poploc", + "family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "mother_namefrst", + "transform": "attach_family_col", + "other_col": "namefrst_clean", + "person_pointer": "momloc", + "family_id": "serial", + "person_id": "pernum", + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + pdf_a = ( + spark.table("prepped_df_a") + .toPandas()[ + [ + "serial", + "pernum", + "namefrst_clean", + "namelast_clean", + "spouse_namefrst", + "spouse_bpl", + "father_namefrst", + "mother_namefrst", + ] + ] + .sort_values(["serial", "pernum"]) + ) + pdf_b = ( + spark.table("prepped_df_b") + .toPandas()[ + [ + "serial", + "pernum", + "namefrst_clean", + "namelast_clean", + "spouse_namefrst", + "spouse_bpl", + "father_namefrst", + "mother_namefrst", + ] + ] + .sort_values(["serial", "pernum"]) + ) + + assert ( + pdf_a.query("namefrst_clean == 'jezebel'")["spouse_namefrst"].iloc[0] == "job" + ) + assert ( + pdf_a.query("namefrst_clean == 'jezebel'")["mother_namefrst"].iloc[0] == "eliza" + ) + assert ( + pdf_a.query("namefrst_clean == 'willie may'")["father_namefrst"].iloc[0] + == "wm h" + ) + assert ( + pdf_a.query("namefrst_clean == 'willie may'")["mother_namefrst"].iloc[0] + == "martha" + ) + + assert ( + pdf_b.query("namefrst_clean == 'jezebel'")["spouse_namefrst"].iloc[0] == "job" + ) + assert pdf_b.query("namefrst_clean == 'jezebel'")["spouse_bpl"].iloc[0] == 10 + assert ( + pdf_b.query("namefrst_clean == 'jezebel'")["mother_namefrst"].iloc[0] == "eliza" + ) + assert pd.isnull( + pdf_b.query("namefrst_clean == 'esther'")["spouse_namefrst"].iloc[0] + ) + + +def test_step_1_transform_calc_nativity( + preprocessing, spark, 
preprocessing_conf_19thc_nativity_conf +): + """ Test the nativity calculation feature selections """ + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas().sort_values(["serial", "pernum"]) + + pdf_b = spark.table("prepped_df_b").toPandas().sort_values(["serial", "pernum"]) + + assert set(pdf_a["nativity"].tolist()) == {1, 2, 3, 4, 5} + assert set(pdf_a["test_nativity"].tolist()) == {0} + assert set(pdf_a["mbpl"].tolist()) == {999} + assert set(pdf_a["key_mbpl_range_b"].tolist()) == {0} + assert set(pdf_a["mbpl_range_b"].tolist()) == {0} + assert pdf_a["mother_nativity"].equals(pdf_a["key_mother_nativity"]) + assert pdf_a["key_mbpl"].equals(pdf_a["mbpl_calc"]) + assert pdf_a["key_fbpl"].equals(pdf_a["fbpl_calc"]) + assert pdf_a["mbpl_range"].equals(pdf_a["key_mbpl_range"]) + assert set(pdf_a["key_mbpl_range"].tolist()) == {0, 1, 2} + assert pdf_a["key_nativity_calc"].equals(pdf_a["nativity_calc"]) + + assert set(pdf_b["nativity"].tolist()) == {0} + assert set(pdf_b["test_nativity"].tolist()) == {0, 1, 2, 3, 4, 5} + assert pdf_b["key_nativity_calc"].equals(pdf_b["test_nativity"]) + assert set(pdf_b["key_mbpl_range"].tolist()) == {0} + assert set(pdf_b["key_mother_nativity"].tolist()) == {0} + assert pdf_b["mbpl_range_b"].equals(pdf_b["key_mbpl_range_b"]) + assert set(pdf_b["mother_nativity"].tolist()) == {0} + assert pdf_b["key_mbpl"].equals(pdf_b["mbpl_calc"]) + assert pdf_b["key_fbpl"].equals(pdf_b["fbpl_calc"]) + assert pdf_b["mbpl_range"].equals(pdf_b["mbpl_range_b"]) + assert pdf_b["mbpl_range"].equals(pdf_b["key_mbpl_range_b"]) + assert pdf_b["key_nativity_calc"].equals(pdf_b["nativity_calc"]) + + assert pdf_a["nativity_calc"].tolist() == [ + 0, + 0, + 1, + 0, + 5, + 0, + 2, + 0, + 5, + 3, + 5, + 0, + 2, + 5, + 5, + 0, + 0, + ] + assert pdf_b["nativity_calc"].tolist() == [ + 0, + 0, + 1, + 0, + 5, + 0, + 2, + 0, + 5, + 3, + 5, + 5, + 4, + 5, + 5, + 1, + ] + + +def test_step_1_transform_related_individuals( + preprocessing, spark, preprocessing_conf_household_data +): + """ Test the related_individuals transform """ + preprocessing_conf_household_data["feature_selections"] = [ + { + "output_col": "namefrst_related", + "input_col": "namefrst_clean", + "transform": "related_individuals", + "family_id": "serial", + "relate_col": "relate", + "top_code": 10, + "bottom_code": 3, + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = ( + spark.table("prepped_df_a") + .toPandas()[ + [ + "serial", + "pernum", + "relate", + "namefrst_clean", + "namelast_clean", + "namefrst_related", + ] + ] + .sort_values(["serial", "pernum"]) + ) + pdf_b = ( + spark.table("prepped_df_b") + .toPandas()[ + [ + "serial", + "pernum", + "relate", + "namefrst_clean", + "namelast_clean", + "namefrst_related", + ] + ] + .sort_values(["serial", "pernum"]) + ) + + assert pdf_a.query("namefrst_clean == 'otillia'")["namefrst_related"].iloc[0] == [] + assert pdf_a.query("namefrst_clean == 'j clauson'")["namefrst_related"].iloc[0] == [ + "eugene" + ] + + assert sorted( + pdf_b.query("namefrst_clean == 'job'")["namefrst_related"].iloc[0] + ) == ["eliza", "jo", "mary"] + + +def test_step_1_transform_related_individual_rows( + preprocessing, spark, preprocessing_conf_household_data +): + """ Test the related_individual_rows transform """ + preprocessing_conf_household_data["feature_selections"] = [ + { + "output_col": "spouse_namefrst", + "transform": "attach_family_col", + "other_col": "namefrst_clean", + "person_pointer": "sploc",
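
Conceptually, attach_family_col is a self-join within each household: a row's person pointer (sploc, poploc, or momloc) is matched against another row's pernum under the same serial, and an attribute of the pointed-to person is copied onto the row. A small pandas sketch of that idea with invented data (not hlink's code):

import pandas as pd

people = pd.DataFrame(
    {
        "serial": [1, 1, 1],
        "pernum": [1, 2, 3],
        "sploc": [2, 1, 0],  # pernum of the spouse; 0 means no spouse
        "namefrst_clean": ["job", "jezebel", "esther"],
    }
)
# Join each row's (serial, sploc) against another row's (serial, pernum).
spouses = people[["serial", "pernum", "namefrst_clean"]].rename(
    columns={"pernum": "sploc", "namefrst_clean": "spouse_namefrst"}
)
out = people.merge(spouses, on=["serial", "sploc"], how="left")
assert out.loc[out.namefrst_clean == "jezebel", "spouse_namefrst"].iloc[0] == "job"
assert pd.isnull(out.loc[out.namefrst_clean == "esther", "spouse_namefrst"]).iloc[0]
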
"family_id": "serial", + "person_id": "pernum", + }, + { + "output_col": "namefrst_related_rows", + "input_cols": ["namefrst_clean", "bpl", "sex"], + "transform": "related_individual_rows", + "family_id": "serial", + "filters": [ + {"column": "relate", "max": 10, "min": 3}, + {"column": "pernum", "max": 4, "min": 0, "dataset": "b"}, + ], + }, + { + "output_col": "unrelated_rows", + "input_cols": ["namefrst_clean", "bpl", "sex"], + "transform": "related_individual_rows", + "family_id": "serial", + "filters": [{"column": "relate", "max": 999, "min": 11}], + }, + { + "output_column": "namelast_neighbors", + "input_column": "namelast_clean", + "transform": "neighbor_aggregate", + "neighborhood_column": "enumdist", + "sort_column": "serial", + "range": 5, + }, + ] + + preprocessing.run_step(0) + + rda = spark.table("raw_df_a") + rda = ( + rda.withColumn("enumdist_tmp", rda["enumdist"].cast("bigint")) + .drop("enumdist") + .withColumnRenamed("enumdist_tmp", "enumdist") + ) + rda = ( + rda.withColumn("serial_tmp", rda["serial"].cast("bigint")) + .drop("serial") + .withColumnRenamed("serial_tmp", "serial") + ) + rda = ( + rda.withColumn("pernum_tmp", rda["pernum"].cast("bigint")) + .drop("pernum") + .withColumnRenamed("pernum_tmp", "pernum") + ) + rda = ( + rda.withColumn("bpl_tmp", rda["bpl"].cast("bigint")) + .drop("bpl") + .withColumnRenamed("bpl_tmp", "bpl") + ) + rda = ( + rda.withColumn("sex_tmp", rda["sex"].cast("bigint")) + .drop("sex") + .withColumnRenamed("sex_tmp", "sex") + ) + rda = ( + rda.withColumn("relate_tmp", rda["relate"].cast("bigint")) + .drop("relate") + .withColumnRenamed("relate_tmp", "relate") + ) + + rda.write.mode("overwrite").saveAsTable("raw_df_a_tmp") + spark.sql("drop table raw_df_a") + spark.sql("alter table raw_df_a_tmp rename to raw_df_a") + + rdb = spark.table("raw_df_b") + rdb = ( + rdb.withColumn("enumdist_tmp", rdb["enumdist"].cast("bigint")) + .drop("enumdist") + .withColumnRenamed("enumdist_tmp", "enumdist") + ) + rdb = ( + rdb.withColumn("serial_tmp", rdb["serial"].cast("bigint")) + .drop("serial") + .withColumnRenamed("serial_tmp", "serial") + ) + rdb = ( + rdb.withColumn("pernum_tmp", rdb["pernum"].cast("bigint")) + .drop("pernum") + .withColumnRenamed("pernum_tmp", "pernum") + ) + rdb = ( + rdb.withColumn("bpl_tmp", rdb["bpl"].cast("bigint")) + .drop("bpl") + .withColumnRenamed("bpl_tmp", "bpl") + ) + rdb = ( + rdb.withColumn("sex_tmp", rdb["sex"].cast("bigint")) + .drop("sex") + .withColumnRenamed("sex_tmp", "sex") + ) + rdb = ( + rdb.withColumn("relate_tmp", rdb["relate"].cast("bigint")) + .drop("relate") + .withColumnRenamed("relate_tmp", "relate") + ) + + rdb.write.mode("overwrite").saveAsTable("raw_df_b_tmp") + spark.sql("drop table raw_df_b") + spark.sql("alter table raw_df_b_tmp rename to raw_df_b") + + preprocessing.run_step(1) + select_cols = [ + "serial", + "pernum", + "relate", + "namefrst_clean", + "namelast_clean", + "spouse_namefrst", + "namefrst_related_rows", + "unrelated_rows", + "namelast_neighbors", + ] + + pdf_a = ( + spark.table("prepped_df_a") + .toPandas()[select_cols] + .sort_values(["serial", "pernum"]) + ) + pdf_b = ( + spark.table("prepped_df_b") + .toPandas()[select_cols] + .sort_values(["serial", "pernum"]) + ) + + wilson_a_nbrs = sorted( + pdf_a.query("namelast_clean == 'wilson'")["namelast_neighbors"].iloc[0] + ) + + assert wilson_a_nbrs == [ + "bierhahn", + "chambers", + "cleveland", + "collins", + "flemming", + "graham", + "harvey", + "mclean", + "seward", + "shields", + ] + + assert ( + pdf_a.query("namefrst_clean == 
'otillia'")["namefrst_related_rows"].iloc[0] + == [] + ) + + row_a = pdf_a.query("namefrst_clean == 'j clauson'")["namefrst_related_rows"].iloc[ + 0 + ][0] + assert row_a.namefrst_clean == "eugene" + assert row_a.bpl == 10 + assert row_a.sex == 1 + + assert ( + pdf_a.query("namefrst_clean == 'jezebel'")["spouse_namefrst"].iloc[0] == "job" + ) + + assert ( + len(pdf_b.query("namefrst_clean == 'job'")["namefrst_related_rows"].iloc[0]) + == 2 + ) + assert ( + pdf_a.query("serial == '2485411' and pernum == 1").unrelated_rows.iloc[0][0][0] + == "anne" + ) + assert ( + len( + pdf_a.query( + "serial == '2492741' and pernum == 4" + ).namefrst_related_rows.iloc[0] + ) + == 2 + ) + assert ( + len( + pdf_b.query( + "serial == '2492741' and pernum == 4" + ).namefrst_related_rows.iloc[0] + ) + == 1 + ) + + +def test_step_1_transform_popularity( + preprocessing, spark, preprocessing_conf_popularity +): + """ Test attach related_individuals transform """ + preprocessing_conf_popularity["feature_selections"] = [ + { + "checkpoint": True, + "input_cols": ["namefrst", "namelast", "bpl", "sex"], + "range_col": "birthyr", + "range_val": 3, + "output_col": "ncount", + "transform": "popularity", + }, + { + "checkpoint": True, + "input_cols": ["namefrst", "bpl", "sex", "birthyr"], + "output_col": "fname_pop", + "transform": "popularity", + }, + { + "checkpoint": True, + "output_col": "byr_pop", + "range_col": "birthyr", + "range_val": 3, + "transform": "popularity", + }, + { + "input_col": "ncount", + "output_col": "ncount2", + "transform": "power", + "exponent": 2, + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").orderBy("id").toPandas() + + assert pdf_a.query("id == 0")["ncount"].iloc[0] == 3 + assert pdf_a.query("id == 1")["ncount"].iloc[0] == 1 + assert pdf_a.query("id == 2")["ncount"].iloc[0] == 2 + assert pdf_a.query("id == 3")["ncount"].iloc[0] == 2 + assert pdf_a.query("id == 4")["ncount"].iloc[0] == 1 + assert pdf_a.query("id == 5")["ncount"].iloc[0] == 1 + assert pdf_a.query("id == 6")["ncount"].iloc[0] == 1 + assert pdf_a.query("id == 7")["ncount"].iloc[0] == 1 + + assert pdf_a.query("id == 0")["ncount2"].iloc[0] == 9 + assert pdf_a.query("id == 1")["ncount2"].iloc[0] == 1 + assert pdf_a.query("id == 2")["ncount2"].iloc[0] == 4 + + assert pdf_a.query("id == 0")["fname_pop"].iloc[0] == 2 + assert pdf_a.query("id == 1")["fname_pop"].iloc[0] == 1 + assert pdf_a.query("id == 2")["fname_pop"].iloc[0] == 1 + assert pdf_a.query("id == 3")["fname_pop"].iloc[0] == 1 + assert pdf_a.query("id == 4")["fname_pop"].iloc[0] == 2 + assert pdf_a.query("id == 5")["fname_pop"].iloc[0] == 1 + assert pdf_a.query("id == 6")["fname_pop"].iloc[0] == 1 + assert pdf_a.query("id == 7")["fname_pop"].iloc[0] == 1 + + assert pdf_a.query("id == 0")["byr_pop"].iloc[0] == 7 + assert pdf_a.query("id == 1")["byr_pop"].iloc[0] == 1 + assert pdf_a.query("id == 2")["byr_pop"].iloc[0] == 4 + assert pdf_a.query("id == 3")["byr_pop"].iloc[0] == 6 + + +def test_step_1_transforms_adv_calc(preprocessing, spark, preprocessing_conf): + """ Test more column mapping with transforms """ + preprocessing_conf["column_mappings"] = [ + { + "column_name": "serialp", + "alias": "serialp_add_a", + "transforms": [{"type": "concat_to_a", "value": "_a"}], + }, + { + "column_name": "bpl", + "alias": "bpl_add_b", + "transforms": [{"type": "concat_to_b", "value": 1}], + }, + { + "alias": "bpl", + "column_name": "bpl", + }, + { + "alias": "sex", + "column_name": "sex", + }, + { + "alias": "concat_bpl_sex", 
+ "column_name": "bpl", + "transforms": [{"type": "concat_two_cols", "column_to_append": "sex"}], + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 10")["serialp_add_a"].iloc[0] == "A_a" + assert pdf_b.query("id == 10")["serialp_add_a"].iloc[0] == "C" + assert pdf_a.query("id == 20")["bpl_add_b"].iloc[0] == 200 + assert pdf_b.query("id == 30")["bpl_add_b"].iloc[0] == "5001" + assert pdf_a.query("id == 10")["concat_bpl_sex"].iloc[0] == "1001" + assert pdf_b.query("id == 50")["concat_bpl_sex"].iloc[0] == "7002" + + +def test_step_1_transforms_expand(preprocessing, spark, preprocessing_conf): + """ Test transform expand """ + preprocessing_conf["column_mappings"] = [ + {"column_name": "age"}, + { + "column_name": "age", + "alias": "age_expand_3", + "transforms": [{"type": "expand", "value": 3}], + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("id == 10")["age_expand_3"].iloc[0] == [ + 20, + 21, + 22, + 23, + 24, + 25, + 26, + ] + + +def test_step_1_override(preprocessing, spark, preprocessing_conf): + """ Test column override """ + preprocessing_conf["column_mappings"] = [ + { + "column_name": "serialp", + "override_column_a": "serialp", + "override_transforms": [{"type": "lowercase_strip"}], + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 30")["serialp"].iloc[0] == "b" + assert pdf_b.query("id == 50")["serialp"].iloc[0] == "E" + + +def test_step_1_set_values_a_explicitly(preprocessing, spark, preprocessing_conf): + """ Test setting a column value explicitly """ + preprocessing_conf["column_mappings"] = [ + {"column_name": "serialp", "alias": "serialp", "set_value_column_a": "c"} + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 30")["serialp"].iloc[0] == "c" + assert pdf_b.query("id == 50")["serialp"].iloc[0] == "E" + + +def test_step_1_set_values_b_explicitly(preprocessing, spark, preprocessing_conf): + """ Test setting b column value explicitly """ + preprocessing_conf["column_mappings"] = [ + {"column_name": "serialp", "alias": "serialp", "set_value_column_b": "a"} + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == 30")["serialp"].iloc[0] == "B" + assert pdf_b.query("id == 50")["serialp"].iloc[0] == "a" + + +def test_step_1_substitution( + preprocessing, + spark, + preprocessing_conf_simple_names, + substitutions_womens_names_path, + substitutions_mens_names_path, +): + """ Test text substitution """ + preprocessing_conf_simple_names["column_mappings"] = [ + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [{"type": "lowercase_strip"}], + }, + {"column_name": "namefrst"}, + {"column_name": "sex"}, + ] + + preprocessing_conf_simple_names["substitution_columns"] = [ + { + "column_name": "namefrst_std", + "substitutions": [ + { + "join_column": "sex", + "join_value": "1", + "substitution_file": substitutions_mens_names_path, + }, + { + "join_column": "sex", + "join_value": "2", + "substitution_file": 
substitutions_womens_names_path, + }, + ], + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == '10ah'")["namefrst_std"].iloc[0] == "cat" + assert pdf_a.query("id == '20bc'")["namefrst_std"].iloc[0] == "bernard" + assert pdf_a.query("id == '34hi'")["namefrst_std"].iloc[0] == "catherine" + assert pdf_a.query("id == '54de'")["namefrst_std"].iloc[0] == "kat" + assert pdf_b.query("id == 'c23'")["namefrst_std"].iloc[0] == "bernard" + assert pdf_b.query("id == 'd45'")["namefrst_std"].iloc[0] == "catherine" + assert pdf_b.query("id == 'e77'")["namefrst_std"].iloc[0] == "bernard" + + +def test_step_1_street_abbrev_substitution( + preprocessing, + spark, + preprocessing_conf_street_names, + substitutions_street_abbrevs_path, +): + """ Test text substitution """ + preprocessing_conf_street_names["id_column"] = "histid" + preprocessing_conf_street_names["column_mappings"] = [ + { + "column_name": "street", + "alias": "street_unstd", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + {"type": "condense_strip_whitespace"}, + ], + }, + { + "column_name": "street_unstd", + "alias": "street_swapped", + "transforms": [ + { + "type": "swap_words", + "values": {"bch": "beach", "ctr": "center", "rd": "road"}, + } + ], + }, + ] + + preprocessing_conf_street_names["substitution_columns"] = [ + { + "column_name": "street_unstd", + "substitutions": [ + { + "substitution_file": substitutions_street_abbrevs_path, + "regex_word_replace": True, + } + ], + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("histid == 'a01'")["street_unstd"].iloc[0] == "turnpike 35" + assert ( + pdf_a.query("histid == 'b02'")["street_unstd"].iloc[0] == "4th terrace avenue" + ) + assert pdf_a.query("histid == 'c03'")["street_unstd"].iloc[0] == "4th state" + assert pdf_a.query("histid == 'd04'")["street_unstd"].iloc[0] == "old boulevard" + assert pdf_a.query("histid == 'e05'")["street_unstd"].iloc[0] == "old motorway" + assert pdf_a.query("histid == 'f06'")["street_unstd"].iloc[0] == "miami bch road" + assert pdf_a.query("histid == 'g07'")["street_unstd"].iloc[0] == "center street" + assert pdf_a.query("histid == 'g08'")["street_unstd"].iloc[0] == "ctr street" + assert pdf_a.query("histid == 'i09'")["street_unstd"].iloc[0] == "strstreet" + + assert ( + pdf_a.query("histid == 'f06'")["street_swapped"].iloc[0] == "miami beach road" + ) + assert pdf_a.query("histid == 'g08'")["street_swapped"].iloc[0] == "center street" + + +def test_step_1_street_remove_stop_words( + preprocessing, + spark, + preprocessing_conf_street_names, + substitutions_street_abbrevs_path, +): + """ Test text substitution """ + preprocessing_conf_street_names["id_column"] = "histid" + preprocessing_conf_street_names["column_mappings"] = [ + {"column_name": "street", "alias": "street_orig"}, + { + "column_name": "street", + "alias": "street_unstd", + "transforms": [ + {"type": "lowercase_strip"}, + {"type": "remove_qmark_hyphen"}, + {"type": "replace_apostrophe"}, + {"type": "condense_strip_whitespace"}, + ], + }, + { + "column_name": "street_unstd", + "alias": "street_removed", + "transforms": [ + { + "type": "remove_stop_words", + "values": [ + "avn", + "blvd", + "rd", + "road", + "street", + "str", + "ter", + "trnpk", + ], + }, + {"type": "condense_strip_whitespace"}, + ], 
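
The regex_word_replace option used with the street-abbreviation file implies whole-word replacement, which is why the tests expect "strstreet" to survive unchanged while "rd" becomes "road". A minimal sketch of word-boundary substitution (the substitution table here is invented for illustration):

import re

def regex_word_replace(text, subs):
    # Replace abbreviations only when they appear as whole words.
    for abbrev, full in subs.items():
        text = re.sub(rf"\b{re.escape(abbrev)}\b", full, text)
    return text

subs = {"rd": "road", "str": "street"}
assert regex_word_replace("miami bch rd", subs) == "miami bch road"
assert regex_word_replace("strstreet", subs) == "strstreet"
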
+ }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("histid == 'a01'")["street_removed"].iloc[0] == "35" + assert pdf_a.query("histid == 'b02'")["street_removed"].iloc[0] == "4th" + assert pdf_a.query("histid == 'c03'")["street_removed"].iloc[0] == "4th state" + assert pdf_a.query("histid == 'd04'")["street_removed"].iloc[0] == "old" + assert pdf_a.query("histid == 'e05'")["street_removed"].iloc[0] == "old motorway" + assert pdf_a.query("histid == 'f06'")["street_removed"].iloc[0] == "miami bch" + assert pdf_a.query("histid == 'g07'")["street_removed"].iloc[0] == "centre" + assert pdf_a.query("histid == 'g08'")["street_removed"].iloc[0] == "ctr" + assert pdf_a.query("histid == 'i09'")["street_removed"].iloc[0] == "strstreet" + + +def test_step_1_divide_by_int_mapping_birthyr( + preprocessing, spark, preprocessing_conf_birthyr +): + """ Test the divide_by_int, get_floor, and mapping transforms """ + preprocessing_conf_birthyr["id_column"] = "histid" + preprocessing_conf_birthyr["column_mappings"] = [ + {"column_name": "yearp", "alias": "year"}, + {"column_name": "age"}, + {"column_name": "birthyr", "alias": "raw_birthyr"}, + { + "column_name": "birthyr", + "transforms": [{"type": "mapping", "mappings": {1999: ""}}], + }, + {"column_name": "bpl", "alias": "raw_bpl"}, + { + "column_name": "bpl", + "transforms": [ + {"type": "divide_by_int", "value": 100}, + {"type": "get_floor"}, + ], + }, + ] + preprocessing_conf_birthyr["feature_selections"] = [ + { + "input_column": "birthyr", + "output_column": "birthyr_filled", + "condition": "case when birthyr is null or birthyr == '' then year - age else birthyr end", + "transform": "sql_condition", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("histid == 'a01'")["raw_birthyr"].iloc[0] == 1999 + assert pdf_a.query("histid == 'a01'")["birthyr"].iloc[0] == "" + assert pdf_a.query("histid == 'a01'")["bpl"].iloc[0] == 33 + assert pdf_a.query("histid == 'b00'")["raw_bpl"].iloc[0] == 4799 + assert pdf_a.query("histid == 'b00'")["bpl"].iloc[0] == 47 + assert pdf_a.query("histid == 'b02'")["bpl"].iloc[0] == 30 + assert pdf_a.query("histid == 'd04'")["bpl"].iloc[0] == 1 + assert pdf_a.query("histid == 'a01'")["birthyr_filled"].iloc[0] == "1864" + assert pdf_a.query("histid == 'b02'")["birthyr_filled"].iloc[0] == "1858" + assert pd.isnull(pdf_a.query("histid == 'b00'")["birthyr_filled"].iloc[0]) + assert pdf_a.query("histid == 'c03'")["birthyr_filled"].iloc[0] == "1901" + assert pdf_a.query("histid == 'd04'")["birthyr_filled"].iloc[0] == "1850" + + +def test_step_1_fix_bpl(preprocessing, spark, preprocessing_conf_birthyr): + """ Test cleaning bpl with sql_condition feature selections """ + preprocessing_conf_birthyr["id_column"] = "histid" + preprocessing_conf_birthyr["column_mappings"] = [ + {"column_name": "state1"}, + {"column_name": "state2"}, + {"column_name": "bpl", "alias": "bpl_orig"}, + { + "column_name": "bpl", + "alias": "bpl_state_orig", + "transforms": [ + {"type": "divide_by_int", "value": 100}, + {"type": "get_floor"}, + ], + }, + ] + preprocessing_conf_birthyr["feature_selections"] = [ + { + "input_column": "bpl_orig", + "output_column": "clean_bpl", + "condition": """case + when state1 == "washington" and state2=="washington" + then 5300 + when (state1 is null or state1 == '') and state2=="washington" + then 5300 + when state1 == "washington" and (state2=='' or state2 is null) + then 5300 + else bpl_orig + end""", + "transform": "sql_condition", + }, + { + "input_column": "bpl_state_orig", + "output_column": "bpl_state", + "condition": """case + when state1 == "washington" and state2=="washington" + then 53 + when (state1 is null or state1 == '') and state2=="washington" + then 53 + when state1 == "washington" and (state2=='' or state2 is null) + then 53 + else bpl_state_orig + end""", + "transform": "sql_condition", + }, + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + + assert pdf_a.query("histid == 'a01'")["clean_bpl"].iloc[0] == 5300 + assert pdf_a.query("histid == 'b02'")["clean_bpl"].iloc[0] == 5300 + assert pdf_a.query("histid == 'b00'")["clean_bpl"].iloc[0] == 4799 + assert pdf_a.query("histid == 'c03'")["clean_bpl"].iloc[0] == 5300 + assert pdf_a.query("histid == 'd04'")["clean_bpl"].iloc[0] == 100 + assert pdf_a.query("histid == 'a01'")["bpl_state"].iloc[0] == 53 + assert pdf_a.query("histid == 'b02'")["bpl_state"].iloc[0] == 53 + assert pdf_a.query("histid == 'b00'")["bpl_state"].iloc[0] == 47 + assert pdf_a.query("histid == 'c03'")["bpl_state"].iloc[0] == 53 + assert pdf_a.query("histid == 'd04'")["bpl_state"].iloc[0] == 1 + + +def test_step_1_bigrams(preprocessing, spark, preprocessing_conf_simple_names): + """ Test the bigrams transform """ + preprocessing_conf_simple_names["column_mappings"] = [ + {"column_name": "namefrst"}, + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [{"type": "lowercase_strip"}], + }, + ] + + preprocessing_conf_simple_names["feature_selections"] = [ + { + "input_column": "namefrst_std", + "output_column": "bigrams_namefrst", + "transform": "bigrams", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == '10ah'")["bigrams_namefrst"].iloc[0] == [ + " c", + "a t", + "c a", + ] + assert pdf_b.query("id == 'd45'")["bigrams_namefrst"].iloc[0] == [ + " k", + "a t", + "i e", + "k a", + "t i", + ] + + +def test_step_1_bigrams_no_space(preprocessing, spark, preprocessing_conf_simple_names): + """ Test the bigrams transform with the no_first_pad option """ + preprocessing_conf_simple_names["column_mappings"] = [ + {"column_name": "namefrst"}, + { + "column_name": "namefrst", + "alias": "namefrst_std", + "transforms": [{"type": "lowercase_strip"}], + }, + ] + + preprocessing_conf_simple_names["feature_selections"] = [ + { + "input_column": "namefrst_std", + "output_column": "bigrams_namefrst", + "no_first_pad": True, + "transform": "bigrams", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id == '10ah'")["bigrams_namefrst"].iloc[0] == ["a t", "c a"] + assert pdf_b.query("id == 'd45'")["bigrams_namefrst"].iloc[0] == [ + "a t", + "i e", + "k a", + "t i", + ] + + +def test_step_1_array(preprocessing, spark, preprocessing_conf_simple_names): + """ Test array transform """ + preprocessing_conf_simple_names["column_mappings"] = [ + {"column_name": "namefrst"}, + {"column_name": "sex"}, + ] + + preprocessing_conf_simple_names["feature_selections"] = [ + { + "input_columns": ["namefrst", "sex"], + "output_column": "namefrst_sex_array", + "transform": "array", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert pdf_a.query("id ==
'54de'")["namefrst_sex_array"].iloc[0] == ["Kat", "1"] + assert pdf_b.query("id == 'e77'")["namefrst_sex_array"].iloc[0] == ["Bernard", "1"] + + +def test_step_1_union( + preprocessing, spark, preprocessing_conf_synthetic_household_data +): + """ Test union transform""" + preprocessing_conf_synthetic_household_data["column_mappings"] = [ + {"column_name": "namefrst"}, + {"column_name": "namelast"}, + {"column_name": "neighbors"}, + {"column_name": "nonfamily_household"}, + ] + + preprocessing_conf_synthetic_household_data["feature_selections"] = [ + { + "input_columns": ["neighbors", "nonfamily_household"], + "output_column": "names_union", + "transform": "union", + } + ] + + preprocessing.run_step(0) + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert sorted(pdf_a.query("namefrst == 'jane'")["names_union"].iloc[0]) == [ + "edie", + "elmer", + "gerald", + ] + assert sorted(pdf_a.query("namefrst == 'janice'")["names_union"].iloc[0]) == [ + "edie" + ] + assert sorted(pdf_b.query("namefrst == 'gary'")["names_union"].iloc[0]) == [ + "colleen" + ] + + +def test_rel_rows_real_data(spark, preprocessing, preprocessing_conf_rel_rows): + """ Test related_rows data and double threshold comparison blocking criteria """ + + preprocessing.run_step(0) + + rda = spark.table("raw_df_a") + rda = ( + rda.withColumn("serialp_tmp", rda["serialp"].cast("bigint")) + .drop("serialp") + .withColumnRenamed("serialp_tmp", "serialp") + ) + rda = ( + rda.withColumn("relate_tmp", rda["relate"].cast("bigint")) + .drop("relate") + .withColumnRenamed("relate_tmp", "relate") + ) + rda = ( + rda.withColumn("sex_tmp", rda["sex"].cast("bigint")) + .drop("sex") + .withColumnRenamed("sex_tmp", "sex") + ) + rda = ( + rda.withColumn("age_tmp", rda["age"].cast("bigint")) + .drop("age") + .withColumnRenamed("age_tmp", "age") + ) + + rda.write.mode("overwrite").saveAsTable("raw_df_a_tmp") + spark.sql("drop table raw_df_a") + spark.sql("alter table raw_df_a_tmp rename to raw_df_a") + + rdb = spark.table("raw_df_b") + rdb = ( + rdb.withColumn("serialp_tmp", rdb["serialp"].cast("bigint")) + .drop("serialp") + .withColumnRenamed("serialp_tmp", "serialp") + ) + rdb = ( + rdb.withColumn("relate_tmp", rdb["relate"].cast("bigint")) + .drop("relate") + .withColumnRenamed("relate_tmp", "relate") + ) + rdb = ( + rdb.withColumn("sex_tmp", rdb["sex"].cast("bigint")) + .drop("sex") + .withColumnRenamed("sex_tmp", "sex") + ) + rdb = ( + rdb.withColumn("age_tmp", rdb["age"].cast("bigint")) + .drop("age") + .withColumnRenamed("age_tmp", "age") + ) + + rdb.write.mode("overwrite").saveAsTable("raw_df_b_tmp") + spark.sql("drop table raw_df_b") + spark.sql("alter table raw_df_b_tmp rename to raw_df_b") + + preprocessing.run_step(1) + + pdf_a = spark.table("prepped_df_a").toPandas() + pdf_b = spark.table("prepped_df_b").toPandas() + + assert ( + len( + pdf_a.query("histid == 'D1DAEB8F-66F0-435C-8E45-F004D967549D'")[ + "namefrst_related_rows" + ].iloc[0] + ) + == 3 + ) + assert ( + len( + pdf_a.query("histid == 'D1DAEB8F-66F0-435C-8E45-F004D967549D'")[ + "namefrst_related_rows_age_min_5" + ].iloc[0] + ) + == 1 + ) + assert ( + len( + pdf_a.query("histid == 'D1DAEB8F-66F0-435C-8E45-F004D967549D'")[ + "namefrst_related_rows_age_b_min_5" + ].iloc[0] + ) + == 3 + ) + + assert ( + len( + pdf_b.query("histid == 'B04F6A33-9A86-4EAF-884B-0BD6107CCDEB'")[ + "namefrst_related_rows" + ].iloc[0] + ) + == 7 + ) + assert ( + len( + pdf_b.query("histid == 
'B04F6A33-9A86-4EAF-884B-0BD6107CCDEB'")[ + "namefrst_related_rows_age_min_5" + ].iloc[0] + ) + == 6 + ) + assert ( + len( + pdf_b.query("histid == 'B04F6A33-9A86-4EAF-884B-0BD6107CCDEB'")[ + "namefrst_related_rows_age_b_min_5" + ].iloc[0] + ) + == 6 + ) + + assert ( + len( + pdf_a.query("histid == '8B0A8FA5-A260-4841-95D0-2C45689485C8'")[ + "namefrst_related_rows" + ].iloc[0] + ) + == 5 + ) + assert ( + len( + pdf_a.query("histid == '8B0A8FA5-A260-4841-95D0-2C45689485C8'")[ + "namefrst_related_rows_age_min_5" + ].iloc[0] + ) + == 2 + ) + assert ( + len( + pdf_a.query("histid == '8B0A8FA5-A260-4841-95D0-2C45689485C8'")[ + "namefrst_related_rows_age_b_min_5" + ].iloc[0] + ) + == 5 + ) + + assert ( + len( + pdf_b.query("histid == 'F7E0450D-ECCC-4338-92B0-ACB4F9D40D8F'")[ + "namefrst_related_rows" + ].iloc[0] + ) + == 7 + ) + assert ( + len( + pdf_b.query("histid == 'F7E0450D-ECCC-4338-92B0-ACB4F9D40D8F'")[ + "namefrst_related_rows_age_min_5" + ].iloc[0] + ) + == 7 + ) + assert ( + len( + pdf_b.query("histid == 'F7E0450D-ECCC-4338-92B0-ACB4F9D40D8F'")[ + "namefrst_related_rows_age_b_min_5" + ].iloc[0] + ) + == 7 + ) diff --git a/hlink/tests/reporting_test.py b/hlink/tests/reporting_test.py new file mode 100644 index 0000000..4103e1a --- /dev/null +++ b/hlink/tests/reporting_test.py @@ -0,0 +1,119 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +import pandas as pd +import pyspark.sql.functions as f +import os + + +def test_report_r2_percent_linked(reporting, spark, reporting_test_data_r2_pct): + + pdfa_path, pm_path, hhpm_path = reporting_test_data_r2_pct + + reporting.spark.read.csv(pdfa_path, header=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + reporting.spark.read.csv(pm_path, header=True).write.mode("overwrite").saveAsTable( + "predicted_matches" + ) + reporting.spark.read.csv(hhpm_path, header=True).write.mode( + "overwrite" + ).saveAsTable("hh_predicted_matches") + + reporting.run_step(0) + + linked_rnds = reporting.spark.table("linked_rounds") + calc_lr = linked_rnds.select("linked_round").rdd.flatMap(lambda x: x).collect() + coded_lr = list( + map( + int, + reporting.spark.table("prepped_df_a") + .select("linked_round_hardcoded") + .rdd.flatMap(lambda x: x) + .collect(), + ) + ) + + assert calc_lr == coded_lr + + counted_links = reporting.spark.table("counted_links") + assert ( + counted_links.select(f.mean("R1_pct")).rdd.flatMap(lambda x: x).collect()[0] + == 0.425 + ) + assert ( + round( + counted_links.select(f.mean("R2_pct")) + .rdd.flatMap(lambda x: x) + .collect()[0], + 5, + ) + == 0.39583 + ) + + +def test_report_representivity( + reporting, spark, reporting_test_data_representivity, integration_conf +): + + rdf_path, pdf_path, pm_path, hhpm_path = reporting_test_data_representivity + + reporting.spark.read.csv(rdf_path, header=True).write.mode("overwrite").saveAsTable( + "raw_df_a" + ) + reporting.spark.read.csv(rdf_path, header=True).write.mode("overwrite").saveAsTable( + "raw_df_b" + ) + reporting.spark.read.csv(pdf_path, header=True).write.mode("overwrite").saveAsTable( + "prepped_df_a" + ) + reporting.spark.read.csv(pdf_path, header=True).write.mode("overwrite").saveAsTable( + "prepped_df_b" + ) + reporting.spark.read.csv(pm_path, header=True).write.mode("overwrite").saveAsTable( + "predicted_matches" + ) + reporting.spark.read.csv(hhpm_path, header=True).write.mode( + "overwrite" + 
).saveAsTable("hh_predicted_matches") + + reporting.link_run.config = integration_conf + reporting.run_step(1) + + sda = reporting.spark.table("source_data_a") + assert all( + elem + in [ + "histid", + "serialp", + "sex", + "age", + "marst", + "durmarr", + "statefip", + "sei", + "linked_round_all", + "race_div_100", + "relate_div_100", + "region", + "bpl_clean", + "namefrst_unstd", + "namefrst_std", + "namelast_clean", + "region_of_residence", + ] + for elem in list(sda.columns) + ) + + fdir = os.path.dirname(__file__) + df = pd.read_csv( + os.path.join(fdir, "../../output_data/reports/representivity.csv"), + index_col=["feature", "values"], + ) + df_expected = pd.read_csv( + os.path.join(fdir, "input_data/representivity.csv"), + index_col=["feature", "values"], + ) + assert df.equals(df_expected) diff --git a/hlink/tests/table_test.py b/hlink/tests/table_test.py new file mode 100644 index 0000000..89e2924 --- /dev/null +++ b/hlink/tests/table_test.py @@ -0,0 +1,65 @@ +import pytest +from hlink.linking.table import Table +from pyspark.sql.types import StructType, StructField, StringType + + +@pytest.fixture() +def simple_schema(): + return StructType([StructField("test", StringType())]) + + +@pytest.mark.parametrize("table_name", ["this_table_does_not_exist", "@@@", "LOL rofl"]) +def test_exists_table_does_not_exist(spark, table_name): + t = Table(spark, table_name, "table used for testing") + assert not t.exists() + + +@pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"]) +def test_exists_table_does_exist(spark, table_name, simple_schema): + t = Table(spark, table_name, "table used for testing") + spark.catalog.createTable(table_name, schema=simple_schema) + print([table.name for table in spark.catalog.listTables()]) + assert t.exists() + spark.sql(f"DROP TABLE {table_name}") + + +@pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"]) +def test_drop_table_does_exist(spark, table_name, simple_schema): + t = Table(spark, table_name, "table used for testing") + spark.catalog.createTable(table_name, schema=simple_schema) + assert t.exists() + t.drop() + assert not t.exists() + + +@pytest.mark.parametrize("table_name", ["this_table_does_not_exist", "@@@", "LOL rofl"]) +def test_drop_table_does_not_exist(spark, table_name): + # Check that dropping a table that doesn't exist doesn't throw errors + # or somehow create the table. 
+ t = Table(spark, table_name, "table used for testing") + assert not t.exists() + t.drop() + assert not t.exists() + + +@pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"]) +def test_df_table_does_exist(spark, table_name, simple_schema): + t = Table(spark, table_name, "table used for testing") + spark.catalog.createTable(table_name, schema=simple_schema) + assert t.exists() + assert t.df() is not None + spark.sql(f"DROP TABLE {table_name}") + + +@pytest.mark.parametrize("table_name", ["this_table_does_not_exist", "@@@", "LOL rofl"]) +def test_df_table_does_not_exist(spark, table_name): + t = Table(spark, table_name, "table used for testing") + assert t.df() is None + + +@pytest.mark.parametrize( + "table_name", ["table_for_testing_Table_class", "camelCaseTable", "@@@", "LOL rofl"] +) +def test_name_is_unchanged(spark, table_name): + t = Table(spark, table_name, "table used for testing") + assert t.name == table_name diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py new file mode 100644 index 0000000..bf85b17 --- /dev/null +++ b/hlink/tests/training_test.py @@ -0,0 +1,236 @@ +# This file is part of the ISRDI's hlink. +# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from pyspark.ml import Pipeline +import hlink.linking.core.pipeline as pipeline_core + + +def test_all_steps( + spark, + training_conf, + training, + state_dist_path, + training_data_path, + potential_matches_path, + spark_test_tmp_dir_path, + matching, + training_validation_path, +): + """ Test running the chosen model on potential matches dataset """ + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "column_name": "bpl", + "key_count": 1, + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + { + "alias": "exact", + "column_names": ["namelast", "namelast"], + "comparison_type": "all_equals", + }, + { + "alias": "exact_all", + "column_names": ["namelast", "bpl", "sex"], + "comparison_type": "all_equals", + }, + ] + + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + "exact_mult", + "exact_all_mult", + "hits", + "namelast_jw_buckets", + ] + training_conf["pipeline_features"] = [ + { + "input_column": "namelast_jw", + "output_column": "namelast_jw_buckets", + "transformer_type": "bucketizer", + "categorical": True, + "splits": [0, 0.25, 0.5, 0.75, 0.99, 1], + } + ] + training_conf["training"]["chosen_model"] = { + "type": "random_forest", + "maxDepth": 6, + "numTrees": 100, + "featureSubsetStrategy": "sqrt", + } + + # training_conf["training"]["use_potential_matches_features"] = True + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + + training.link_run.trained_models["trained_model"] = None + + training.run_step(0) + + training.run_step(1) + tf = spark.table("training_features").toPandas() + assert tf.query("id_a == 20 and id_b == 30")["exact"].iloc[0] + 
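
The chosen_model block appears to map every key except "type" directly onto the corresponding Spark ML estimator parameter; a sketch of that correspondence (my reading of the config keys, not hlink's actual wiring):

from pyspark.ml.classification import RandomForestClassifier

chosen_model = {
    "type": "random_forest",
    "maxDepth": 6,
    "numTrees": 100,
    "featureSubsetStrategy": "sqrt",
}
# Strip the "type" selector and pass the rest through as estimator params.
params = {k: v for k, v in chosen_model.items() if k != "type"}
rf = RandomForestClassifier(
    featuresCol="features_vector", labelCol="match", **params
)
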
assert not tf.query("id_a == 20 and id_b == 50")["exact"].iloc[0] + assert not tf.query("id_a == 20 and id_b == 30")["exact_mult"].iloc[0] + assert not tf.query("id_a == 20 and id_b == 10")["exact_mult"].iloc[0] + + training.run_step(2) + + # m = PipelineModel.load(spark_test_tmp_dir_path + "/chosen_model") + p = training.link_run.trained_models["pre_pipeline"] + m = training.link_run.trained_models["trained_model"] + transformed_df = m.transform( + p.transform(spark.table("training_features")) + ).toPandas() + row = transformed_df.query("id_a == 10 and id_b == 50").iloc[0] + assert row.prediction == 0 + assert row.state_distance_imp.round(0) == 1909 + + +def test_step_2_bucketizer(spark, main, conf): + """ Test a bucketized feature using spark pipeline function """ + data = [ + (0.0, 0, 0), + (3.0, 1, 1), + (5.0, 2, 0), + (6.0, 3, 1), + (9.0, 4, 0), + (10.0, 5, 1), + (11.0, 6, 0), + (23.0, 7, 1), + ] + dataFrame = spark.createDataFrame(data, ["immyear_abs_diff", "test_id", "match"]) + dataFrame.createOrReplaceTempView("training_features") + + conf["pipeline_features"] = [ + { + "input_column": "immyear_abs_diff", + "output_column": "immyear_caution", + "transformer_type": "bucketizer", + "categorical": True, + "splits": [0, 6, 11, 999], + } + ] + conf["training"] = { + "dependent_var": "match", + "independent_vars": ["immyear_abs_diff", "immyear_caution"], + } + conf["comparison_features"] = [] + + ind_vars = conf["training"]["independent_vars"] + tf = spark.table("training_features") + pipeline_stages = pipeline_core.generate_pipeline_stages( + conf, ind_vars, tf, "training" + ) + prep_pipeline = Pipeline(stages=pipeline_stages) + prep_model = prep_pipeline.fit(tf) + prepped_data = prep_model.transform(tf) + prepped_data = prepped_data.toPandas() + + assert prepped_data.shape == (8, 7) + assert list(prepped_data.query("test_id == 0")["features_vector"].iloc[0]) == [ + 0, + 1, + 0, + 0, + 0, + ] + assert list(prepped_data.query("test_id == 1")["features_vector"].iloc[0]) == [ + 3, + 1, + 0, + 0, + 0, + ] + assert list(prepped_data.query("test_id == 3")["features_vector"].iloc[0]) == [ + 6, + 0, + 1, + 0, + 0, + ] + assert list(prepped_data.query("test_id == 6")["features_vector"].iloc[0]) == [ + 11, + 0, + 0, + 1, + 0, + ] + + main.do_drop_all("") + + +def test_step_2_interaction(spark, main, conf): + """ Test interacting two and three features using spark pipeline function """ + data = [ + (0.0, 0.0, 0.0), + (3.0, 1.0, 1.0), + (5.0, 2.0, 0.0), + (6.0, 3.0, 1.0), + (9.0, 4.0, 0.0), + (10.0, 5.0, 1.0), + (11.0, 6.0, 0.0), + (23.0, 7.0, 1.0), + ] + dataFrame = spark.createDataFrame(data, ["var0", "var1", "var2"]) + dataFrame.createOrReplaceTempView("training_features") + + conf["pipeline_features"] = [ + { + "input_columns": ["var0", "var1"], + "output_column": "interacted_vars01", + "transformer_type": "interaction", + }, + { + "input_columns": ["var0", "var1", "var2"], + "output_column": "interacted_vars012", + "transformer_type": "interaction", + }, + ] + + conf["training"] = { + "dependent_var": "var2", + "independent_vars": ["interacted_vars01", "interacted_vars012"], + } + conf["comparison_features"] = [] + + ind_vars = conf["training"]["independent_vars"] + tf = spark.table("training_features") + pipeline_stages = pipeline_core.generate_pipeline_stages( + conf, ind_vars, tf, "training" + ) + prep_pipeline = Pipeline(stages=pipeline_stages) + prep_model = prep_pipeline.fit(tf) + prepped_data = prep_model.transform(tf) + prepped_data = prepped_data.toPandas() + + assert 
prepped_data.shape == (8, 8) + assert prepped_data.query("var1 == 0")["interacted_vars01"].iloc[0][0] == 0 + assert prepped_data.query("var1 == 2")["interacted_vars01"].iloc[0][0] == 10 + assert prepped_data.query("var1 == 2")["interacted_vars012"].iloc[0][0] == 0 + assert prepped_data.query("var1 == 3")["interacted_vars01"].iloc[0][0] == 18 + assert prepped_data.query("var1 == 3")["interacted_vars012"].iloc[0][0] == 18 + + main.do_drop_all("") diff --git a/hlink/tests/validation_data/crosswalks/crosswalk.csv b/hlink/tests/validation_data/crosswalks/crosswalk.csv new file mode 100644 index 0000000..8077466 --- /dev/null +++ b/hlink/tests/validation_data/crosswalks/crosswalk.csv @@ -0,0 +1,7 @@ +"histid_b","histid_a","age_a","age_b" +50,0,54,64 +51,3,80,65 +52,4,25,25 +53,8,16,90 +59,11,45,23 +61,12,44,55 diff --git a/hlink/tests/validation_data/crosswalks/crosswalk_with_round.csv b/hlink/tests/validation_data/crosswalks/crosswalk_with_round.csv new file mode 100755 index 0000000..6b11026 --- /dev/null +++ b/hlink/tests/validation_data/crosswalks/crosswalk_with_round.csv @@ -0,0 +1,7 @@ +"histid_b","histid_a","round","age_a","age_b" +50,0,1,54,64 +51,3,1,80,65 +52,4,1,25,25 +53,8,1,16,90 +59,11,1,45,23 +61,12,2,44,55 diff --git a/hlink/tests/validation_data/training_p_all.parquet/._SUCCESS.crc b/hlink/tests/validation_data/training_p_all.parquet/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/hlink/tests/validation_data/training_p_all.parquet/._SUCCESS.crc differ diff --git a/hlink/tests/validation_data/training_p_all.parquet/.part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet.crc b/hlink/tests/validation_data/training_p_all.parquet/.part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet.crc new file mode 100644 index 0000000..2043263 Binary files /dev/null and b/hlink/tests/validation_data/training_p_all.parquet/.part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet.crc differ diff --git a/hlink/tests/validation_data/training_p_all.parquet/_SUCCESS b/hlink/tests/validation_data/training_p_all.parquet/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/hlink/tests/validation_data/training_p_all.parquet/part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet b/hlink/tests/validation_data/training_p_all.parquet/part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet new file mode 100644 index 0000000..faf1f8e Binary files /dev/null and b/hlink/tests/validation_data/training_p_all.parquet/part-00000-6c9e5760-174a-4652-b687-6823087ed5bf-c000.snappy.parquet differ diff --git a/hlink_config/config/test_conf_flag_run.json b/hlink_config/config/test_conf_flag_run.json new file mode 100644 index 0000000..48393dc --- /dev/null +++ b/hlink_config/config/test_conf_flag_run.json @@ -0,0 +1,7 @@ +{ + "column_mappings": [], + "id_column": "id_conf_flag", + "substitution_columns": [], + "filter": [], + "feature_selections": [] +} diff --git a/hlink_config/config/test_run.json b/hlink_config/config/test_run.json new file mode 100644 index 0000000..891e0b7 --- /dev/null +++ b/hlink_config/config/test_run.json @@ -0,0 +1,7 @@ +{ + "column_mappings": [], + "id_column": "id_rel_dir", + "substitution_columns": [], + "filter": [], + "feature_selections": [] +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a12e563 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.black] +line-length = 88 +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | 
\.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +# project-specific files and directories +| conda_env +| working_dir +''' diff --git a/scala_jar/build.sbt b/scala_jar/build.sbt new file mode 100644 index 0000000..625dc5b --- /dev/null +++ b/scala_jar/build.sbt @@ -0,0 +1,85 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +// The simplest possible sbt build file is just one line: + +scalaVersion := "2.11.12" +// That is, to create a valid sbt build, all you've got to do is define the +// version of Scala you'd like your project to use. + +// ============================================================================ + +// Lines like the above defining `scalaVersion` are called "settings" Settings +// are key/value pairs. In the case of `scalaVersion`, the key is "scalaVersion" +// and the value is "2.12.6" + +// It's possible to define many kinds of settings, such as: + +name := "hlink_lib" +organization := "isrdi" +version := "1.0" + +// Note, it's not required for you to define these three settings. These are +// mostly only necessary if you intend to publish your library's binaries on a +// place like Sonatype or Bintray. + + +// Want to use a published library in your project? +// You can define other libraries as dependencies in your build like this: +libraryDependencies ++= Seq( + "org.scala-lang.modules" %% "scala-parser-combinators" % "1.1.0", + "org.apache.commons" % "commons-text" % "1.4", + "org.apache.spark" % "spark-sql_2.11" % "2.3.1" % "provided", + "org.apache.spark" % "spark-mllib_2.11" % "2.3.1" % "provided" + ) + +// Here, `libraryDependencies` is a set of dependencies, and by using `+=`, +// we're adding the cats dependency to the set of dependencies that sbt will go +// and fetch when it starts up. +// Now, in any Scala file, you can import classes, objects, etc, from cats with +// a regular import. + +// TIP: To find the "dependency" that you need to add to the +// `libraryDependencies` set, which in the above example looks like this: + +// "org.typelevel" %% "cats-core" % "1.1.0" + +// You can use Scaladex, an index of all known published Scala libraries. There, +// after you find the library you want, you can just copy/paste the dependency +// information that you need into your build file. For example, on the +// typelevel/cats Scaladex page, +// https://index.scala-lang.org/typelevel/cats, you can copy/paste the sbt +// dependency from the sbt box on the right-hand side of the screen. + +// IMPORTANT NOTE: while build files look _kind of_ like regular Scala, it's +// important to note that syntax in *.sbt files doesn't always behave like +// regular Scala. For example, notice in this build file that it's not required +// to put our settings into an enclosing object or class. Always remember that +// sbt is a bit different, semantically, than vanilla Scala. + +// ============================================================================ + +// Most moderately interesting Scala projects don't make use of the very simple +// build file style (called "bare style") used in this build.sbt file. Most +// intermediate Scala projects make use of so-called "multi-project" builds. A +// multi-project build makes it possible to have different folders which sbt can +// be configured differently for. 
That is, you may wish to have different +// dependencies or different testing frameworks defined for different parts of +// your codebase. Multi-project builds make this possible. + +// Here's a quick glimpse of what a multi-project build looks like for this +// build, with only one "subproject" defined, called `root`: + +// lazy val root = (project in file(".")). +// settings( +// inThisBuild(List( +// organization := "ch.epfl.scala", +// scalaVersion := "2.12.6" +// )), +// name := "hello-world" +// ) + +// To learn more about multi-project builds, head over to the official sbt +// documentation at http://www.scala-sbt.org/documentation.html diff --git a/scala_jar/project/assembly.sbt b/scala_jar/project/assembly.sbt new file mode 100644 index 0000000..625df64 --- /dev/null +++ b/scala_jar/project/assembly.sbt @@ -0,0 +1,6 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") diff --git a/scala_jar/project/build.properties b/scala_jar/project/build.properties new file mode 100644 index 0000000..d6e3507 --- /dev/null +++ b/scala_jar/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.1.6 diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/ArrayToString.scala b/scala_jar/src/main/scala/com/isrdi/udfs/ArrayToString.scala new file mode 100644 index 0000000..7c17f08 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/ArrayToString.scala @@ -0,0 +1,17 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF1 + +class ArrayToString extends UDF1[Seq[String], String] { + def stringify(x: Seq[String]): String = x match { + case null => null + case _ => s"""[${x.mkString(",")}]""" + } + override def call(s1: Seq[String]): String = { + stringify(s1) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/AttachHHColumn.scala b/scala_jar/src/main/scala/com/isrdi/udfs/AttachHHColumn.scala new file mode 100644 index 0000000..48df1d5 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/AttachHHColumn.scala @@ -0,0 +1,52 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs + +import org.apache.spark.sql.api.java.UDF5 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types._ +import scala.util.control.Breaks._ +import org.apache.spark.sql.SQLContext +import scala.math.abs +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types.StructType +import scala.collection.JavaConverters._ + +class AttachHHColumn { + def createAttachUDF(spark: SQLContext, df: Dataset[Row], transforms: java.util.List[java.util.Map[String, String]], udf_name: String ) = { + val person_id = transforms.get(0).get("person_id") + val attach_udf = (hh_rows: Seq[Row]) => { + val hh_map = hh_rows.map { row => row.getAs[Any](person_id) -> row }.toMap + val new_rows = hh_rows.map { row => + val new_cols = transforms.asScala.map { transform => + val person_pointer = row.getAs[Any](transform.get("person_pointer")) + if (person_pointer != 0) { + val other_row = hh_map.get(person_pointer).get + other_row.getAs[Any](transform.get("other_col")) + } else { + None + } + } + Row.fromSeq(row.toSeq ++ new_cols) + } + new_rows + } + + val old_struct_type = df.schema.find(_.name == "hh_rows").get.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType] + val old_struct_fields = old_struct_type.fields + val new_struct_fields = transforms.asScala.map { transform => + val other_col = old_struct_type.find(_.name == transform.get("other_col")).get + StructField(transform.get("output_col"), other_col.dataType) + } + val schema = ArrayType(StructType(old_struct_fields ++ new_struct_fields)) + val my_udf = udf((hh_rows: Seq[Row]) => attach_udf(hh_rows), schema) + spark.udf.register(udf_name, my_udf) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/AttachRelatedRows.scala b/scala_jar/src/main/scala/com/isrdi/udfs/AttachRelatedRows.scala new file mode 100755 index 0000000..c1e2426 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/AttachRelatedRows.scala @@ -0,0 +1,71 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs + +import org.apache.spark.sql.api.java.UDF5 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types._ +import scala.util.control.Breaks._ +import org.apache.spark.sql.SQLContext +import scala.math.abs +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types.StructType +import scala.collection.JavaConverters._ + +class AttachRelatedRows { + def createAttachUDF(spark: SQLContext, df: Dataset[Row], transforms: java.util.List[java.util.Map[String, Any]], id_col: String, a_or_b: String, udf_name: String ) = { + val attach_udf = (hh_rows: Seq[Row]) => { + val related_rows_list = transforms.asScala.map { transform => + val input_cols = transform.get("input_cols").asInstanceOf[java.util.List[String]].asScala + val filters = transform.get("filters").asInstanceOf[java.util.List[java.util.Map[String,Any]]].asScala + val filtered_hh_rows = hh_rows.filter { row => + filters.map { filter => + val filter_col = filter.get("column").asInstanceOf[String] + val min:Long = filter.get("min").asInstanceOf[Int].toLong + val max:Long = filter.get("max").asInstanceOf[Int].toLong + val filter_val = row.getAs[Long](filter_col).asInstanceOf[Int].toLong + val has_dataset = filter.containsKey("dataset") + if (!has_dataset || (has_dataset && filter.get("dataset").asInstanceOf[String] == a_or_b)) { + (filter_val >= min.toLong) && (filter_val <= max.toLong) + + } else { + true + } + }.forall(x => x) + } + filtered_hh_rows.map { row => (row.getAs[Any](id_col), Row.fromSeq(input_cols.map(row.getAs[Any](_)))) } + } + hh_rows.map { row => + val new_cols = related_rows_list.map { related_rows_list => + related_rows_list.filter { case (id, rel_row) => + val my_id = row.getAs[Any](id_col) + my_id != id + }.map(_._2) + } + Row.fromSeq(row.toSeq ++ new_cols) + } + } + + val old_struct_type = df.schema.find(_.name == "hh_rows").get.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType] + val old_struct_fields = old_struct_type.fields + val new_struct_fields = transforms.asScala.map { transform => + val input_cols = transform.get("input_cols").asInstanceOf[java.util.List[String]].asScala + val output_col = transform.get("output_col").asInstanceOf[String] + val struct_fields = input_cols.map { input_col => + val data_type = old_struct_type.find(_.name == input_col).get.dataType + StructField(input_col, data_type) + } + StructField(output_col, ArrayType(StructType(struct_fields))) + } + val schema = ArrayType(StructType(old_struct_fields ++ new_struct_fields)) + val my_udf = udf((hh_rows: Seq[Row]) => attach_udf(hh_rows), schema) + spark.udf.register(udf_name, my_udf) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/ExtraChildren.scala b/scala_jar/src/main/scala/com/isrdi/udfs/ExtraChildren.scala new file mode 100644 index 0000000..2caab48 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/ExtraChildren.scala @@ -0,0 +1,72 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF8 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ +import com.isrdi.udfs.SerJaroWinklerDistance +import scala.util.control.Breaks._ +import scala.collection.mutable.ArrayBuffer +import scala.math.abs + +class ExtraChildren extends UDF8[Seq[Row], Seq[Row], String, Long, Long, String, String, Map[String, String], Double] { + val distance = new SerJaroWinklerDistance + override def call(y1: Seq[Row], y2: Seq[Row], year_b: String, relate_a: Long, relate_b: Long, jw_threshold: String, age_threshold: String, var_map: Map[String, String]): Double = { + + if (relate_a <= 399 && relate_b <= 399) { + var relate = var_map.getOrElse("relate", "relate") + var year = year_b.toDouble + var birthyr = var_map.getOrElse("byr", "birthyr") + var CA = y1.filter(x => x.getAs[Long](relate) >= 300 && x.getAs[Long](relate) < 400) + var CB = y2.filter(x => x.getAs[Long](relate) >= 300 && x.getAs[Long](relate) < 400 && (year - x.getAs[Long](birthyr) >= 11)) + + if (CB.length > 0) { + if (CA.length > 0) { + var jw_t = jw_threshold.toDouble + var age_t = age_threshold.toDouble + var histid = var_map.getOrElse("histid", "histid") + var name = var_map.getOrElse("name", "namefrst_std") + var sex = var_map.getOrElse("sex", "sex") + var ids_b = Set[String]() + + var good_matches = ArrayBuffer[Tuple3[Double, String, String]]() + for ((r2, i) <- CB.zipWithIndex) { + for ((r1, j) <- CA.zipWithIndex) { + ids_b += r2.getAs[String](histid) + var jw_s = distance.apply(r1.getAs[String](name), r2.getAs[String](name)) + if (abs(r1.getAs[Long](birthyr).toLong - r2.getAs[Long](birthyr).toLong) <= age_t && r1.getAs[Long](sex) == r2.getAs[Long](sex) && jw_s >= jw_t) { + var ma = (jw_s.toDouble, r1.getAs[String](histid), r2.getAs[String](histid)) + good_matches += ma + } + } + } + + var tm_a = Set[String]() + var tm_b = Set[String]() + + if (good_matches.nonEmpty) { + for (m <- good_matches.sortWith(_._1 > _._1)) { + if (!(tm_a contains m._2) && !(tm_b contains m._3)) { + tm_a += m._2 + tm_b += m._3 + } + } + } + var remaining_ids_b = ids_b &~ tm_b + return remaining_ids_b.size + } else { + return CB.length + } + } else { + return 0 + } + } else { + return 0 + } + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/ExtractNeighbors.scala b/scala_jar/src/main/scala/com/isrdi/udfs/ExtractNeighbors.scala new file mode 100644 index 0000000..5c3ca18 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/ExtractNeighbors.scala @@ -0,0 +1,17 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs + +import org.apache.spark.sql.api.java.UDF2 +import org.apache.spark.sql.Row +import org.apache.spark.sql.Dataset + +class ExtractNeighbors extends UDF2[Seq[Row], Long, Seq[String]] { + override def call(rows: Seq[Row], serial: Long): Seq[String] = { + rows.filter(_.getLong(0) != serial).map(_.getString(1)) + } +} + diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/HHCompare.scala b/scala_jar/src/main/scala/com/isrdi/udfs/HHCompare.scala new file mode 100644 index 0000000..91dbe5e --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/HHCompare.scala @@ -0,0 +1,36 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ +import com.isrdi.udfs.SerJaroWinklerDistance +import scala.util.control.Breaks._ +import scala.math.abs + +class HHCompare extends UDF2[Seq[Row], Seq[Row], Double] { + val distance = new SerJaroWinklerDistance + override def call(y1: Seq[Row], y2: Seq[Row]): Double = { + var score_tmp = Array[Double](0.0, 0.0, 0.0, 0.0, 0.0) + var matches = 0.0 + for ((r2, i) <- y2.zipWithIndex) { + breakable { for ((r1, j) <- y1.zipWithIndex) { + score_tmp = Array(0.0, 0.0, 0.0, 0.0) + score_tmp(0) = if (distance.apply(r1.getAs[String]("namefrst_std"), r2.getAs[String]("namefrst_std")) > 0.8) 2.0 else 0.0 + score_tmp(1) = if (abs(r1.getAs[String]("birthyr").toLong - r2.getAs[String]("birthyr").toLong) < 1) 1.0 else 0.0 + score_tmp(2) = if (r1.getAs[String]("bpl") == r2.getAs[String]("bpl")) 1 else 0.0 + score_tmp(3) = if (r1.getAs[String]("sex") == r2.getAs[String]("sex")) 1 else 0.0 + if (score_tmp.sum > 3.9) { + matches += 1 + break + } + } } + } + return matches / y2.length + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/HHDrop.scala b/scala_jar/src/main/scala/com/isrdi/udfs/HHDrop.scala new file mode 100644 index 0000000..d0a9556 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/HHDrop.scala @@ -0,0 +1,18 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ +import scala.math.abs + +class HHDrop extends UDF2[Seq[Row], String, Seq[Row]] { + override def call(rows: Seq[Row], filter_id: String): Seq[Row] = { + return rows.filterNot(_.getAs[String]("id") == filter_id) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/HHGetFirstValue.scala b/scala_jar/src/main/scala/com/isrdi/udfs/HHGetFirstValue.scala new file mode 100755 index 0000000..b864b1e --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/HHGetFirstValue.scala @@ -0,0 +1,18 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs + +import org.apache.spark.sql.api.java.UDF4 +import org.apache.spark.sql.Row +import org.apache.spark.sql.Dataset + +class HHRowsGetFirstValue extends UDF4[Seq[Row], String, String, String, Tuple2[Long, String]] { + override def call(rows: Seq[Row], serial_col: String, pernum_col: String, value_col: String): Tuple2[Long, String] = { + val min:Long = rows.map(_.getAs[Long](pernum_col)).min + val row = rows.find(_.getAs[Long](pernum_col) == min).get + (row.getAs[Long](serial_col), row.getAs[String](value_col)) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/HasMatchingElement.scala b/scala_jar/src/main/scala/com/isrdi/udfs/HasMatchingElement.scala new file mode 100644 index 0000000..7f3b562 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/HasMatchingElement.scala @@ -0,0 +1,34 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 + + +class HasMatchingElement extends UDF2[String, String, Boolean] { + override def call(l1: String, l2: String): Boolean = { + return false; + /*if (l1.size == 0 || l2.size == 0) { + return false; + } + var it1 = l1.iterator; + var it2 = l2.iterator; + var cur1 = it1.next; + var cur2 = it2.next; + if (cur1 == cur2) { + return true; + } + while(it1.hasNext && it2.hasNext) { + if (cur1 == cur2) { + return true; + } else if (cur1 < cur2) { + cur1 = it1.next; + } else if (cur1 > cur2) { + cur2 = it2.next; + } + } + return false;*/ + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/JWCompare.scala b/scala_jar/src/main/scala/com/isrdi/udfs/JWCompare.scala new file mode 100644 index 0000000..86fe97c --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/JWCompare.scala @@ -0,0 +1,16 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 +import com.isrdi.udfs.SerJaroWinklerDistance + + +class JWCompare extends UDF2[String, String, Double] { + val distance = new SerJaroWinklerDistance + override def call(s1: String, s2: String): Double = { + distance.apply(s1, s2) + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/JWRate.scala b/scala_jar/src/main/scala/com/isrdi/udfs/JWRate.scala new file mode 100644 index 0000000..4e943fb --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/JWRate.scala @@ -0,0 +1,33 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF3 +import com.isrdi.udfs.SerJaroWinklerDistance +import scala.math.max + +class JWRate extends UDF3[Seq[String], Seq[String], String, Double] { + val distance = new SerJaroWinklerDistance + override def call(list1: Seq[String], list2: Seq[String], jw_threshold: String): Double = { + + var hits = 0.0 + var max_score = 0.0 + var jw_t = jw_threshold.toDouble + for (s1 <- list1) { + max_score = 0.0 + for (s2 <- list2) { + max_score = max(max_score, distance.apply(s1, s2)) + } + if (max_score > jw_t) { + hits = hits + 1.0 + } + } + if (list1.length > 0) { + hits / list1.length + } else{ + 0 + } + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/JWRelatedRows.scala b/scala_jar/src/main/scala/com/isrdi/udfs/JWRelatedRows.scala new file mode 100644 index 0000000..90cb748 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/JWRelatedRows.scala @@ -0,0 +1,40 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF5 +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ +import com.isrdi.udfs.SerJaroWinklerDistance +import scala.util.control.Breaks._ +import scala.math.abs + +class JWRelatedRows extends UDF5[Seq[Row], Seq[Row], String, String, Map[String, String], Double] { + val distance = new SerJaroWinklerDistance + override def call(y1: Seq[Row], y2: Seq[Row], jw_threshold: String, age_threshold: String, var_map: Map[String, String]): Double = { + var score_tmp = Array[Double](0.0, 0.0, 0.0, 0.0) + var matches = 0.0 + var jw_t = jw_threshold.toDouble + var age_t = age_threshold.toDouble + var name = var_map.getOrElse("name", "namefrst_std") + var byr = var_map.getOrElse("byr", "birthyr") + var sex = var_map.getOrElse("sex", "sex") + for ((r2, i) <- y2.zipWithIndex) { + breakable { for ((r1, j) <- y1.zipWithIndex) { + score_tmp = Array(0.0, 0.0, 0.0) + score_tmp(0) = if (distance.apply(r1.getAs[String](name), r2.getAs[String](name)) >= jw_t) 1.0 else 0.0 + score_tmp(1) = if (abs(r1.getAs[Long](byr).toLong - r2.getAs[Long](byr).toLong) <= age_t) 1.0 else 0.0 + score_tmp(2) = if (r1.getAs[Long](sex) == r2.getAs[Long](sex)) 1 else 0.0 + if (score_tmp.sum == 3) { + matches += 1 + break + } + } } + } + return matches + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/MaxJWCompare.scala b/scala_jar/src/main/scala/com/isrdi/udfs/MaxJWCompare.scala new file mode 100644 index 0000000..f07aff7 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/MaxJWCompare.scala @@ -0,0 +1,23 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 +import com.isrdi.udfs.SerJaroWinklerDistance +import scala.math.max + +class MaxJWCompare extends UDF2[Seq[String], Seq[String], Double] { + val distance = new SerJaroWinklerDistance + override def call(list1: Seq[String], list2: Seq[String]): Double = { + + var max_score = 0.0 + for (s1 <- list1) { + for (s2 <- list2) { + max_score = max(max_score, distance.apply(s1, s2)) + } + } + max_score + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/ParseProbabilityVector.scala b/scala_jar/src/main/scala/com/isrdi/udfs/ParseProbabilityVector.scala new file mode 100644 index 0000000..a251e52 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/ParseProbabilityVector.scala @@ -0,0 +1,19 @@ +// This file is part of the ISRDI's hlink. 
+// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF2 +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.functions._ + +class ParseProbabilityVector extends UDF2[Vector, Int, Double] { + override def call(v: Vector, i: Int): Double = { + if (v.size < 2) { + 0 + } else { + v(i) + } + } +} diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/SerJaroWinklerDistance.scala b/scala_jar/src/main/scala/com/isrdi/udfs/SerJaroWinklerDistance.scala new file mode 100644 index 0000000..b4570f5 --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/SerJaroWinklerDistance.scala @@ -0,0 +1,9 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.commons.text.similarity._ + +class SerJaroWinklerDistance extends JaroWinklerDistance with Serializable { } diff --git a/scala_jar/src/main/scala/com/isrdi/udfs/VectorToString.scala b/scala_jar/src/main/scala/com/isrdi/udfs/VectorToString.scala new file mode 100644 index 0000000..b980d7a --- /dev/null +++ b/scala_jar/src/main/scala/com/isrdi/udfs/VectorToString.scala @@ -0,0 +1,18 @@ +// This file is part of the ISRDI's hlink. +// For copyright and licensing information, see the NOTICE and LICENSE files +// in this project's top-level directory, and also on-line at: +// https://github.com/ipums/hlink + +package com.isrdi.udfs +import org.apache.spark.sql.api.java.UDF1 +import org.apache.spark.ml.linalg.Vector + +class VectorToString extends UDF1[Vector, String] { + def stringify(x: Vector): String = x match { + case null => null + case _ => x.toString + } + override def call(s1: Vector): String = { + stringify(s1) + } +} diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ee43b1f --- /dev/null +++ b/setup.py @@ -0,0 +1,62 @@ +# This file is part of the ISRDI's hlink. 
+# For copyright and licensing information, see the NOTICE and LICENSE files +# in this project's top-level directory, and also on-line at: +# https://github.com/ipums/hlink + +from setuptools import setup, find_packages +import os + + +packages_with_templates = [ + "hlink.linking", + "hlink.linking.preprocessing", + "hlink.linking.matching", + "hlink.linking.hh_matching", +] + +package_data = {"hlink.spark": ["jars/hlink_lib-assembly-1.0.jar"]} +for package in packages_with_templates: + package_path = package.replace(".", "/") + template_files = [] + for root, dirs, files in os.walk(f"{package_path}/templates"): + for file in files: + template_files.append( + os.path.relpath(os.path.join(root, file), package_path) + ) + package_data[package] = template_files + +package_data["hlink.linking"].append("table_definitions.csv") + +install_requires = [ + "colorama==0.4.1", + "ipython==7.0.1", + "Jinja2==2.10", + "numpy==1.19.5", + "pandas==0.25.3", + "pyspark==2.3.1", + "scikit-learn==0.23.1", + "toml==0.10.1", +] + +dev_requires = [ + "pre-commit", + "black==20.8b1", + "flake8==3.7.8", + "sphinx", + "recommonmark", + "pluggy==0.13.1", + "pytest==4.6.3", +] + +setup( + name="hlink", + version="2.0.0", + packages=find_packages(), + package_data=package_data, + install_requires=install_requires, + extras_require={"dev": dev_requires}, + entry_points=""" + [console_scripts] + hlink=hlink.scripts.main:cli + """, +) diff --git a/sphinx-docs/Makefile b/sphinx-docs/Makefile new file mode 100644 index 0000000..28e1fd8 --- /dev/null +++ b/sphinx-docs/Makefile @@ -0,0 +1,24 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +github: + @make html + @cp -a _build/html/. ../docs + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile github + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/sphinx-docs/column_mapping_transforms.md b/sphinx-docs/column_mapping_transforms.md new file mode 100755 index 0000000..bea728c --- /dev/null +++ b/sphinx-docs/column_mapping_transforms.md @@ -0,0 +1,274 @@ +# Column mapping transforms + +Each header below represents a column mapping transform type. Transforms are used in the context of `column_mappings`. + +Some transforms refer to "a" or "b". These mean the transforms apply to columns from only one of the two datasets to be linked (we're trying to link people in dataset "a" with people in dataset "b"). + +More than one transform can be applied to a column. Transforms apply in the order they're listed, so the output of one transform may be the input of another. + +Each transform applies to the column specified by the `column_name` attribute in the config under the `[[column_mappings]]` section. The `transforms` attribute +indicates the type of the transform, which is one of the ones listed below. Along with `type`, there can be additional attributes used by the transform. +These may vary by type, and additional information is given for each type of transform below. Often an additional attribute is just named `value` or `values`. 
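+ +For example, two transforms can be chained so that the output of the first feeds the second. This is a sketch: the raw `namefrst` column here is an assumption for illustration, and both transforms (`lowercase_strip` and `split`) are documented below. + +``` +[[column_mappings]] +alias = "namefrst_split" +column_name = "namefrst" +transforms = [ + { type = "lowercase_strip" }, + { type = "split" } +] +``` + +A single transform looks like this: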
+ +``` +[[column_mappings]] +alias = "namefrst_split" +column_name = "namefrst_clean" +transforms = [ { type = "split" } ] +``` + +## add_to_a + +Add a value to a column from dataset "a". + +``` +transforms = [ { type = "add_to_a", value = 11 } ] +``` + +## concat_to_a + +Concatenate the string value to the end of a column in dataset "a". + +``` +transforms = [ { type = "concat_to_a", value = " "} ] +``` + +## concat_to_b + +Concatenate the string value to the end of a column in dataset "b". + +``` +transforms = [ { type = "concat_to_b", value = " "} ] +``` + +## lowercase_strip + +Used in name cleaning. + +Convert alphabetical characters to lower-case and strip white space characters from the start and end of the strings in the column. + +``` +transforms = [ { type = "lowercase_strip"} ] + +``` + +## rationalize_name_words + +Used in name cleaning. + +Replace '?', '\*', and '-' with spaces. Since people's names in raw census data can contain these +characters, replacing these characters can lead to better matching. + +``` +transforms = [ { type = "rationalize_name_words"} ] +``` + + +## remove_qmark_hyphen + +Used in name cleaning. + +Remove the '?-' from words and replace with nothing. + +``` +transforms = [ { type = "remove_qmark_hyphen"} ] +``` + +## remove_punctuation + +Remove most punctuation and replace with nothing. + +Removes: +``` +? - \ / " ' : , . [ ] { } +``` + +``` +transforms = [ { type = "remove_punctuation"} ] +``` + +## replace_apostrophe + +Used in name cleaning. + +Replace each apostrophe "'" with a space. + +``` +transforms = [ { type = "replace_apostrophe"} ] + +``` + + +## remove_alternate_names + +Used in name cleaning. + +Remove any names following the string 'or'. + +``` +transforms = [ { type = "remove_alternate_names"} ] +``` + + +## remove_suffixes + +Used in name cleaning. + +Given a list of suffixes, remove them from the names in the column. + +``` +transforms=[{ type = "remove_suffixes", values = ["jr", "sr", "ii", "iii"] }] +``` + +## remove_stop_words + +Used in name cleaning. + +Remove last words from names such as street names. + +``` +transforms=[ +{type = "remove_stop_words", values = ['alley','ally','aly','anex','annex','av','ave','aven','avenu','avenue','avn','avnue','avanue','avaneu','bg','blvd','boul','boulevard','brg','bridge','burg','camp','circle','cor', 'corner', 'corners','cors', 'court', 'courts', 'cp', 'cres', 'crescent', 'ct', 'cts', 'dr','driv', 'drive', 'est', 'estate', 'express', 'expressway', 'ext', 'extension', 'ferry', 'fort', 'frt', 'fry', 'ft', 'heights', 'ht', 'hts', 'is', 'island', 'key', 'ky', 'ldg', 'lodge', 'mill', 'mills', 'ml', 'mls', 'mount', 'mountain', 'mountin', 'mt', 'mtn', 'park', 'parkway','pike', 'pikes','pkwy', 'pl', 'place', 'point', 'points', 'pr', 'prairie', 'prk', 'pt', 'pts', 'rad', 'radial', 'rd', 'rds', 'rest', 'riv', 'river', 'road', 'roads', 'rst', 'spgs', 'springs', 'sq', 'square', 'st', 'sta', 'station', 'str', 'street', 'streets', 'strt', 'sts', 'ter', 'terrace', 'track', 'tracks', 'trail', 'trails', 'trnpk', 'turnpike', 'un', 'union', 'valley', 'vally', 'via', 'viaduct', 'vill', 'villag', 'village', 'villiage', 'well', 'wl', 'wl', 'and','of','.',',','-','/','&','south','north','east','west','s','n','e','w','block']}] + +``` + +## remove_prefixes + +Used in name cleaning. + +Remove prefixes like "Ms.", "Mr.", or "Mrs." from names. + +In some census data, "ah" is such a prefix from Chinese names. 
+ +``` +transforms=[{ type = "remove_prefixes", values = ["ah"]}] +``` + +## condense_strip_whitespace + +Used in name cleaning. + +Take white space that may be more than one character long or contain non-space whitespace characters (such as tabs) and replace it with a single space. + +``` + +transforms=[{ type = "condense_strip_whitespace"}] + +``` + +## remove_one_letter_names + +Used in name cleaning. + +If a name is a single character, remove it and leave the white space behind. + +``` +transforms=[{ type = "remove_one_letter_names"}] +``` + + +## split + + +Split the column value on space characters (" "). + +``` +[[column_mappings]] +alias = "namefrst_split" +column_name = "namefrst_clean" +transforms = [ { type = "split" } ] +``` + + + + +## array_index + +If the column contains an array, select the element at the given position. + +This can be used as the input to another transform. In the example below, the first transform selects the second (index 1) item from the "namefrst_split" column that contains a set of names split on white space. Then, the substring 0,1 is selected, which gives the first initial of the person's probable middle name. + +``` +alias = "namefrst_mid_init" +column_name = "namefrst_split" +transforms = [ + { type = "array_index", value = 1}, + { type = "substring", values = [0, 1]} +] +``` + +## mapping + +Map single or multiple values to a single output value, otherwise known as a "recoding." + +``` +[[column_mappings]] +column_name = "birthyr" +alias = "clean_birthyr" +transforms=[ +{type = "mapping", +values = [{"from"=[9999,1999], "to" = ""}, +{"from" = -9998, "to" = 9999} +]}] +``` + +## substring + +Replace a column with a substring of the data in the column. + +``` +transforms = [ + { type = "substring", values = [0, 1]}] +``` + +## divide_by_int + +Divide data in a column by an integer value. It may leave a non-integer result. + +For instance, this transform takes the birthplace variable and converts it from the detailed version to the general version. The two least significant digits are detailed birthplace information; to make the more general version, we simply drop them by dividing by 100 and rounding to the lowest whole number (floor function). + +``` +[[column_mappings]] +column_name = "bpl" +alias = "bpl_root" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] + +``` + + +## when_value + + +Apply conditional logic to replacement of values in a column. Works like the SQL if() or case() expressions in the SQL "select" clause. + +When the value of a column equals "value", replace it with "if_value"; otherwise replace it with "else_value". + +This example replaces all "race" IPUMS codes with 0 (white) or 1 (non-white). An IPUMS code of 100 is the "white" race category. + +``` +column_name = "race" +transforms = [ + { type = "when_value", value = 100, if_value = 0, else_value = 1} +] +``` + + +## get_floor + +Round down to the nearest whole number. + +This example produces the general version of the IPUMS "relate" variable. The variable is coded such that detailed categories are between the hundreds (300 is child of household head, 301 is simply 'child', 302 is adopted child, 303 is step-child for instance). The general categories are usually all that's needed (1 == household head, 2 == spouse, 3 == child, 4 == child-in-law, 5 == parent, 6 == parent-in-law, 7 == sibling, 12 == not related to head).
+ +``` +[[column_mappings]] +alias = "relate_div_100" +column_name = "relate" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] +``` + diff --git a/sphinx-docs/comparison_types.md b/sphinx-docs/comparison_types.md new file mode 100644 index 0000000..06a93b8 --- /dev/null +++ b/sphinx-docs/comparison_types.md @@ -0,0 +1,836 @@ +# Comparison types, transform add-ons, aggregate features, and household aggregate features + +This page has information on the different comparison types available for the `[[comparison_features]]` +section, along with some attributes available to all of the comparison types and some aggregate features +that are not configurable. + +## Comparison types +Each header below represents a comparison type. Comparison types are used in the context of `comparison_features`. + +``` +[[comparison_features]] +alias = "relatematch" +column_name = "relate_div_100" +comparison_type = "equals" +categorical = true +``` + +### maximum_jaro_winkler +Finds the greatest Jaro-Winkler value among the cartesian product of multiple columns. For example, given an input of `column_names = ['namefrst', 'namelast']`, it would return the maximum Jaro-Winkler name comparison value among the following four comparisons: +``` +[('namefrst_a', 'namefrst_b'), + ('namefrst_a', 'namelast_b'), + ('namelast_a', 'namefrst_b'), + ('namelast_a', 'namelast_b')] +``` +* Attributes: + * `column_names` -- Type: list of strings. Required. The list of columns used as input for the set of comparisons generated by taking the cartesian product. + +``` +[[comparison_features]] +alias = "maximum_jw" +column_names = ["namelast", "namefrst"] +comparison_type = "maximum_jaro_winkler" +``` + + +### jaro_winkler + +Returns the Jaro-Winkler comparison score for a given column. +* Attributes: + * `column_name` -- Type: `string`. Required. The column to compare using the Jaro-Winkler score. +``` +[[comparison_features]] +alias = "namefrst_jw" +column_name = "namefrst" +comparison_type = "jaro_winkler" +``` + +### jaro_winkler_street +Uses an additional geographic column value to filter for major location changes before comparing street names. If boundary column A is not equal to boundary column B, a Jaro-Winkler score of zero is returned. If boundary columns A and B are equal, the Jaro-Winkler comparison score of the street columns is returned. +* Attributes: + * `column_name` -- Type: `string`. Required. The input street column. + * `boundary` -- Type: `string`. Required. An input column to match on before comparing street name values. +``` +[[comparison_features]] +alias = "jw_street" +column_name = "street" +boundary = "enum_dist" +comparison_type = "jaro_winkler_street" +``` + +### max_jaro_winkler + +Returns the greatest Jaro-Winkler value from the comparisons of a list of names. +* Attributes: + * `column_name` -- Type: `string`. Required. Input column containing a list of names to compare (such as related household members, or neighborhood surnames).
+``` +[[comparison_features]] +alias = "related_individual_max_jw" +column_name = "namefrst_related" +comparison_type = "max_jaro_winkler" +``` + +### equals + +Asserts that values are the same for both compared columns using SQL: `a.{column_name} IS NOT DISTINCT FROM b.{column_name}` + +``` +[[comparison_features]] +alias = "relatematch" +column_name = "relate_div_100" +comparison_type = "equals" +categorical = true +``` + +### f1_match +Evaluates whether the first initial of first name A matches the first initial of first name B, or the first or second middle initial of B. If so, returns 1; otherwise returns 2. + +1 = First initial of first first name A matches first initial of any of potential match first names B + +2 = mismatch + +Uses the following SQL query: +``` +"CASE WHEN ( + (a.{fi} IS NOT DISTINCT FROM b.{fi}) OR + (a.{fi} IS NOT DISTINCT FROM b.{mi0}) OR + (a.{fi} IS NOT DISTINCT FROM b.{mi1}) +) THEN 1 ELSE 2 END" +``` +* Attributes: + * `first_init_col` -- Type: `string`. Required. First name initial input column. + * `mid_init_cols` -- Type: list of strings. Required. List of first and second middle initial input columns. + +``` +[[comparison_features]] +alias = "f1_match" +first_init_col = "namefrst_init" +mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"] +comparison_type = "f1_match" +categorical = true +``` + +### f2_match +If the first middle initial of A is empty or null, returns 0. +Otherwise, if either the first or second middle initial of A is not null and matches the first name initial of B, or the first or second middle initial of B, returns 1. +Otherwise, returns 2. + +1 = First initial of A second first name matches first initial of any of potential match first names B + +2 = mismatch + +0 = no second first name A + +Uses the following SQL: +``` +CASE WHEN ((a.{mi0} == '') OR (a.{mi0} IS NULL)) THEN 0 WHEN ( + (a.{mi0} IS NOT DISTINCT FROM b.{fi}) OR + ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{fi})) OR + (a.{mi0} IS NOT DISTINCT FROM b.{mi0}) OR + (a.{mi0} IS NOT DISTINCT FROM b.{mi1}) OR + ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi0})) OR + ((a.{mi1} IS NOT NULL) AND (a.{mi1} IS NOT DISTINCT FROM b.{mi1})) +) THEN 1 ELSE 2 END +``` +* Attributes: + * `first_init_col` -- Type: `string`. Required. First name initial input column. + * `mid_init_cols` -- Type: list of strings. Required. List of first and second middle initial input columns. +``` +[[comparison_features]] +alias = "f2_match" +first_init_col = "namefrst_init" +mid_init_cols = ["namefrst_mid_init", "namefrst_mid_init_2"] +comparison_type = "f2_match" +categorical = true +``` + +### not_equals +Asserts that values are distinct between compared individuals using SQL: `a.{column_name} IS DISTINCT FROM b.{column_name}`. Used mainly in caution flag features (f_caution, m_caution, sp_caution). +* Attributes: + * `column_name` -- Type: `string`. Required. Input column to compare. + +``` +[[comparison_features]] +alias = "m_caution" +column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"] +comparison_type = "caution_comp_4" +categorical = true +[comparison_features.comp_a] +column_name = "mbpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "mother_birthyr" +comparison_type = "abs_diff" +gt_threshold = 5 +[comparison_features.comp_c] +column_name = "stepmom" +comparison_type = "parent_step_change" +[comparison_features.comp_d] +column_name = "momloc" +comparison_type = "present_both_years" +``` + +### equals_as_int +Checks for equality using the equals sign and returns the boolean result in integer form. Uses SQL: `CAST(a.{col} = b.{col} as INT)` +* Attributes: + * `column_name` -- Type: `string`. Required. Input column to compare.
+ +``` +[[comparison_features]] +alias = "namelast_equal_as_int" +column_name = "namelast_clean" +comparison_type = "equals_as_int" +``` +### all_equals
Asserts whether the values in all given columns match. Uses a SQL expression generated by joining the `a.{col} = b.{col}` clauses with `AND` for each given column. +* Attributes: + * `column_names` -- Type: list of strings. Required. List of the columns to evaluate for equality across the records being compared. +``` +[[comparison_features]] +alias = "exact" +column_names = ["namefrst_unstd", "namelast_clean"] +comparison_type = "all_equals" +``` + +### or +Allows for the combination of up to four comparison features into one feature using a SQL `OR` between the generated clauses for each sub-comparison. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. + * `comp_c`, `comp_d` -- Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section. +``` +[[comparison_features]] +alias = "sp_caution" +column_names = ["spouse_bpl", "spouse_birthyr", "durmarr"] +comparison_type = "or" +[comparison_features.comp_a] +column_name = "spouse_bpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "spouse_birthyr" +comparison_type = "abs_diff" +lower_threshold = 5 +[comparison_features.comp_c] +column_name = "durmarr" +comparison_type = "new_marr" +upper_threshold = 7 +``` + +### and + +Allows for the combination of up to four comparison features into one feature using a SQL `AND` between the generated clauses for each sub-comparison. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. + * `comp_c`, `comp_d` -- Type: Object. Optional. Sub-comparison using any of the comparison feature types documented in this section. + +In this example, the `and` comparison appears in `[comparison_features.comp_b]`. + +``` +[[comparison_features]] +alias = "street_jw" +comparison_type = "times" +column_names = ["street","county", "statefip"] +[comparison_features.comp_a] +column_name = "street" +comparison_type = "jaro_winkler" +lower_threshold = 0.9 +[comparison_features.comp_b] +comparison_type = "and" +column_names = ["county", "statefip"] +[comparison_features.comp_b.comp_a] +column_name = "county" +comparison_type = "equals" +[comparison_features.comp_b.comp_b] +column_name = "statefip" +comparison_type = "equals" +``` + +### times +Takes the output of two sub-comparisons and multiplies them together after casting as floats. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a` and `comp_b` can also have sub-comparisons, as in the given example.
+``` +[[comparison_features]] +alias = "street_jw" +comparison_type = "times" +column_names = ["street","county", "statefip"] +[comparison_features.comp_a] +column_name = "street" +comparison_type = "jaro_winkler" +lower_threshold = 0.9 +[comparison_features.comp_b] +comparison_type = "and" +column_names = ["county", "statefip"] +[comparison_features.comp_b.comp_a] +column_name = "county" +comparison_type = "equals" +[comparison_features.comp_b.comp_b] +column_name = "statefip" +comparison_type = "equals" +``` + +### caution_comp_3 +Generates an SQL expression in the form `(({expr_a} OR {expr_b}) AND {expr_c})`. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b`, `comp_c` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a`, `comp_b`, and `comp_c` can also have sub-comparisons. +``` +[[comparison_features]] +alias = "sp_caution" +column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"] +comparison_type = "caution_comp_3" +categorical = true +[comparison_features.comp_a] +column_name = "spouse_bpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "spouse_birthyr" +comparison_type = "abs_diff" +gt_threshold = 5 +[comparison_features.comp_c] +column_name = "durmarr" +comparison_type = "new_marr" +upper_threshold = 7 +``` + +### caution_comp_4 +Generates an SQL expression in the form `(({expr_a} OR {expr_b} OR {expr_c}) AND {expr_d})`. +* Attributes: + * `column_names` -- Type: list of strings. Required. A list of all input columns used by sub-comparisons. + * `comp_a`, `comp_b`, `comp_c`, `comp_d` -- Type: Object. Required. Sub-comparison using any of the comparison feature types documented in this section. `comp_a`, `comp_b`, `comp_c`, and `comp_d` can also have sub-comparisons. + +``` +[[comparison_features]] +alias = "m_caution" +column_names = ["mbpl", "mother_birthyr", "stepmom", "momloc"] +comparison_type = "caution_comp_4" +categorical = true +[comparison_features.comp_a] +column_name = "mbpl" +comparison_type = "not_equals" +[comparison_features.comp_b] +column_name = "mother_birthyr" +comparison_type = "abs_diff" +gt_threshold = 5 +[comparison_features.comp_c] +column_name = "stepmom" +comparison_type = "parent_step_change" +[comparison_features.comp_d] +column_name = "momloc" +comparison_type = "present_both_years" +``` + +### any_equals +Used to compare middle initials and first names under specific circumstances. +Returns true if middle initial A is not empty/null and is the same as either middle initial B or first name B, +or if first name A is not empty/null and is the same as middle initial B. +* Attributes: + * `column_names` -- Type: list of strings. Required. The first input column should be the middle initial column, and the second input column should be the first name column. +``` +[[comparison_features]] +alias = "mid_init_match" +column_names = ["namefrst_mid_init", "namefrst_unstd"] +comparison_type = "any_equals" +``` + +### either_are_1 +Checks if the column value for either A or B is equal to 1. +* Attributes: + * `column_name` -- Type: `string`. Required. Input column to compare to 1. + +``` +[[comparison_features]] +alias = "either_1" +column_name = "nativity" +comparison_type = "either_are_1" +categorical = true +``` + +### either_are_0 +Checks if the column value for either A or B is equal to 0. +* Attributes: + * `column_name` -- Type: `string`. Required.
Input column to compare to 0. + +``` +[[comparison_features]] +alias = "either_0" +column_name = "nativity" +comparison_type = "either_are_0" +categorical = true +``` + +### second_gen_imm +Checks if individual A is a second-generation immigrant by looking for `nativity` value of 2, 3, or 4 (one or both parents foreign-born). +* Attributes: + * `column_name` -- Type: `string`. Required. Input should be the name of the nativity column. +``` +[[comparison_features]] +alias = "sgen" +column_name = "nativity" +comparison_type = "second_gen_imm" +categorical = true +``` + +### rel_jaro_winkler
Uses a Scala function to determine the number of people in the input column with a name similarity score (Jaro-Winkler) greater than or equal to the given `jw_threshold`, an age difference less than or equal to the given `age_threshold`, and matching sex for the sample A individual and the sample B potential match. Takes a column generated with the feature selection transform `related_individual_rows` as input (list of person data objects to compare). Can be used for related or unrelated individuals, depending on the input column specified. + +* Attributes: + * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects. + * `name_col` -- Type: `string`. The name of the column containing the first name for comparison. + * `birthyr_col` -- Type: `string`. The name of the column containing the birth year. + * `jw_threshold` -- Type: `float`. The minimum acceptable Jaro-Winkler score to consider a match. + * `age_threshold` -- Type: `int`. The maximum acceptable age difference to consider a match. + +``` +[[comparison_features]] +alias = "rel" +column_name = "namefrst_related_rows" +name_col = "namefrst_unstd" +birthyr_col = "replaced_birthyr" +comparison_type = "rel_jaro_winkler" +jw_threshold = 0.9 +age_threshold = 5 +``` + +### extra_children +Using a Scala function, checks to see if there are children present in sample B who are not present in sample A, but who, based on relate codes, age, sex, and name, we would have expected to be present in A. Returns a count of suspected "extra" children. Takes a column generated with the feature selection transform `related_individual_rows` as input (list of person data objects to compare). +* Attributes: + * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects. + * `relate_col` -- Type: `string`. The name of the column with the `relate` code. + * `histid_col` -- Type: `string`. The name of the id column. + * `name_col` -- Type: `string`. The name of the column containing the first name for comparison. + * `birthyr_col` -- Type: `string`. The name of the column containing the birth year. + * `year_b` -- Type: `int`. The year that sample B was taken. + * `jw_threshold` -- Type: `float`. The minimum acceptable Jaro-Winkler score to consider a match. + * `age_threshold` -- Type: `int`. The maximum acceptable age difference to consider a match. + +``` +[[comparison_features]] +alias = "extra_children" +column_name = "namefrst_related_rows" +relate_col = "relate" +histid_col = "histid" +name_col = "namefrst_unstd" +birthyr_col = "replaced_birthyr" +year_b = 1910 +comparison_type = "extra_children" +jw_threshold = 0.8 +age_threshold = 2 +``` + +### jaro_winkler_rate
Uses a Scala function to calculate the rate of individuals who have a Jaro-Winkler score greater than or equal to the given threshold. The rate is returned as a float (e.g. 0.05 for 5%). +* Attributes: + * `column_name` -- Type: `string`. The input column with data in the form of a list of person data objects. The input column seen below ("namelast_neighbors") was generated using a "neighbor_aggregate" feature selection.
+  * `jw_threshold` -- Type: `float`. The minimum Jaro-Winkler threshold to consider an acceptable match.
+
+In the following example, a `lower_threshold` feature add-on is used to convert the returned rate to a boolean asserting whether it meets the given minimum threshold (>= 5% of neighbors have a Jaro-Winkler score >= 0.95).
+```
+[[comparison_features]]
+alias = "nbors"
+comparison_type = "times"
+column_names = ["namelast_neighbors", "county", "statefip"]
+[comparison_features.comp_a]
+column_name = "namelast_neighbors"
+comparison_type = "jaro_winkler_rate"
+jw_threshold = 0.95
+lower_threshold = 0.05
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+```
+
+### sum
+Adds the column values for A and B together (takes the sum).
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to be added.
+
+```
+[[comparison_features]]
+alias = "namelast_popularity_sum"
+column_name = "namelast_popularity"
+comparison_type = "sum"
+```
+
+### length_b
+Returns the length of the column value in record B using the SQL `size()` function.
+* Attributes:
+  * `column_name` -- Type: `string`. The name of the input column to take the length of in dataset B.
+
+### abs_diff
+Takes the absolute value of the difference between the values of the given column in datasets A and B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression would return the value -1 instead of an absolute difference. For example, if you are evaluating the difference in marriage duration values, and "99" is a placeholder value for "unknown" in the data, you can exclude those values from consideration using this attribute.
+
+```
+[[comparison_features]]
+alias = "byrdiff"
+column_name = "replaced_birthyr"
+comparison_type = "abs_diff"
+
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "abs_diff"
+btwn_threshold = [9, 14]
+categorical = true
+```
+
+### b_minus_a
+Returns the result of subtracting the value of column A from the value of column B.
+* Attributes:
+  * `column_name` -- Type: `string`. The input column to evaluate.
+  * `not_equals` -- Type: `int`. OPTIONAL. You can specify a value for the column to be considered invalid input, in which case the expression would return the value -1 instead of a difference. For example, if you are evaluating the difference in marriage duration values, and "99" is a placeholder value for "unknown" in the data, you can exclude those values from consideration using this attribute.
+```
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "b_minus_a"
+btwn_threshold = [5,14]
+categorical = true
+```
+
+### geo_distance
+Uses a lookup table to find the geographic distance between locations. The SQL expression is generated by `hlink/linking/core/dist_table.py`. There are several ways to configure this feature. You can look up distances in the given file using one or two keys (specified with the `key_count` attribute).
You can also optionally specify a secondary look-up table that serves as a back-up in case the primary look-up does not contain a value for the locations given. This is particularly useful for county distance: you can set the primary join to be across counties, and set up a secondary join on state, which has far fewer combinations and thus less risk of nulls, to fill in when the counties specified aren't in the look-up.
+
+* Attributes:
+  * `key_count` -- Type: `int`. The number of keys used to join on the primary (or only) look-up table. Acceptable values are 1 or 2. For example, to join on both state and county, use `key_count = 2`; to join on state alone, use `key_count = 1`. Each key is counted once, even though it is compared for both records A and B.
+  * `distances_file` -- Type: `string` of path. Path to the distances look-up file.
+  * `table_name` -- Type: `string`. What to name the table that will be generated from the distances file. If you want to do multiple look-ups and the `table_name` is the same across all feature specifications, the file will only be read in once.
+
+  * Attributes for `key_count = 1`:
+    * `column_name` -- Type: `string`. The column in the input data that you want to use as a key to look up the geographic distance.
+    * `loc_a` -- Type: `string`. First column to join on in the look-up table (where to find the value coming from the `column_name` column of record A).
+    * `loc_b` -- Type: `string`. Second column to join on in the look-up table (where to find the value coming from the `column_name` column of record B).
+    * `distance_col` -- Type: `string`. Name of the column containing the geographic distance in the look-up table.
+
+  * Attributes for `key_count = 2`:
+    * `column_names` -- Type: list of strings. The two columns you want to use as keys to look up the geographic distance.
+    * `source_column_a` -- Type: `string`. First column to join on in the source data.
+    * `source_column_b` -- Type: `string`. Second column to join on in the source data.
+    * `loc_a_0` -- Type: `string`. Column in the look-up table joined against `source_column_a` for record A.
+    * `loc_a_1` -- Type: `string`. Column in the look-up table joined against `source_column_a` for record B.
+    * `loc_b_0` -- Type: `string`. Column in the look-up table joined against `source_column_b` for record A.
+    * `loc_b_1` -- Type: `string`. Column in the look-up table joined against `source_column_b` for record B.
+    * `distance_col` -- Type: `string`. Name of the column containing the geographic distance in the look-up table.
+
+  * Attributes if using a secondary join:
+    * `secondary_key_count` -- Type: `int`. The number of keys used to join on the secondary (backup) look-up table. Acceptable values are 1 or 2.
+    * `secondary_table_name` -- Type: `string`. What to name the table that will be generated from the `secondary_distances_file`. If you want to do multiple look-ups and the table name is the same across all feature specifications, the file will only be read in once.
+    * `secondary_distances_file` -- Type: `string` of path. Path to the secondary distances look-up file.
+    * `secondary_source_column` -- Type: `string`. The column in the input data that you want to use as a key in the secondary geographic distance look-up.
+    * `secondary_loc_a` -- Type: `string`. First column to join on in the secondary look-up table.
+    * `secondary_loc_b` -- Type: `string`. Second column to join on in the secondary look-up table.
+    * `secondary_distance_col` -- Type: `string`. Name of the column containing the geographic distance in the secondary look-up table.
+
+```
+[[comparison_features]]
+alias = "state_distance"
+comparison_type = "geo_distance"
+key_count = 1
+table_name = "state_distance_lookup"
+distances_file = "/path/to/county_state_distance.csv"
+column_name = "bpl"
+loc_a = "statecode1"
+loc_b = "statecode2"
+distance_col = "dist"
+
+
+[[comparison_features]]
+alias = "county_distance"
+comparison_type = "geo_distance"
+column_names = ["county", "statefip"]
+key_count = 2
+table_name = "county_distance_lookup"
+distances_file = "/path/to/county_1900_1910_distances_km.csv"
+# columns to join on in the data
+source_column_a = "county"
+source_column_b = "statefip"
+
+# column names from the csv lookup file
+loc_a_0 = "from_icpsrctyi"
+loc_a_1 = "to_icpsrctyi"
+loc_b_0 = "from_statefip"
+loc_b_1 = "to_statefip"
+distance_col = "distance_km"
+
+# SECONDARY JOIN
+secondary_key_count = 1
+secondary_table_name = "state_distance_lookup"
+secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
+secondary_source_column = "statefip"
+secondary_loc_a = "from_statefip"
+secondary_loc_b = "to_statefip"
+secondary_distance_col = "distance_km"
+```
+
+### fetch_a
+
+Gets the value of column A.
+
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The column to get the value from.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "fetch_a"
+categorical = true
+```
+
+
+### fetch_b
+
+Gets the value of column B.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to get the value from.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "fetch_b"
+categorical = true
+```
+
+### present_both_years
+
+Checks whether both column A and column B are present.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
+
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### neither_are_null
+
+Checks that neither column A nor column B is null.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
+
+
+### present_and_not_equal
+
+Checks that column A and column B are both present but are not equal.
+
+* Attributes:
+  * `column_name` -- Type: `string`. The column to check.
+
+## Feature add-ons
+These attributes can be added to most comparison feature types above to extend the type of output returned beyond the standard comparison feature.
+
+### alias
+* Attributes:
+  * `alias` -- Type: `string`. The name for the output column. Should be used at the top-level comparison of every comparison feature.
+```
+[[comparison_features]]
+alias = "jw_f"
+column_name = "father_namefrst"
+comparison_type = "jaro_winkler"
+```
+
+### power
+Raises the comparison feature output to the given power.
+* Attributes:
+  * `power` -- Type: `int`. The power to raise the comparison output to. For example, `power = 2` will square the output.
+```
+[[comparison_features]]
+alias = "county_distance_squared"
+comparison_type = "geo_distance"
+column_names = ["county", "statefip"]
+# PRIMARY JOIN
+# key count: the number of keys used for the join per source file. Ex: for state and county, key_count = 2. For just state, key_count = 1 even though there is county_a and county_b
+key_count = 2
+table_name = "county_distance_lookup"
+distances_file = "/path/to/county_1900_1910_distances_km.csv"
+# columns to join on in the data
+source_column_a = "county"
+source_column_b = "statefip"
+# column names from the csv lookup file
+loc_a_0 = "from_icpsrctyi"
+loc_a_1 = "to_icpsrctyi"
+loc_b_0 = "from_statefip"
+loc_b_1 = "to_statefip"
+distance_col = "distance_km"
+# SECONDARY JOIN
+secondary_key_count = 1
+secondary_table_name = "state_distance_lookup"
+secondary_distances_file = "/path/to/state_1900_1910_distances_km.csv"
+secondary_source_column = "statefip"
+secondary_loc_a = "from_statefip"
+secondary_loc_b = "to_statefip"
+secondary_distance_col = "distance_km"
+power = 2
+```
+
+### threshold
+* Attributes:
+  * `threshold` -- Type: numeric types. Asserts that the comparison feature output is not null and is greater than or equal to (`>=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "imm"
+column_name = "nativity"
+comparison_type = "fetch_a"
+threshold = 5
+categorical = true
+```
+
+### lower_threshold
+* Attributes:
+  * `lower_threshold` -- Type: numeric types. Asserts that the comparison feature output is not null and is greater than or equal to (`>=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "street_jw"
+comparison_type = "times"
+column_names = ["street","county", "statefip"]
+[comparison_features.comp_a]
+column_name = "street"
+comparison_type = "jaro_winkler"
+lower_threshold = 0.9
+[comparison_features.comp_b]
+comparison_type = "and"
+column_names = ["county", "statefip"]
+[comparison_features.comp_b.comp_a]
+column_name = "county"
+comparison_type = "equals"
+[comparison_features.comp_b.comp_b]
+column_name = "statefip"
+comparison_type = "equals"
+```
+
+### upper_threshold
+* Attributes:
+  * `upper_threshold` -- Type: numeric types. Asserts that the comparison feature output is not null and is less than or equal to (`<=`) the given threshold value.
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### gt_threshold
+* Attributes:
+  * `gt_threshold` -- Type: numeric types. Asserts that the comparison feature output is not null and is greater than (`>`) the given threshold value.
+```
+[[comparison_features]]
+alias = "sp_caution"
+column_names = ["spouse_bpl", "spouse_birthyr", "durmarr", "sploc"]
+comparison_type = "caution_comp_4"
+categorical = true
+[comparison_features.comp_a]
+column_name = "spouse_bpl"
+comparison_type = "not_equals"
+[comparison_features.comp_b]
+column_name = "spouse_birthyr"
+comparison_type = "abs_diff"
+gt_threshold = 5
+[comparison_features.comp_c]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+[comparison_features.comp_d]
+column_name = "sploc"
+comparison_type = "present_both_years"
+```
+
+### btwn_threshold
+* Attributes:
+  * `btwn_threshold` -- Type: list of numeric types. Asserts that the comparison feature output is greater than or equal to (`>=`) the first threshold value and less than or equal to (`<=`) the second threshold value.
+```
+[[comparison_features]]
+alias = "mardurmatch"
+column_name = "durmarr"
+not_equals = 99
+comparison_type = "b_minus_a"
+btwn_threshold = [5,14]
+categorical = true
+```
+
+### look_at_addl_var
+* Attributes:
+  * `look_at_addl_var` -- Type: boolean. Flags the program to consider an additional column value before reporting the comparison feature value.
+  * `addl_var` -- Type: `string`. The additional column to consider.
+  * `check_val_expr` -- Type: expression. The expression to use to evaluate the additional column. For example, `check_val_expr = "= 5"`.
+  * `else_val` -- Type: same type as comparison feature output. If the additional column value does not meet the `check_val_expr` specification, the value to return instead of the comparison feature value.
+
+In the following example, the generated SQL expression for the column would be: `CASE WHEN {datasource}.nativity = 5 then {yrimmig abs_diff value} else -1 END`.
+```
+[[comparison_features]]
+alias = "immyear_diff"
+column_name = "yrimmig"
+comparison_type = "abs_diff"
+look_at_addl_var = true
+addl_var = "nativity"
+datasource = "a"
+check_val_expr = "= 5"
+else_val = -1
+```
+
+## Aggregate Features
+These features are not configurable. To include them in the generated comparison features, they just need to be included in the `[training][independent_vars]` section of the config. They are generated using the "aggregate_features" SQL template.
+
+### hits
+The number of potential matches generated for the given individual (counted by aggregating on `{id_column}_a`).
+
+### hits2
+`hits` squared.
+
+### exact_mult
+Indicator for the existence of multiple potential matches within the B data that have the exact same first and last name as the A sample individual. Returns a numeric boolean (0 or 1).
+
+## Household Aggregate Features
+These features are not configurable. To include them in the generated comparison features, they just need to be included in the `[hh_training][independent_vars]` section of the config. They are generated using the "hh_aggregate_features" SQL template.
+
+### jw_max_a
+The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where the birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns `0` if no other individuals are in the household for comparison.
+
+### jw_max_b
+The highest Jaro-Winkler score for any of the first names in linked household A against the first name in linked household B where sex matches and the birth year difference is less than or equal to ten, excluding the individual A in the current potential match. Returns `0` if no other individuals are in the household for comparison.
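+
+For example, to use these household aggregate features in scoring, list them in the `[hh_training]` independent variables. This is a trimmed-down fragment of the `hh_training` example shown in the configuration docs:
+
+```
+[hh_training]
+independent_vars = ["namelast_jw", "jw_max_a", "jw_max_b"]
+```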
diff --git a/sphinx-docs/conf.py b/sphinx-docs/conf.py
new file mode 100644
index 0000000..8180a19
--- /dev/null
+++ b/sphinx-docs/conf.py
@@ -0,0 +1,57 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+
+# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath(".."))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "hlink"
+copyright = "2019-2022, IPUMS"
+author = "Jacob Wellington, Kelly Thompson, Jonas Helgertz, Riley Harper"
+
+version = "2.0.0"
+# The full version, including alpha/beta/rc tags
+release = version
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ["recommonmark", "sphinx.ext.autodoc"]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "alabaster"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/sphinx-docs/config.md b/sphinx-docs/config.md
new file mode 100644
index 0000000..df9ce24
--- /dev/null
+++ b/sphinx-docs/config.md
@@ -0,0 +1,749 @@
+# Configuration
+1. [Basic Example Config File](#basic-config-file)
+2. [Advanced Example Config File](#advanced-config-file)
+3. [Top level configs](#top-level-configs)
+4. [Data sources](#data-sources)
+5. [Filter](#filter)
+6. [Column mappings](#column-mappings)
+7. [Substitution columns](#substitution-columns)
+8. [Feature selections](#feature-selections)
+9. [Potential matches universe](#potential-matches-universe)
+10. [Blocking](#blocking)
+11. [Comparisons](#comparisons)
+12. [Household comparisons](#household-comparisons)
+13. [Comparison features](#comparison-features)
+14. [Pipeline-generated features](#pipeline-generated-features)
+15. [Training and models](#training-and-models)
+16. [Household training and models](#household-training-and-models)
+
+## Basic Config File
+
+The config file tells the hlink program what to link and how to link it. Descriptions of the different sections of
+a configuration file are below. For reference, here is an example of a relatively basic config file.
This config file +is used by the `examples/tutorial/tutorial.py` script for linking, and there is a more detailed discussion of the config +file in the README in `examples/tutorial`. + +Note that this config is written in TOML, but hlink is also able to work with JSON config files. + +``` +id_column = "id" +feature_selections = [] + +[datasource_a] +alias = "a" +file = "data/A.csv" + +[datasource_b] +alias = "b" +file = "data/B.csv" + +[[column_mappings]] +column_name = "NAMEFRST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "NAMELAST" +transforms = [ + {type = "lowercase_strip"} +] + +[[column_mappings]] +column_name = "AGE" +transforms = [ + {type = "add_to_a", value = 10} +] + +[[column_mappings]] +column_name = "SEX" + +[[blocking]] +column_name = "SEX" + +[[blocking]] +column_name = "AGE_2" +dataset = "a" +derived_from = "AGE" +expand_length = 2 +explode = true + +[[comparison_features]] +alias = "NAMEFRST_JW" +column_name = "NAMEFRST" +comparison_type = "jaro_winkler" + +[[comparison_features]] +alias = "NAMELAST_JW" +column_name = "NAMELAST" +comparison_type = "jaro_winkler" + +[comparisons] +operator = "AND" + +[comparisons.comp_a] +comparison_type = "threshold" +feature_name = "NAMEFRST_JW" +threshold = 0.79 + +[comparisons.comp_b] +comparison_type = "threshold" +feature_name = "NAMELAST_JW" +threshold = 0.84 +``` + +## Advanced Config File + +Here is an example of a more complex config file that makes use of more of hlink's features. +It uses machine learning to probabilistically link the two datasets. + +``` +id_column = "histid" +drop_data_from_scored_matches = false + +# --------- DATASOURCES -------------- +[datasource_a] +alias = "us1900" +file = "/path/to/us1900m_usa.P.parquet" + +[datasource_b] +alias = "us1910" +file = "/path/to/us1910m_usa.P.parquet" + +# --------- FILTERS -------------- + +[[filter]] +expression = "NAMELAST is not null and NAMELAST != ''" + +[[filter]] +training_data_subset = true +datasource = "a" + +[[filter]] +expression = "age >= 5" +datasource = "b" + +# --------- COLUMN MAPPINGS -------------- + +[[column_mappings]] +column_name = "serialp" + +[[column_mappings]] +column_name = "sex" + +[[column_mappings]] +column_name = "age" + +[[column_mappings]] +column_name = "namelast" + +[[column_mappings]] +alias = "namefrst_clean" +column_name = "namefrst" +transforms = [ + { type = "lowercase_strip" }, + { type = "rationalize_name_words" }, + { type = "remove_qmark_hyphen"}, + { type = "replace_apostrophe"}, + { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii"] }, + { type = "remove_alternate_names"}, + { type = "condense_strip_whitespace"}, +] + +[[column_mappings]] +alias = "namefrst_split" +column_name = "namefrst_clean" +transforms = [ { type = "split" } ] + +[[column_mappings]] +alias = "namefrst_std" +column_name = "namefrst_split" +transforms = [ + { type = "array_index", value = 0 } +] + +[[column_mappings]] +alias = "bpl_orig" +column_name = "bpl" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] + +[[column_mappings]] +alias = "statefip" +column_name = "statefip_h" + +[[column_mappings]] +column_name = "birthyr" +alias = "clean_birthyr" +[[column_mappings.transforms]] +type = "mapping" +mappings = {9999 = "", 1999 = ""} +output_type = "int" + +[[column_mappings]] +alias = "relate_div_100" +column_name = "relate" +transforms = [ + { type = "divide_by_int", value = 100 }, + { type = "get_floor" } +] + +# --------- SUBSTITUTIONS -------------- + 
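+# swap in standardized versions of first names from sex-specific lookup files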
+[[substitution_columns]]
+column_name = "namefrst_std"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "1"
+substitution_file = "/path/to/name_std/male.csv"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "2"
+substitution_file = "/path/to/name_std/female.csv"
+
+# --------- FEATURE SELECTIONS --------------
+
+[[feature_selections]]
+input_column = "clean_birthyr"
+output_column = "replaced_birthyr"
+condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
+transform = "sql_condition"
+
+[[feature_selections]]
+input_column = "namelast"
+output_column = "namelast_bigrams"
+transform = "bigrams"
+
+[[feature_selections]]
+input_column = "bpl_orig"
+output_column = "bpl_clean"
+condition = "case when bpl_str == 'washington' and bpl2_str=='washington' then 53 when (bpl_str is null or bpl_str == '') and bpl2_str=='washington' then 53 when bpl_str == 'washington' and (bpl2_str=='' or bpl2_str is null) then 53 else bpl_orig end"
+transform = "sql_condition"
+
+[[feature_selections]]
+input_column = "bpl_clean"
+output_column = "region"
+transform = "attach_variable"
+region_dict = "/path/to/region.csv"
+col_to_join_on = "bpl"
+col_to_add = "region"
+null_filler = 99
+col_type = "float"
+
+# --------- POTENTIAL MATCHES UNIVERSE -------------
+
+[[potential_matches_universe]]
+expression = "sex == 1"
+
+# --------- BLOCKING --------------
+
+[[blocking]]
+column_name = "sex"
+
+[[blocking]]
+column_name = "birthyr_3"
+dataset = "a"
+derived_from = "replaced_birthyr"
+expand_length = 3
+explode = true
+
+[[blocking]]
+column_name = "namelast_bigrams"
+explode = true
+
+# --------- COMPARISONS --------------
+
+[comparisons]
+operator = "AND"
+
+[comparisons.comp_a]
+comparison_type = "threshold"
+feature_name = "namefrst_std_jw"
+threshold = 0.8
+
+[comparisons.comp_b]
+comparison_type = "threshold"
+feature_name = "namelast_jw"
+threshold = 0.75
+
+# --------- HOUSEHOLD COMPARISONS (post-blocking filters) -------------
+
+[hh_comparisons]
+comparison_type = "threshold"
+feature_name = "byrdiff"
+threshold_expr = "<= 10"
+
+# --------- COMPARISON FEATURES --------------
+
+[[comparison_features]]
+alias = "region"
+column_name = "region"
+comparison_type = "fetch_a"
+categorical = true
+
+[[comparison_features]]
+alias = "namefrst_std_jw"
+column_name = "namefrst_std"
+comparison_type = "jaro_winkler"
+
+[[comparison_features]]
+alias = "namelast_jw"
+column_name = "namelast"
+comparison_type = "jaro_winkler"
+
+[[comparison_features]]
+alias = "sex_equals"
+column_name = "sex"
+comparison_type = "equals"
+categorical = true
+
+[[comparison_features]]
+alias = "relate_a"
+column_name = "relate_div_100"
+comparison_type = "fetch_a"
+
+# --------- PIPELINE-GENERATED FEATURES ------------
+
+[[pipeline_features]]
+input_columns = ["sex_equals", "region"]
+output_column = "sex_region_interaction"
+transformer_type = "interaction"
+
+[[pipeline_features]]
+input_column = "relate_a"
+output_column = "relatetype"
+transformer_type = "bucketizer"
+categorical = true
+splits = [1,3,5,9999]
+
+# --------- TRAINING --------------
+
+[training]
+
+independent_vars = [ "namelast_jw", "region", "hits", "sex_region_interaction", "relatetype"]
+scale_data = false
+
+dataset = "/path/to/training_data.csv"
+dependent_var = "match"
+score_with_model = true
+use_training_data_features = false
+split_by_id_a = true
+decision = "drop_duplicate_with_threshold_ratio"
+
+n_training_iterations = 2
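+# output potential matches that the model repeatedly scores differently than their training label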
+output_suspicious_TD = true
+param_grid = true
+model_parameters = [
+    { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
+    { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] }
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+
+# --------- HOUSEHOLD TRAINING --------------
+
+[hh_training]
+
+prediction_col = "prediction"
+hh_col = "serialp"
+
+independent_vars = ["namelast_jw", "namefrst_std_jw", "relatetype", "sex_equals"]
+scale_data = false
+
+dataset = "/path/to/hh_training_data_1900_1910.csv"
+dependent_var = "match"
+score_with_model = true
+use_training_data_features = false
+split_by_id_a = true
+decision = "drop_duplicate_with_threshold_ratio"
+
+n_training_iterations = 10
+output_suspicious_TD = true
+param_grid = false
+model_parameters = [
+    { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
+    { type = "probit", threshold = 0.5, threshold_ratio = 1.0 }
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+
+```
+
+## Top level configs
+
+These configs should go at the top of your config file under no header:
+
+*id_column*
+
+Required. Specifies the id column that uniquely identifies a record in each dataset.
+```
+id_column = "id"
+```
+
+*drop_data_from_scored_matches*
+
+Optional. Whether the scored potential matches should be output with full feature data or with just ids and match information.
+```
+drop_data_from_scored_matches = false
+```
+
+## Data sources
+
+* Header names: `datasource_a`, `datasource_b`
+* Description: Specifies your input data.
+* Required: True
+* Type: Object
+* Attributes:
+  * `alias` -- Type: `string`. The short name for the datasource. Must be alphanumeric with no spaces.
+  * `file` -- Type: `string`. Required. The path to the input file. The file can be `csv` or `parquet`.
+
+```
+[datasource_a]
+alias = "us1900"
+file = "/path/to/my_file.csv"
+```
+
+## Filter
+
+* Header name: `filter`
+* Description: Specifies filters to apply to your input data.
+* Required: False
+* Type: List
+* Attributes:
+  * `expression` -- Type: `string`. SQL expression to apply to your input datasets. Cannot be combined with `training_data_subset` in a single filter.
+  * `training_data_subset` -- Type: `boolean`. If set to true, will subset your input data to only include records that are also in your training data. Cannot be combined with `expression` in a single filter.
+  * `datasource` -- Type: `string`. If you want to limit the filter to operate only on dataset a or b, you can specify that with this attribute.
+
+```
+[[filter]]
+training_data_subset = true
+datasource = "a"
+
+[[filter]]
+expression = "NAMELAST is not null and NAMELAST != ''"
+
+[[filter]]
+expression = "age >= 5"
+datasource = "b"
+```
+
+
+## [Column Mappings](column_mapping_transforms)
+
+* Header name: `column_mappings`
+* Description: Base column mappings and transformations to extract from your input datasets.
+* Required: True
+* Type: List
+* Attributes:
+  * `alias` -- Type: `string`. Optional. The new name of the column; if not specified, the output column name defaults to `column_name`.
+  * `column_name` -- Type: `string`. Name of the column in the input data. Used as the name of the output column if `alias` is not specified.
+  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in order, to the input data. See the [column mapping transforms](column_mapping_transforms) section for more information.
+
+```
+[[column_mappings]]
+column_name = "age"
+
+[[column_mappings]]
+alias = "namefrst_clean"
+column_name = "namefrst"
+transforms = [
+    { type = "lowercase_strip" },
+    { type = "rationalize_name_words" },
+    { type = "remove_qmark_hyphen"},
+    { type = "replace_apostrophe"},
+    { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"] },
+    { type = "remove_alternate_names"},
+    { type = "condense_strip_whitespace"}
+]
+```
+
+## [Substitution Columns](substitutions)
+
+* Header name: `substitution_columns`
+* Description: Substitutions to apply to data after column mappings.
+* Required: False
+* Type: List
+* Attributes:
+  * `column_name` -- Type: `string`. Required. Column to apply substitutions to.
+  * `substitutions` -- Type: `list`. A list of substitutions to apply. See the [substitutions](substitutions) section for more information.
+
+```
+[[substitution_columns]]
+column_name = "namefrst_std"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "1"
+substitution_file = "/path/to/name_std/male.csv"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "2"
+substitution_file = "/path/to/name_std/female.csv"
+```
+
+
+## [Feature Selections](feature_selection_transforms)
+
+* Header name: `feature_selections`
+* Description: A list of feature selections to apply to the input data after substitutions and column mappings. See the [feature selection transforms](feature_selection_transforms) section for more information, including information on the specific transforms available.
+
+* Required: False
+* Type: List
+* Attributes:
+  * `input_column` -- Type: `string`. Required. The name of the input column.
+  * `output_column` -- Type: `string`. Required. The name of the output column.
+  * `transform` -- Type: `string`. The name of the transform to apply to the column.
+  * Other attributes vary depending on transform type.
+
+```
+[[feature_selections]]
+input_column = "namelast_clean"
+output_column = "namelast_clean_bigrams"
+transform = "bigrams"
+
+[[feature_selections]]
+input_column = "bpl_clean"
+output_column = "region"
+transform = "attach_variable"
+region_dict = "/path/to/region.csv"
+col_to_join_on = "bpl"
+col_to_add = "region"
+null_filler = 99
+col_type = "float"
+```
+
+## Potential Matches Universe
+
+* Header name: `potential_matches_universe`
+* Description: Limits the universe of potential matches that are generated, using an expression applied in a SQL query.
+* Required: False
+* Type: List
+* Attributes:
+  * `expression` -- Type: `string`. Required. The expression to use to filter prepped_df_(a/b) before generating potential matches.
+
+```
+[[potential_matches_universe]]
+# limits potential matches created to only men
+expression = "sex == 1"
+```
+
+## Blocking
+
+* Header name: `blocking`
+* Description: Describes what columns to block on and how to create the blocks for the potential matches.
+* Required: True
+* Type: List
+* Attributes:
+  * `column_name` -- Type: `string`. Required. The name of the column in the existing data to block on if not exploded; the name of the newly exploded column if `explode = true`.
+  * `explode` -- Type: `boolean`. Optional. If true, will attempt to "explode" the column by creating duplicate rows for each value in the column. Only works on columns that are arrays of values or when `expand_length` is set.
+  * `dataset` -- Type: `string`. Optional. Must be `a` or `b` and used in conjunction with `explode`. Will only explode the column from the `a` or `b` dataset when specified.
+  * `derived_from` -- Type: `string`. Used in conjunction with `explode = true`. Specifies an input column from the existing dataset to be exploded.
+  * `expand_length` -- Type: `integer`. When `explode` is used on a column that is an integer, this can be specified to create an array with a range of integer values from (`original_value` minus `expand_length`) to (`original_value` plus `expand_length`). For example, if the input column value for birthyr is 1870, explode is true, and the expand_length is 3, the exploded column birthyr_3 value would be the array [1867, 1868, 1869, 1870, 1871, 1872, 1873].
+
+
+```
+[[blocking]]
+column_name = "bpl"
+
+[[blocking]]
+column_name = "birthyr_3"
+dataset = "a"
+derived_from = "birthyr"
+expand_length = 3
+explode = true
+```
+
+## [Comparisons](comparison_types)
+
+* Header name: `comparisons`
+* Description: A list of comparisons to threshold the potential matches on. Only potential matches that pass the thresholds will be created. See [comparison types](comparison_types) for more information.
+* Required: True
+* Type: Object
+* Attributes:
+  * `comparison_type` -- Type: `string`. Required. See [comparison types](comparison_types) for more information.
+  * `feature_name` -- Type: `string`. Required. The `comparison_feature` to use for the comparison threshold. A `comparison_feature` column by this name must be specified in the `comparison_features` section.
+
+```
+[comparisons]
+operator = "AND"
+
+[comparisons.comp_a]
+comparison_type = "threshold"
+feature_name = "namefrst_jw"
+threshold = 0.79
+
+[comparisons.comp_b]
+comparison_type = "threshold"
+feature_name = "namelast_jw"
+threshold = 0.79
+```
+
+## [Household Comparisons](comparison_types)
+
+* Header name: `hh_comparisons`
+* Description: A list of comparisons to threshold the household potential matches on. Also referred to as post-blocking filters: all household potential matches are created, then only the potential matches that pass the post-blocking filters will be kept for scoring. See [comparison types](comparison_types) for more information.
+* Required: False
+* Type: Object
+* Attributes:
+  * `comparison_type` -- Type: `string`. Required. See [comparison types](comparison_types) for more information.
+  * `feature_name` -- Type: `string`. Required. The `comparison_feature` to use for the comparison threshold. A `comparison_feature` column by this name must be specified in the `comparison_features` section.
+
+```
+[hh_comparisons]
+# only keep household potential matches with an age difference less than or equal to ten years
+comparison_type = "threshold"
+feature_name = "byrdiff"
+threshold_expr = "<= 10"
+```
+
+## [Comparison Features](comparison_types)
+
+* Header name: `comparison_features`
+* Description: A list of comparison features to create when comparing records. Comparisons for individual and household linking rounds are both represented here -- there is no need to duplicate comparisons if used in both rounds; simply specify the `column_name` in the appropriate `training` or `hh_training` section of the config. See the [comparison types](comparison_types) section for more information.
+* Required: True
+* Type: List
+* Attributes:
+  * `alias` -- Type: `string`. Optional. The name of the comparison feature column to be generated. If not specified, the output column will default to `column_name`.
+  * `column_name` -- Type: `string`. The name of the column to compare.
+  * `comparison_type` -- Type: `string`. The name of the comparison type to use. See the [comparison types](comparison_types) section for more information.
+  * `categorical` -- Type: `boolean`. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
+  * Other attributes may be included as well depending on `comparison_type`. See the [comparison types](comparison_types) section for details on each comparison type.
+
+```
+[[comparison_features]]
+alias = "race"
+column_name = "race"
+comparison_type = "equals"
+categorical = true
+
+[[comparison_features]]
+alias = "namefrst_jw"
+column_name = "namefrst_unstd"
+comparison_type = "jaro_winkler"
+
+[[comparison_features]]
+column_name = "durmarr"
+comparison_type = "new_marr"
+upper_threshold = 7
+```
+
+## [Pipeline-generated Features](pipeline_features)
+
+* Header name: `pipeline_features`
+* Description: Features to be added in the model pipeline created for scoring a dataset. These features cannot be used in the `comparisons` section of the config and are for creating more robust ML models. They typically leverage code available in the Spark Pipeline API.
+* Required: False
+* Type: List
+* Attributes:
+  * `transformer_type` -- Type: `string`. Required. See [pipeline features](pipeline_features) for more information on the available transformer types.
+  * `input_column` -- Type: `string`. Either use `input_column` or `input_columns`. Used if a single input column is needed for the pipeline feature.
+  * `input_columns` -- Type: List of strings. Either use `input_column` or `input_columns`. Used if a list of input columns is needed for the pipeline feature.
+  * `output_column` -- Type: `string`. The name of the new pipeline feature column to be generated.
+  * `categorical` -- Type: `boolean`. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
+  * Other attributes may be included as well depending on the particular pipeline feature `transformer_type`.
+
+```
+[[pipeline_features]]
+input_columns = ["sex_equals", "regionf"]
+output_column = "sex_regionf_interaction"
+transformer_type = "interaction"
+
+[[pipeline_features]]
+input_column = "immyear_diff"
+output_column = "immyear_caution"
+transformer_type = "bucketizer"
+categorical = true
+splits = [-1,0,6,11,9999]
+```
+
+## Training and [models](models)
+
+* Header name: `training`
+* Description: Specifies the training data set as well as a myriad of attributes related to training a model, including the dependent variable within that dataset, the independent variables created from the `comparison_features` section, and the different models you want to use for either model exploration or scoring.
+* Required: False
+* Type: Object
+* Attributes:
+  * `dataset` -- Type: `string`. Location of the training dataset. Must be a csv file.
+  * `dependent_var` -- Type: `string`. Name of the dependent variable in the training dataset.
+  * `independent_vars` -- Type: `list`. List of independent variables to use in the model. These must be either part of `pipeline_features` or `comparison_features`.
+  * `chosen_model` -- Type: `object`. The model to train with in the `training` task and score with in the `matching` task. See the [models](models) section for more information on model specifications.
+  * `threshold` -- Type: `float`. The probability threshold at or above which a model prediction is accepted as a true prediction. Can be used to specify a threshold to use for all models, or can be specified within each `chosen_model` and `model_parameters` specification.
+  * `decision` -- Type: `string`. Optional. Specifies which decision function to use to create the final prediction. The first option is `drop_duplicate_a`, which drops any links for which a record in the `a` data set has a predicted match more than one time. The second option is `drop_duplicate_with_threshold_ratio`, which only takes links for which the `a` record has the highest probability out of any of its potential links, and for which the ratio between the best and second-best links for the `a` record meets the `threshold_ratio`.
+  * `threshold_ratio` -- Type: `float`. Optional. For use when `decision` is `drop_duplicate_with_threshold_ratio`. Specifies the smallest possible ratio to accept between the best and second-best links for a given record. Can be used to specify a threshold ratio (beta threshold) to use for all models. Alternatively, unique threshold ratios can be specified in each individual `chosen_model` and `model_parameters` specification.
+  * `model_parameters` -- Type: `list`. Specifies models to test out in the `model_exploration` task. See the [models](models) section for more information on model specifications.
+  * `param_grid` -- Type: `boolean`. Optional. If you would like to evaluate multiple hyper-parameters for a single model type in your `model_parameters` specification, set this to true and give the hyper-parameter inputs as arrays of length >= 1 instead of single values, so that one `model_parameters` row produces multiple model evaluation outputs.
+  * `score_with_model` -- Type: `boolean`. If set to false, will skip the `apply_model` step of the matching task. Use this if you want to use the `run_all_steps` command and are just trying to generate potential links, such as for the creation of training data.
+  * `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task.
+  * `scale_data` -- Type: `boolean`. Optional. Whether to scale the data as part of the machine learning pipeline.
+  * `use_training_data_features` -- Type: `boolean`. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to `true`, or training features will not be able to be generated, giving null column errors. For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to `true` or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data. If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to `false`, so you can be sure the training features are created from scratch to match your exact current configuration settings. If you know the features haven't changed, you could set it to `true` to save a small amount of processing time.
+  * `output_suspicious_TD` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data. Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model or increased representation in the training data set.
+  * `split_by_id_a` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a "A304BT" has three potential matches in the training data, one each to histid_b "B200", "C201", and "D425", all of those potential matches would end up in either the "train" split or the "test" split when evaluating the model performance.
+  * `feature_importances` -- Type: `boolean`. Optional, and currently not functional. Whether to record feature importances for the training features when training or evaluating an ML model.
+
+
+```
+[training]
+independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"]
+scale_data = false
+dataset = "/path/to/1900_1910_training_data_20191023.csv"
+dependent_var = "match"
+use_training_data_features = false
+output_suspicious_TD = true
+split_by_id_a = true
+
+score_with_model = true
+feature_importances = true
+
+decision = "drop_duplicate_with_threshold_ratio"
+
+n_training_iterations = 10
+param_grid = false
+model_parameters = [
+    { type = "random_forest", maxDepth = 6, numTrees = 50 },
+    { type = "probit", threshold = 0.5}
+]
+
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+```
+
+## Household training and models
+
+* Header name: `hh_training`
+* Description: Specifies the household training data set as well as a myriad of attributes related to training a model, including the dependent variable within that data set, the independent variables created from the `comparison_features` section, and the different models you want to use.
+* Required: False
+* Type: Object
+* Attributes:
+  * All of the attributes and [models](models) available in [training](#training-and-models) may also be used here.
+  * `prediction_col` -- Type: `string`. Required. The name of the column that the final prediction value is recorded in during the individual linking round scoring step.
+  * `hh_col` -- Type: `string`. Required. The name of the column with the household identifier.
+ +``` +[hh_training] +prediction_col = "prediction" +hh_col = "serialp" + +independent_vars = ["namelast_jw","namefrst_jw","namefrst_std_jw", "jw_max_a", "jw_max_b", "f1_match", "f2_match", "byrdifcat", "racematch", "imm", "bplmatch", "imm_interacted_bplmatch", "sexmatch", "mardurmatch", "relatetype", "relatematch", "relatetype_interacted_relatematch"] + +scale_data = false +dataset = "/path/to/hh_training_data_1900_1910.csv" +dependent_var = "match" +use_training_data_features = false +output_suspicious_TD = true +split_by_id_a = true +score_with_model = true +feature_importances = true +decision = "drop_duplicate_with_threshold_ratio" + +param_grid = true +n_training_iterations = 10 +model_parameters = [ + { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]}, + { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]} +] + +chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 } +``` diff --git a/sphinx-docs/feature_selection_transforms.md b/sphinx-docs/feature_selection_transforms.md new file mode 100644 index 0000000..0e7e332 --- /dev/null +++ b/sphinx-docs/feature_selection_transforms.md @@ -0,0 +1,102 @@ +# Feature Selection transforms + +Each header below represents a feature selection transform. These transforms are used in the context of `feature_selections`. + +``` +[[feature_selections]] +input_column = "clean_birthyr" +output_column = "replaced_birthyr" +condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end" +transform = "sql_condition" +``` + +There are some additional attributes available for all transforms: `checkpoint`, `override_column_a`, `override_column_b`, `set_value_column_a`, `set_value_column_b`. + +## bigrams + +Split the given string column into [bigrams](https://en.wikipedia.org/wiki/Bigram). + +* Attributes: + * `input_column` - Type: `string`. Required. + * `output_column` - Type: `string`. Required. + * `no_first_pad` - Type: boolean. Optional. If set to true, don't prepend a space " " to the column before splitting into bigrams. If false or not provided, do prepend the space. + +``` +[[feature_selections]] +input_column = "namelast_clean" +output_column = "namelast_clean_bigrams" +transform = "bigrams" +``` + +## sql_condition + +Apply the given SQL. + +* Attributes: + * `condition` - Type: `string`. Required. The SQL condition to apply. + * `output_column` - Type: `string`. Required. + +``` +[[feature_selections]] +input_column = "clean_birthyr" +output_column = "replaced_birthyr" +condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end" +transform = "sql_condition" +``` + +## array + +Combine two input columns into an array output column. + +* Attributes: + * `input_columns` - Type: list of strings. Required. The two input columns. + * `output_column` - Type: `string`. Required. + +``` +[[feature_selections]] +input_columns = ["namelast_clean_bigrams", "namefrst_unstd_bigrams"] +output_column = "namelast_frst_bigrams" +transform = "array" +``` + +## union + +Take the set union of two columns that are arrays of strings, returning another +array of strings. + +* Attributes: + * `input_columns` - Type: list of strings. Required. + * `output_column` - Type: `string`. Required. + +## soundex + +Compute the [soundex](https://en.wikipedia.org/wiki/Soundex) encoding of the input column. + +* Attributes: + * `input_column` - Type: `string`. 
Required.
+  * `output_column` - Type: `string`. Required.
+
+```
+[[feature_selections]]
+input_column = "namelast_clean"
+output_column = "namelast_clean_soundex"
+transform = "soundex"
+```
+
+## power
+
+Raise the input column to a given power.
+
+* Attributes:
+  * `input_col` - Type: `string`. Required.
+  * `output_col` - Type: `string`. Required.
+  * `exponent` - Type: `int`. Required. The power to which to raise the input column.
+
+```
+[[feature_selections]]
+input_col = "ncount"
+output_col = "ncount2"
+transform = "power"
+exponent = 2
+```
+
diff --git a/sphinx-docs/index.rst b/sphinx-docs/index.rst
new file mode 100644
index 0000000..1f903fd
--- /dev/null
+++ b/sphinx-docs/index.rst
@@ -0,0 +1,31 @@
+.. hlink documentation master file, created by
+   sphinx-quickstart on Mon Jul 1 14:30:23 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to hlink's documentation!
+=================================
+
+.. toctree::
+   :maxdepth: 2
+
+   introduction
+   installation
+   link_tasks
+   running_the_program
+   Advanced Workflows
+   config
+
+Configuration API
+=================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Configuration API
+
+   Column Mapping <column_mapping_transforms>
+   Comparison Types <comparison_types>
+   Feature Selection <feature_selection_transforms>
+   Pipeline Features <pipeline_features>
+   substitutions
+   models
diff --git a/sphinx-docs/installation.md b/sphinx-docs/installation.md
new file mode 100644
index 0000000..ff82cd9
--- /dev/null
+++ b/sphinx-docs/installation.md
@@ -0,0 +1,15 @@
+# Installation
+
+## Requirements
+Make sure that you have each of these installed on your system.
+
+- [Java 8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)
+- Python >= 3.6
+
+## Installing the program
+
+In the root project directory, run `pip install .`
+
+To install hlink for development work, run `pip install -e .[dev]`. This will install additional
+development dependencies and install hlink in editable mode so that any changes made to the source
+code are automatically built.
diff --git a/sphinx-docs/introduction.md b/sphinx-docs/introduction.md
new file mode 100644
index 0000000..e1627b5
--- /dev/null
+++ b/sphinx-docs/introduction.md
@@ -0,0 +1,15 @@
+# Introduction
+
+## Overview
+
+`hlink` is designed to link two datasets. It allows for probabilistic and deterministic record linkage. It provides functionality for the following production tasks:
+
+1. [Preprocessing](link_tasks.html#preprocessing): preprocess each dataset to clean and transform it in preparation for linking.
+2. [Training](link_tasks.html#training-and-household-training): train machine learning models on a set of features and compare results between models.
+3. [Matching](link_tasks.html#matching): match two datasets using a model created in training or with deterministic rules.
+4. [Household Training](link_tasks.html#training-and-household-training): train machine learning models on a set of features for households and compare results between models.
+5. [Household Matching](link_tasks.html#household-matching): match households between two datasets.
+
+In addition, it provides functionality for the following research/development tasks:
+
+1. [Model Exploration and Household Model Exploration](link_tasks.html#model-exploration-and-household-model-exploration): Use a matrix of models and hyper-parameters to evaluate model performance and select a model to be used in the production run. Also generates reports of suspected false positives and false negatives in the specified training data set if the appropriate config flag is set.
+2. [Reporting](link_tasks.html#reporting): Generate reports on the linked data.
diff --git a/sphinx-docs/link_tasks.md b/sphinx-docs/link_tasks.md
new file mode 100644
index 0000000..6fd8a3d
--- /dev/null
+++ b/sphinx-docs/link_tasks.md
@@ -0,0 +1,73 @@
+# Link Tasks
+
+## Preprocessing
+
+### Overview
+
+Read in raw data and prepare it for linking.
+
+### Task steps
+
+* Step 0: Register raw dataframes with the program. Read raw data in from .parquet or .csv files.
+* Step 1: Prepare the dataframes for linking. Perform substitutions, transformations, and column mappings as requested.
+
+## Training and Household Training
+
+### Overview
+
+Train a machine learning model to use for classification of potential links.
+
+### Task steps
+
+The steps in each of these tasks are the same:
+* Step 0: Ingest the training data from a .csv file.
+* Step 1: Create comparison features.
+* Step 2: Train and save the model.
+
+## Matching
+
+### Overview
+
+Run the linking algorithm, generating a table with potential matches between individuals in the two datasets.
+
+### Task steps
+
+* Step 0: Perform blocking, exploding any columns that need it.
+* Step 1: Run the matching algorithm, outputting potential matches to a `potential_matches` table.
+* Step 2: Score the potential matches with the trained model. This step will be automatically skipped if machine learning is not being used.
+
+## Household Matching
+
+### Overview
+
+Generate a table with potential matches between households in the two datasets.
+
+### Task steps
+
+* Step 0: Block on households.
+* Step 1: Filter households based on `hh_comparisons` configuration settings.
+* Step 2: Score the potential matches with the trained model. This step will be automatically skipped if machine learning is not being used.
+
+## Model Exploration and Household Model Exploration
+
+### Overview
+
+There are two dedicated linking tasks for model exploration. `model_exploration` uses configuration settings from the Training section of the config file. `hh_model_exploration` uses configuration settings from the Household Training section of the config file. See documentation of the [`[training]`](config.html#training-and-models) and [`[hh_training]`](config.html#household-training-and-models) config sections for more details.
+
+### Task steps
+The steps in each of these tasks are the same:
+ * Step 0: Ingest the training data file specified in the config with the `dataset` attribute.
+ * Step 1: Create training features on the training data, or use those in the training data file (specified in the respective config section with the `use_training_data_features` flag).
+ * Step 2: Run `n_training_iterations` number of train-test splits on each of the models in the config `model_parameters`.
+
+## Reporting
+
+### Overview
+
+Report on characteristics of the linked data.
+
+### Task steps
+
+* Step 0: For households with anyone linked in round 1, report the percent of remaining household members linked in round 2.
+* Step 1: Report on the representativeness of the linked data compared to the source populations.
+* Step 2: Pull in key demographic data for linked individuals and export a fixed-width crosswalk file.
diff --git a/sphinx-docs/make.bat b/sphinx-docs/make.bat
new file mode 100644
index 0000000..2119f51
--- /dev/null
+++ b/sphinx-docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/sphinx-docs/models.md b/sphinx-docs/models.md
new file mode 100644
index 0000000..6631f5c
--- /dev/null
+++ b/sphinx-docs/models.md
@@ -0,0 +1,60 @@
+# Models
+
+These are the models available for use in the model exploration, training, and household training link tasks.
+
+* Attributes for all models:
+  * `threshold` -- Type: `float`. Alpha threshold (model hyperparameter).
+  * `threshold_ratio` -- Type: `float`. Beta threshold (de-duplication distance ratio).
+  * Any parameters available in the model as defined in the Spark documentation can be passed as params using the label given in the Spark docs. Commonly used parameters are listed below with descriptive explanations from the Spark docs.
+
+## random_forest
+
+Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier). Returns probability as an array.
+* Parameters:
+  * `maxDepth` -- Type: `int`. Maximum depth of the tree. Spark default value is 5.
+  * `numTrees` -- Type: `int`. The number of trees to train. Spark default value is 20, must be >= 1.
+  * `featureSubsetStrategy` -- Type: `string`. Per the Spark docs: "The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]."
+
+```
+model_parameters = { type = "random_forest", maxDepth = 5, numTrees = 75, featureSubsetStrategy = "sqrt", threshold = 0.15, threshold_ratio = 1.0 }
+```
+
+## probit
+
+Uses [pyspark.ml.regression.GeneralizedLinearRegression](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.regression.GeneralizedLinearRegression) with `family="binomial"` and `link="probit"`.
+
+```
+model_parameters = { type = "probit", threshold = 0.85, threshold_ratio = 1.2 }
+```
+
+## logistic_regression
+
+Uses [pyspark.ml.classification.LogisticRegression](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression).
+
+```
+chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
+```
+
+## decision_tree
+
+Uses [pyspark.ml.classification.DecisionTreeClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier).
+* Parameters:
+  * `maxDepth` -- Type: `int`. Maximum depth of the tree.
+  * `minInstancesPerNode` -- Type: `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
+  * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
+
+```
+chosen_model = { type = "decision_tree", maxDepth = 6, minInstancesPerNode = 2, maxBins = 4 }
+```
+
+## gradient_boosted_trees
+
+Uses [pyspark.ml.classification.GBTClassifier](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier).
+* Parameters:
+  * `maxDepth` -- Type: `int`. Maximum depth of the tree.
+  * `minInstancesPerNode` -- Type: `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
+  * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
+
+```
+chosen_model = { type = "gradient_boosted_trees", maxDepth = 4, minInstancesPerNode = 1, maxBins = 6, threshold = 0.7, threshold_ratio = 1.3 }
+```
diff --git a/sphinx-docs/pipeline_features.md b/sphinx-docs/pipeline_features.md
new file mode 100644
index 0000000..5e07829
--- /dev/null
+++ b/sphinx-docs/pipeline_features.md
@@ -0,0 +1,48 @@
+# Pipeline generated features
+
+## Transformer types
+
+Each header below represents a feature created using a transformation available through the Spark Pipeline API. These transforms are used in the context of `pipeline_features`.
+
+```
+[[pipeline_features]]
+input_column = "immyear_diff"
+output_column = "immyear_caution"
+transformer_type = "bucketizer"
+categorical = true
+splits = [-1,0,6,11,9999]
+
+[[pipeline_features]]
+input_columns = ["race","srace"]
+output_column = "race_interacted_srace"
+transformer_type = "interaction"
+
+```
+
+### interaction
+
+Interact two or more features, creating a vectorized result.
+
+```
+[[pipeline_features]]
+# interact the categorical features for mother caution flag, mother present flag, and mother jaro-winkler score
+input_columns = ["m_caution", "m_pres", "jw_m"]
+output_column = "m_interacted_jw_m"
+transformer_type = "interaction"
+```
+
+### bucketizer
+
+From the `pyspark.ml.feature.Bucketizer()` docs: "Maps a column of continuous features to a column of feature buckets."
+
+* Attributes:
+  * `splits` -- Type: Array of integers. Required for this transformer_type. Per the `pyspark.ml.feature.Bucketizer()` docs: "Split points for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. The splits should be of length >= 3 and strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors."
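+
+For orientation, here is a minimal, self-contained pyspark sketch of what this transform does.
+The data values are made up for illustration; hlink drives `Bucketizer` for you through
+`pipeline_features`, as in the config example below:
+
+```python
+from pyspark.ml.feature import Bucketizer
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([(0.0,), (4.0,), (12.0,)], ["immyear_diff"])
+
+# Bucket the continuous column into the ranges defined by `splits`.
+bucketizer = Bucketizer(
+    splits=[-1, 0, 6, 11, 9999],
+    inputCol="immyear_diff",
+    outputCol="immyear_caution",
+)
+bucketizer.transform(df).show()
+```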
+
+```
+[[pipeline_features]]
+input_column = "relate_a"
+output_column = "relatetype"
+transformer_type = "bucketizer"
+categorical = true
+splits = [1,3,5,9999]
+```
diff --git a/sphinx-docs/running_the_program.md b/sphinx-docs/running_the_program.md
new file mode 100644
index 0000000..3402d38
--- /dev/null
+++ b/sphinx-docs/running_the_program.md
@@ -0,0 +1,256 @@
+# Running hlink
+
+## Using hlink as a Library
+
+hlink can be used as a Python library for scripting linking jobs. It provides some high-level classes and
+functions for interacting with Spark, handling configuration, and running linking tasks and steps.
+
+The main class in the library is `LinkRun`, which represents a complete linking job. It provides access
+to each of the link tasks and their steps. Here is an example script that uses `LinkRun` to do some linking.
+Below we go into more detail on each of the important aspects of the script.
+
+```python
+from hlink.linking.link_run import LinkRun
+from hlink.spark.factory import SparkFactory
+from hlink.configs.load_config import load_conf_file
+
+# First we create a SparkSession with all default configuration settings.
+factory = SparkFactory()
+spark = factory.create()
+
+# Now let's load in our config file.
+config = load_conf_file("./my_conf")
+
+lr = LinkRun(spark, config)
+
+# Get some information about each of the steps in the
+# preprocessing task.
+prep_steps = lr.preprocessing.get_steps()
+for (i, step) in enumerate(prep_steps):
+    print(f"Step {i}:", step)
+    print("Required input tables:", step.input_table_names)
+    print("Generated output tables:", step.output_table_names)
+
+# Run all of the steps in the preprocessing task.
+lr.preprocessing.run_all_steps()
+
+# Run the first two steps in the matching task.
+lr.matching.run_step(0)
+lr.matching.run_step(1)
+
+# Get the potential_matches table.
+matches = lr.get_table("potential_matches")
+
+assert matches.exists()
+
+# Get the Spark DataFrame for the potential_matches table.
+matches_df = matches.df()
+```
+
+Each link task can be accessed through the `LinkRun` as an attribute like `lr.preprocessing` or `lr.hh_model_exploration`.
+Link steps for each task can be run with `task.run_all_steps()` or `task.run_step(i)`. The easiest way to
+access Spark tables is through `lr.get_table()`. This method returns an `hlink.linking.table.Table` object, which provides
+an interface to easily check if the table exists, get its Spark DataFrame, or drop it.
+
+To create a `LinkRun`, we need to set up a `pyspark.sql.SparkSession` object. The most convenient way to do this is through
+the `hlink.spark.factory.SparkFactory` class. `SparkFactory` defines many default configuration values which can be adjusted as needed.
+
+```python
+from hlink.spark.factory import SparkFactory
+
+factory = SparkFactory()
+spark = factory.set_local().set_num_cores(8).set_executor_memory("5G").create()
+```
+
+We'll also need to load in a config to get the `LinkRun` up and running. A config is
+a dictionary with string keys, often read in from a TOML or JSON file. The
+`hlink.configs.load_config.load_conf_file` function is helpful for reading in config files,
+as are the `json` and `toml` python modules. For more information on writing config files,
+please see the [Configuration](config) page.
+
+In the `examples/tutorial` directory there is an example script that uses hlink as a library to
+link people between two datasets. The example includes a working config file.
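+
+As a quick illustration of the "config is just a dictionary" point above, here is a sketch that
+reads a TOML file with the `toml` module instead of `load_conf_file` (the `./my_conf.toml` path
+is hypothetical):
+
+```python
+import toml
+
+from hlink.linking.link_run import LinkRun
+from hlink.spark.factory import SparkFactory
+
+# Any dictionary with the right string keys works as a config.
+with open("./my_conf.toml") as f:
+    config = toml.load(f)
+
+spark = SparkFactory().create()
+lr = LinkRun(spark, config)
+```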
+
+## Interactive Mode
+
+In addition to its use as a library, hlink provides a command-line interface, which can be started
+with the `hlink` command.
+
+### Starting the program
+
+The program takes as input a TOML or JSON configuration file, described in the [Configuration](config) page. Parameters described in the config include paths to input data files, paths to training data files, instructions for generating machine learning features, and model parameters. The configuration enables reproducible runs that should produce the same results on the same input data.
+
+All input flags can be printed to the console by running `hlink --help`.
+
+```
+cpu ~$ hlink --help
+usage: hlink [-h] [--mesos] [--user USER] [--cores CORES]
+             [--executor_memory EXECUTOR_MEMORY] [--task TASK]
+             [--execute_tasks EXECUTE_TASKS [EXECUTE_TASKS ...]]
+             [--execute_command EXECUTE_COMMAND [EXECUTE_COMMAND ...]]
+             [--conf CONF]
+
+Historical linking program.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --mesos               run on mesos at isrdi. Must be on isrdi machines to
+                        work.
+  --user USER           run as a specific user
+  --cores CORES         the max number of cores to use on mesos
+  --executor_memory EXECUTOR_MEMORY
+                        the memory per executor to use
+  --task TASK           The initial task to begin processing.
+  --execute_tasks EXECUTE_TASKS [EXECUTE_TASKS ...]
+                        Execute a series of tasks then exit the program.
+  --execute_command EXECUTE_COMMAND [EXECUTE_COMMAND ...]
+                        Execute a single command then exit the program.
+  --conf CONF, --run CONF
+                        Specify a filepath where your config file for the run
+                        is located.
+```
+
+To run the program in interactive mode using a configuration file at a specified path, say `./full_count_1870_1880.toml`, run a command following this pattern:
+
+```bash
+hlink --conf=./full_count_1870_1880.toml
+```
+
+After the program has started, you will see a prompt that looks like this:
+
+```
+hlink $
+```
+
+Type `help` or `?` and hit enter to see a list of commands; type `help <command>` to see the help text of a specific command.
+Commands that start with "x_" are experimental. They may be unstable or missing some documentation.
+
+```
+hlink $ ?
+
+Documented commands (type help <topic>):
+========================================
+analyze        get_steps      set_preexisting_tables  x_persist
+borrow_tables  get_tasks      set_print_sql           x_sql
+count          help           show                    x_sqlf
+csv            ipython        showf                   x_summary
+desc           list           x_crosswalk             x_tab
+drop           q              x_hh_tfam               x_tfam
+drop_all       reload         x_hh_tfam_2a            x_tfam_raw
+drop_all_prc   run_all_steps  x_hh_tfam_2b            x_union
+drop_all_temp  run_step       x_load
+get_settings   set_link_task  x_parquet_from_csv
+```
+
+### Running Linking Tasks and Steps
+
+The program is organized into a hierarchy of tasks and steps. The five major tasks are `preprocessing`, `training`, `matching`, `hh_training`, and `hh_matching`, and within each task are multiple steps.
+To see all linking tasks, run the command `get_tasks`. You should see something like this:
+
+```
+hlink $ get_tasks
+Current link task: Preprocessing
+Linking task choices are:
+preprocessing :: Preprocessing
+    Requires no preexisting tables.
+    Produces tables: {'prepped_df_a', 'prepped_df_b', 'raw_df_b', 'raw_df_a'}
+training :: Training
+    Requires tables: {'prepped_df_a', 'prepped_df_b'}
+    Produces tables: {'training_data', 'training_features'}
+matching :: Matching
+    Requires tables: {'prepped_df_a', 'prepped_df_b'}
+    Produces tables: {'scored_potential_matches', 'potential_matches_prepped', 'potential_matches', 'exploded_df_b', 'exploded_df_a', 'predicted_matches'}
+hh_training :: Household Training
+    Requires tables: {'prepped_df_a', 'prepped_df_b'}
+    Produces tables: {'hh_training_features', 'hh_training_data'}
+hh_matching :: Household Matching
+    Requires tables: {'prepped_df_a', 'predicted_matches', 'prepped_df_b'}
+    Produces tables: {'hh_predicted_matches', 'hh_scored_potential_matches', 'hh_potential_matches', 'hh_blocked_matches', 'hh_potential_matchs_prepped'}
+model_exploration :: Model Exploration
+    Requires tables: {'prepped_df_a', 'prepped_df_b'}
+    Produces tables: {'model_eval_training_vectorized', 'model_eval_training_data', 'model_eval_repeat_FPs', 'model_eval_training_features', 'model_eval_training_results', 'model_eval_repeat_FNs'}
+hh_model_exploration :: Household Model Exploration
+    Requires tables: {'prepped_df_a', 'prepped_df_b'}
+    Produces tables: {'hh_model_eval_training_vectorized', 'hh_model_eval_repeat_FPs', 'hh_model_eval_repeat_FNs', 'hh_model_eval_training_results', 'hh_model_eval_training_features', 'hh_model_eval_training_data'}
+reporting :: Reporting
+    Requires tables: {'prepped_df_a', 'hh_predicted_matches', 'prepped_df_b', 'predicted_matches', 'raw_df_b', 'raw_df_a'}
+    Produces no persistent tables.
+```
+
+Each linking task will interact with Spark tables within the program. To see a list of tables run the command `list`. To also see hidden intermediate tables, run `list all`. If you have just started the program for the first time, you should see no tables created yet:
+
+```
+hlink $ list
++--------+---------+-----------+
+|database|tableName|isTemporary|
++--------+---------+-----------+
++--------+---------+-----------+
+```
+
+To see information about the steps of the task you are currently on, run `get_steps`. You should see something that looks like this:
+
+```txt
+Link task: Preprocessing
+step 0: register raw dataframes
+    Tables used:
+    Tables created:
+        Table 'raw_df_a' <- Preprocessing: Raw data read in from datasource A
+        Table 'raw_df_b' <- Preprocessing: Raw data read in from datasource B
+step 1: prepare dataframes
+    Tables used:
+        Table 'raw_df_a' <- Preprocessing: Raw data read in from datasource A
+        Table 'raw_df_b' <- Preprocessing: Raw data read in from datasource B
+    Tables created:
+        Table 'prepped_df_a' <- Preprocessing: Preprocessed data from source A with selected columns and features
+        Table 'prepped_df_b' <- Preprocessing: Preprocessed data from source B with selected columns and features
+```
+
+To change your current link task, run `set_link_task <task_name>`, where `<task_name>` is the name of the link task.
+
+Once you are sure that you are on the right task, you can use the `run_step <step_num>` command to run a step.
+For example, if you run `run_step 0` you should see something like this:
+
+```
+hlink $ run_step 0
+Link task: Preprocessing
+Running step 0: register raw dataframes
+Finished step 0: register raw dataframes in 5.85s
+```
+
+After the step is complete, you can run `list` to see what tables it created:
+
+```
+hlink $ list
++--------+---------+-----------+-------------------------------------------------+
+|database|tableName|isTemporary|description                                      |
++--------+---------+-----------+-------------------------------------------------+
+|linking |raw_df_a |false      |Preprocessing: Raw data read in from datasource A|
+|linking |raw_df_b |false      |Preprocessing: Raw data read in from datasource B|
++--------+---------+-----------+-------------------------------------------------+
+```
+
+To run all steps in a task, use the `run_all_steps <tasks>` command, where `<tasks>` is a list of tasks you want to run all the steps for. By default this command will run all the steps for the current task.
+
+### Example interactive mode workflow
+
+1) Create a config file and put it in your hlink config directory.
+   For example:
+   ```
+   /path/to/conf/full_count_1870_1880.toml
+   ```
+
+2) Launch the hlink program in interactive mode:
+   ```bash
+   hlink --conf=/path/to/conf/full_count_1870_1880
+   ```
+3) Run the tasks you want to complete:
+   ```
+   hlink $ run_all_steps preprocessing training matching
+   ```
+4) List the created tables:
+   ```
+   hlink $ list
+   ```
+5) Export the results:
+   ```
+   hlink $ csv predicted_matches /my/output/file.csv
+   ```
diff --git a/sphinx-docs/substitutions.md b/sphinx-docs/substitutions.md
new file mode 100644
index 0000000..93c9947
--- /dev/null
+++ b/sphinx-docs/substitutions.md
@@ -0,0 +1,49 @@
+# Substitutions
+* Parent header: `substitution_columns`
+* Subheader name: `substitutions`
+* Type: List
+* Attributes:
+  * `substitution_file` -- Type: `string`. Required. Path to the file containing the look-up table to join against for replacement values.
+
+You must supply a substitution file and either specify `regex_word_replace=true` or supply a join value.
+
+## 1:1 substitution by data table
+
+Performs a 1:1 replacement on a filtered subset of the data table. If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file. Used to replace variant name forms with standardized name forms, filtering on sex.
+
+* Attributes:
+  * `join_column` -- Type: `string`. Column to filter input data on.
+  * `join_value` -- Type: `string`. Value to filter for in the input data.
+
+```
+[[substitution_columns]]
+column_name = "namefrst_std"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "1"
+substitution_file = "/path/to/name_std/male.csv"
+
+[[substitution_columns.substitutions]]
+join_column = "sex"
+join_value = "2"
+substitution_file = "/path/to/name_std/female.csv"
+```
+
+## Substitution by regex word replace
+
+Performs word replacement within a column's data string (such as replacing the abbreviation `Ave.` in the string `7th Ave.` with `Avenue` to create `7th Avenue`).
+
+* Attributes:
+  * `regex_word_replace` -- Type: `boolean`. Whether or not to use regex matching on the input data to perform replacement.
+    If `true`, the swap value will still be replaced if it is anywhere in the column data, as long as it is:
+    * at the start of the column data string, or preceded by a space
+    * at the end of the column data string, or followed by a space
+
+```
+[[substitution_columns]]
+column_name = "street_unstd"
+
+[[substitution_columns.substitutions]]
+regex_word_replace = true
+substitution_file = "/path/to/dir/substitutions_street_abbrevs.csv"
+```
diff --git a/sphinx-docs/use_examples.md b/sphinx-docs/use_examples.md
new file mode 100644
index 0000000..e781202
--- /dev/null
+++ b/sphinx-docs/use_examples.md
@@ -0,0 +1,137 @@
+# Advanced Workflow Examples
+
+
+## Export training data after generating features to reuse in different linking years
+
+It is common to have a single training data set that spans two linked years, which is then used to train a model that is applied to a different set of linked years. For example, we have a training data set that spans linked individuals from the 1900 census to the 1910 census. We use this training data to predict links in the full count 1900-1910 linking run, but we also use this training data to link year pairs 1910-1920, 1920-1930, and 1930-1940.
+
+When this training data set is used for the years it was derived from, the only columns necessary are the HISTIDs identifying the individuals in the data and the dependent variable (usually a boolean `match` column) for the model training. Features for the machine learning model generation are created from the source data available in the full count run.
+
+However, when this training data set is used for other years, the program does not have access to the source full count files, and can't generate the ML features from the given data. In this scenario, you would need to save a copy of the `training_features` and `hh_training_features` Spark tables to .csv files so you can point to them in the other year pair runs, and set the `use_training_data_features = true` flag in both the `training` and `hh_training` sections of the configuration.
+
+### Example training data export with generated ML features
+
+1) Create a config file and put it in your hlink config directory.
+
+2) Launch the hlink program in interactive mode:
+
+   ```bash
+   hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G
+   ```
+
+3) Run the preprocessing and training link tasks:
+
+   ```
+   hlink $ run_all_steps preprocessing training
+   ```
+
+4) Ask the program what the arguments for the `csv` command are:
+
+   ```
+   hlink $ ? csv
+   Writes a dataframe out to csv.
+   Arg 1: dataframe
+   Arg 2: path
+   Arg 3 (optional): # of partitions
+   ```
+
+5) Export the results using the `csv` command:
+
+   ```
+   hlink $ csv training_features /my/output/training_data_1900_1910_HLINK_FEATURES.csv
+   ```
+
+6) Continue with other linking work you might need to do with this year pair, otherwise shut down the hlink framework for this pair of linking years:
+
+   ```
+   hlink $ q
+   ```
+
+7) In the config file for the new year pairs (1910-1920, 1920-1930, etc.), point to this new file as your dataset, and set the `use_training_data_features` flag to `true`:
+
+   ```
+   # config file for 1910-1920 linking run using the 1900-1910 training data with hlink-generated features
+   [training]
+
+   # more configs here...
+
+   dataset = "/path/to/training_data_1900_1910_HLINK_FEATURES.csv"
+   dependent_var = "match"
+
+   # This needs to be changed to `true` to use the features we just generated
+   use_training_data_features = true
+
+   # configs continue here...
+ ``` + +8) Launch the hlink program using your new config for the new year pair you want to link. Run your link tasks and export relevant data. + +## ML model exploration and export of lists of potential false positives/negatives in training data +`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models. You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation. + +The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data. This is calculated when running the train/test splits in the regular model exploration tasks if the `output_suspicious_TD` flag is true. + +### Example model exploration and FP/FN export workflow + +1) Create a config file that has a `training` and/or `hh_training` section with model parameters to explore. For example: + + ``` + [training] + + independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"] + + scale_data = false + dataset = "/path/to/training_data_1900_1910.csv" + dependent_var = "match" + + # This would need to be changed to `true` in a run between other years if your + # source data years weren't identical to the linked years of your training data. + use_training_data_features = false + + # VERY IMPORTANT if you want to output FPs/FNs + output_suspicious_TD = true + + split_by_id_a = true + score_with_model = true + feature_importances = false + decision = "drop_duplicate_with_threshold_ratio" + param_grid = true + n_training_iterations = 10 + model_parameters = [ + { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.0, 1.1]}, + { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]} + ] + + # The chosen_model is the final selected model to use in the full count production + # run. This is where you would manually update your config after running model + # exploration and making decisions about your models and hyperparameters. This + # section isn't used by the model exploration task. + chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 } + ``` + +2) Launch the hlink program in interactive mode: + + ```bash + hlink --conf=full_count_1900_1910 --cores 50 --executor_memory 50G + ``` + +3) Run the preprocessing and model exploration link tasks: + + ``` + hlink $ run_all_steps preprocessing model_exploration + ``` + +4) Export the results of the train/test split runs to csv for further analysis. For `training` params, the results will be in the `training_results` table, and for `hh_training` in the `hh_training_results` table. + + ``` + hlink $ csv training_results /my/output/1900_1910_training_results.csv + ``` + +5) Export the potential FPs and FNs to csv. 
For `training` params, the results will be in the `repeat_FPs` and `repeat_FNs` tables, and for `hh_training` in the `hh_repeat_FPs` and `hh_repeat_FNs` tables.
+
+   ```
+   hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv
+   hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv
+   ```
+
+6) Use your preferred methods to analyze the data you've just exported. Update the `chosen_model` in your configuration, and/or create new versions of your training data based on your findings, updating the path to the new training data in your configs.
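+
+This whole workflow can also be scripted with the library interface described in
+[Running hlink](running_the_program.html). A rough sketch follows; the config path and output
+directory are hypothetical, and the table names follow steps 4 and 5 above:
+
+```python
+from hlink.linking.link_run import LinkRun
+from hlink.spark.factory import SparkFactory
+from hlink.configs.load_config import load_conf_file
+
+spark = SparkFactory().set_local().set_num_cores(50).set_executor_memory("50G").create()
+config = load_conf_file("./full_count_1900_1910")
+
+lr = LinkRun(spark, config)
+lr.preprocessing.run_all_steps()
+lr.model_exploration.run_all_steps()
+
+# Export the train/test results and the suspected FP/FN lists.
+# (Note that Spark writes each path as a directory of CSV part files.)
+for name in ["training_results", "repeat_FPs", "repeat_FNs"]:
+    table = lr.get_table(name)
+    if table.exists():
+        table.df().write.csv(f"/my/output/1900_1910_{name}.csv", header=True)
+```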