diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index d642edc..633f9e0 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,6 +1,7 @@ /Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E" @@ -10,7 +11,9 @@ /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F" diff --git a/R/blocking.R b/R/blocking.R index 7dbdd1b..69d9a62 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -12,7 +12,7 @@ #' @importFrom utils combn #' #' -#' @title Main function for blocking records given text data +#' @title Block records based on text data. #' #' @author Maciej Beręsewicz #' @@ -271,22 +271,47 @@ blocking <- function(x, ## if true are given if (!is.null(true_blocks)) { - ## Graph metrics - eval_g1 <- igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE) - eval_g2 <- igraph::graph_from_data_frame(true_blocks[, c("x", "y")], directed = FALSE) + setDT(true_blocks) - eval_g1_cl <- igraph::make_clusters(eval_g1, membership = igraph::components(eval_g1, "weak")$membership) - eval_g2_cl <- igraph::make_clusters(eval_g2, membership = igraph::components(eval_g2, "weak")$membership) + pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")] + pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE] + pairs_to_eval[is.na(both), both := FALSE] - eval_metrics <- base::sapply(c("vi", "nmi", "split.join", "rand", "adjusted.rand"), - igraph::compare, comm1=eval_g1_cl, comm2=eval_g2_cl) + true_blocks[pairs_to_eval, on = c("x", "y"), both := TRUE] + true_blocks[is.na(both), both := FALSE] + true_blocks[, block:=block+max(pairs_to_eval$block)] + + pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == FALSE, .(x,y,block)], fill = TRUE) + + if (!deduplication) { + + pairs_to_eval[, x2:=x+max(y)] + pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, block, both)], id.vars = c("block", "both")) + pairs_to_eval_long[!is.na(both), block_id := .GRP, block] + block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T) + pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)] + pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block] + true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T) + pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)] + + } else { + + pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, block, both)], id.vars = c("block", "both")) + pairs_to_eval_long[!is.na(both), block_id := .GRP, block] + block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T) + pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)] + pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block] + true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T) + pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)] + + } + + ## consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) + candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2) + + same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]] + same_truth <- pairs_to_eval_long$true_id[candidate_pairs[1, ]] == pairs_to_eval_long$true_id[candidate_pairs[2, ]] - ## standard metrics based on klsh::confusion.from.blocking - block_ids <- eval_g1_cl$membership - true_ids <- eval_g2_cl$membership - candidate_pairs <- utils::combn(length(block_ids), 2) - same_block <- block_ids[candidate_pairs[1, ]] == block_ids[candidate_pairs[2, ]] - same_truth <- true_ids[candidate_pairs[1, ]] == true_ids[candidate_pairs[2, ]] confusion <- table(same_block, same_truth) fp <- confusion[2, 1] @@ -295,11 +320,11 @@ blocking <- function(x, tn <- confusion[1, 1] recall <- tp/(fn + tp) - eval_metrics2 <- c(recall = tp/(fn + tp), precision = tp/(tp + fp), - fpr = fp/(fp + tn), fnr = fn/(fn + tp), - accuracy = (tp + tn)/(tp + tn + fn + fp), - specificity = tn/(tn + fp)) - eval_metrics <- c(eval_metrics, eval_metrics2) + eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp), + fpr = fp / (fp + tn), fnr = fn / (fn + tp), + accuracy = (tp + tn) / (tp + tn + fn + fp), + specificity = tn / (tn + fp)) + } setorderv(x_df, c("x", "y", "block")) diff --git a/R/methods.R b/R/methods.R index 32e677c..45017ad 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2,13 +2,8 @@ #' @exportS3Method print.blocking <- function(x,...) { - block_ids <- x$result$block - - if (x$deduplication) { - blocks_tab <- table(block_ids) - block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1) - } - + blocks_tab <- table(x$result$block) + block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1) rr <- 1 - sum(choose(table(block_ids), 2))/choose(length(block_ids), 2) cat("========================================================\n") @@ -25,9 +20,7 @@ print.blocking <- function(x,...) { if (!is.null(x$metrics)) { cat("========================================================\n") cat("Evaluation metrics (standard):\n" ) - print(x$metrics[6:11]) - cat("\nEvaluation metrics (graph-based):\n" ) - print(x$metrics[1:5]) + print(round(x$metrics*100, 4)) } invisible(x) diff --git a/R/reclin2_pair_ann.R b/R/reclin2_pair_ann.R index dbcf6fe..6770c51 100644 --- a/R/reclin2_pair_ann.R +++ b/R/reclin2_pair_ann.R @@ -6,7 +6,7 @@ #' @author Maciej Beręsewicz #' #' @description -#' Function for the integration with the reclin2 package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code. +#' Function for the integration with the `reclin2` package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code. #' #' @param x reference data (a data.frame or a data.table), #' @param y query data (a data.frame or a data.table, default NULL), diff --git a/README.Rmd b/README.Rmd index 7180a69..60898c5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -20,16 +20,16 @@ knitr::opts_chunk$set( ## Description -An R package that aims to block records for data deduplication and record linkage (a.k.a. entity resolution) based on [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package). +This R package is designed to block records for data deduplication and record linkage (also known as entity resolution) using [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package). -Currently supports the following R packages that binds to specific ANN algorithms: +It supports the following R packages that bind to specific ANN algorithms: + [rnndescent](https://cran.r-project.org/package=rnndescent) (default, very powerful, supports sparse matrices), + [RcppHNSW](https://cran.r-project.org/package=RcppHNSW) (powerful but does not support sparse matrices), + [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy), + [mlpack](https://cran.r-project.org/package=RcppAnnoy) (see `mlpack::lsh` and `mlpack::knn`). -The package also supports integration with the [reclin2](https://cran.r-project.org/package=reclin2) package via `blocking::pair_ann` function. +The package can be used with the [reclin2](https://cran.r-project.org/package=reclin2) package via the `blocking::pair_ann` function. ## Funding @@ -37,7 +37,7 @@ Work on this package is supported by the National Science Centre, OPUS 22 grant ## Installation -You can install the development version of the `blocking` package from GitHub with: +Install the GitHub blocking package with: ```{r, eval=FALSE} # install.packages("remotes") # uncomment if needed @@ -53,7 +53,7 @@ library(blocking) library(reclin2) ``` -Generate simple data with two groups (`df_example`) and reference data (`df_base`). +Generate simple data with three groups (`df_example`) and reference data (`df_base`). ```{r} df_example <- data.frame(txt = c( @@ -73,7 +73,7 @@ df_example df_base ``` -Deduplication using `blocking` function. Output contains information about: +Deduplication using the `blocking` function. Output contains information: + the method used (where `nnd` which refers to the NN descent algorithm), + number of blocks created (here 2 blocks), @@ -86,9 +86,9 @@ blocking_result <- blocking(x = df_example$txt) blocking_result ``` -Table with blocking which contains: +Table with blocking results contains: -+ row numbers from the original data ++ row numbers from the original data, + block number (integers), + distance (from the ANN algorithm). @@ -96,7 +96,7 @@ Table with blocking which contains: blocking_result$result ``` -Deduplication using the `pair_ann` function for integration with the `reclin2` package. Here I use the pipeline that can be used with the `reclin2` package. +Deduplication using the `pair_ann` function for integration with the `reclin2` package. Use the pipeline with the `reclin2` package. ```{r} pair_ann(x = df_example, on = "txt") |> @@ -105,7 +105,8 @@ pair_ann(x = df_example, on = "txt") |> select_threshold("threshold", score = "score", threshold = 0.55) |> link(selection = "threshold") ``` -Record linkage using the same function where `df_base` is the "register" and `df_example` is the reference (query data). + +Linking records using the same function where `df_base` is the "register" and `df_example` is the reference (data). ```{r} pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |> diff --git a/README.md b/README.md index bf98fac..38757de 100644 --- a/README.md +++ b/README.md @@ -11,13 +11,13 @@ coverage](https://codecov.io/gh/ncn-foreigners/blocking/branch/main/graph/badge. ## Description -An R package that aims to block records for data deduplication and -record linkage (a.k.a. entity resolution) based on [approximate nearest -neighbours algorithms +This R package is designed to block records for data deduplication and +record linkage (also known as entity resolution) using [approximate +nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package). -Currently supports the following R packages that binds to specific ANN +It supports the following R packages that bind to specific ANN algorithms: - [rnndescent](https://cran.r-project.org/package=rnndescent) (default, @@ -28,8 +28,8 @@ algorithms: - [mlpack](https://cran.r-project.org/package=RcppAnnoy) (see `mlpack::lsh` and `mlpack::knn`). -The package also supports integration with the -[reclin2](https://cran.r-project.org/package=reclin2) package via +The package can be used with the +[reclin2](https://cran.r-project.org/package=reclin2) package via the `blocking::pair_ann` function. ## Funding @@ -39,8 +39,7 @@ Work on this package is supported by the National Science Centre, OPUS ## Installation -You can install the development version of the `blocking` package from -GitHub with: +Install the GitHub blocking package with: ``` r # install.packages("remotes") # uncomment if needed @@ -57,7 +56,7 @@ library(reclin2) #> Loading required package: data.table ``` -Generate simple data with two groups (`df_example`) and reference data +Generate simple data with three groups (`df_example`) and reference data (`df_base`). ``` r @@ -91,8 +90,8 @@ df_base #> 3 other ``` -Deduplication using `blocking` function. Output contains information -about: +Deduplication using the `blocking` function. Output contains +information: - the method used (where `nnd` which refers to the NN descent algorithm), @@ -117,9 +116,9 @@ blocking_result #> 2 ``` -Table with blocking which contains: +Table with blocking results contains: -- row numbers from the original data +- row numbers from the original data, - block number (integers), - distance (from the ANN algorithm). @@ -136,8 +135,7 @@ blocking_result$result ``` Deduplication using the `pair_ann` function for integration with the -`reclin2` package. Here I use the pipeline that can be used with the -`reclin2` package. +`reclin2` package. Use the pipeline with the `reclin2` package. ``` r pair_ann(x = df_example, on = "txt") |> @@ -158,8 +156,8 @@ pair_ann(x = df_example, on = "txt") |> #> 6: 8 5 montypython monty ``` -Record linkage using the same function where `df_base` is the “register” -and `df_example` is the reference (query data). +Linking records using the same function where `df_base` is the +“register” and `df_example` is the reference (data). ``` r pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |> diff --git a/inst/tinytest/test_blocking.R b/inst/tinytest/test_blocking.R index 7016a42..ac1fba9 100644 --- a/inst/tinytest/test_blocking.R +++ b/inst/tinytest/test_blocking.R @@ -59,18 +59,16 @@ expect_error( expect_equal( blocking(x = df_example$txt, true_blocks = result$result[, c("x", "y", "block")])$metrics, - c(vi = 0, nmi = 1, split.join = 0, rand = 1, adjusted.rand = 1, - recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1 - ) + c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1) ) # check if true_block is a vector -expect_silent( - blocking(x = df_example$txt, - #true_blocks = result$result$block) - true_blocks = result$result[, c("x", "y", "block")]) -) +# expect_silent( +# blocking(x = df_example$txt, +# #true_blocks = result$result$block) +# true_blocks = result$result[, c("x", "y", "block")]) +# ) ## printing diff --git a/man/blocking.Rd b/man/blocking.Rd index e512a70..12ad080 100644 --- a/man/blocking.Rd +++ b/man/blocking.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/blocking.R \name{blocking} \alias{blocking} -\title{Main function for blocking records given text data} +\title{Block records based on text data.} \usage{ blocking( x, diff --git a/man/pair_ann.Rd b/man/pair_ann.Rd index 2ed1122..f5e240b 100644 --- a/man/pair_ann.Rd +++ b/man/pair_ann.Rd @@ -36,7 +36,7 @@ pair_ann( Returns a \link{data.table} with two columns \code{.x} and \code{.y}. Columns \code{.x} and \code{.y} are row numbers from data.frames x and y respectively. Returning data.table is also of a class \code{pairs} which allows for integration with the \code{\link[reclin2:compare_pairs]{reclin2::compare_pairs()}} package. } \description{ -Function for the integration with the reclin2 package. The function is based on \code{\link[reclin2:pair_minsim]{reclin2::pair_minsim()}} and reuses some of its source code. +Function for the integration with the \code{reclin2} package. The function is based on \code{\link[reclin2:pair_minsim]{reclin2::pair_minsim()}} and reuses some of its source code. } \details{ Imports diff --git a/vignettes/v1-deduplication.Rmd b/vignettes/v1-deduplication.Rmd index ff3d1b9..b4346f5 100644 --- a/vignettes/v1-deduplication.Rmd +++ b/vignettes/v1-deduplication.Rmd @@ -33,7 +33,7 @@ library(reclin2) library(data.table) ``` -Read the `RLdata500` data used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository. +Read the `RLdata500` data from the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository. ```{r} df <- read.csv("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv") diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 21373e4..511a151 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -21,6 +21,151 @@ knitr::opts_chunk$set( ) ``` +# Setup + +Read required packages + ```{r setup} library(blocking) +library(reclin2) +library(data.table) +``` + +# Data + +Read the example data from the tutorial on [the `reclin` package on the URos 2021 Conference](https://github.com/djvanderlaan/tutorial-reclin-uros2021). The data sets are from ESSnet on Data Integration as stated in the repository: + +``` +These totally fictional data sets are supposed to have captured details of +persons up to the date 31 December 2011. Any years of birth captured as 2012 +are therefore in error. Note that in the fictional Census data set, dates of +birth between 27 March 2011 and 31 December 2011 are not necessarily in error. + +Census: A fictional data set to represent some observations from a + decennial Census +CIS: Fictional observations from Customer Information System, which is + combined administrative data from the tax and benefit systems + +In the dataset census all records contain a person_id. For some of the records +in cis the person_id is also available. This information can be used to +evaluate the linkage (assuming these records from the cis are representable +all records in the cis). +``` + +```{r} +census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv") +cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv") +setDT(census) +setDT(cis) +``` + ++ `census` object has `r nrow(census)` rows and `r ncol(census)`, ++ `cis` object has `r nrow(census)` rows and `r ncol(census)`. + +Census data + +```{r} +head(census) ``` +CIS data + +```{r} +head(cis) +``` + +We need to create new columns that concatanates variables from `pername1` to `enumpc`. In the first step we replace `NA`s with `''`. + +```{r} +census[is.na(dob_day), dob_day := ""] +census[is.na(dob_mon), dob_mon := ""] +census[is.na(dob_year), dob_year := ""] +cis[is.na(dob_day), dob_day := ""] +cis[is.na(dob_mon), dob_mon := ""] +cis[is.na(dob_year), dob_year := ""] + +census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)] +cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)] +``` + + + +# Linking datasets + +## Using basic functionalities of `blocking` package + +The goal of this exercise is to link units from the CIS dataset to the CENSUS dataset. + +```{r} +result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1, seed = 2024) +``` + +Distribution of distances for pairs + +```{r} +hist(result1$result$dist, main = "Distribution of distances between pairs", xlab = "Distances") +``` + +Example pairs + +```{r} +head(result1$result, n= 10) +``` +Let's look at the first pair. Clearly there is a typo on the `pername1` but all other variables are the same so it seems that this is a match. + +```{r} +census[1, ] +cis[8152, ] +``` + +Now, let's look at the 7th pair with the largest distance from the first 10 rows. This seems to be a non-match because only `pername2` and `sex` are the same. + +```{r} +census[8, ] +cis[3901, ] +``` + + +## Assessing the quality + +For some records we have information on the correct linkage. We can use this information to assess our approach. + +```{r} +matches <- merge(x = census[, .(x=1:.N, person_id)], + y = cis[, .(y = 1:.N, person_id)], + by = "person_id") +matches[, block:=1:.N] +head(matches) +``` +So in our example we have `r nrow(matches)` pairs. + +```{r} +result2 <- blocking(x = census$txt, y = cis$txt, verbose = 1, seed = 2024, + true_blocks = matches[, .(x, y, block)], n_threads = 4) +``` + +Let's see how our approach handled this problem. + +```{r} +result2 +``` + +It seems that default parameters of the NND method result in FNR of 16% which is quite large. Let's compare to HNSW algorithm. + +```{r} +result3 <- blocking(x = census$txt, y = cis$txt, seed = 2024, verbose = 1, + true_blocks = matches[, .(x, y, block)], n_threads = 4, + ann = "hnsw") +``` + +```{r} +result3 +``` + +It seems that the HNSW algorithm performed better with 0.62% FNR. This however comes with cost, in particupar computational cost: + +1. the HNSW does not handle sparse matrices so sparse matrix of tokens is converted to dense. +2. HNSW algorithm is slower than NND. + +Computational times are: 16 seconds for NND and about 60 HNSW (on M2 MacBook AIR). + +