quality metrics finally works as expected

ncn-foreigners · May 1, 2024 · 9f461d4 · 9f461d4
1 parent c500f2e
commit 9f461d4
Show file tree

Hide file tree

Showing 11 changed files with 231 additions and 68 deletions.
diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths
@@ -1,6 +1,7 @@
 /Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E"
@@ -10,7 +11,9 @@
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F"
diff --git a/R/blocking.R b/R/blocking.R
@@ -12,7 +12,7 @@
 #' @importFrom utils combn
 #'
 #'
-#' @title Main function for blocking records given text data
+#' @title Block records based on text data.
 #'
 #' @author Maciej Beręsewicz
 #'
@@ -271,22 +271,47 @@ blocking <- function(x,
   ## if true are given
   if (!is.null(true_blocks)) {
 
-    ## Graph metrics
-    eval_g1 <- igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE)
-    eval_g2 <- igraph::graph_from_data_frame(true_blocks[, c("x", "y")], directed = FALSE)
+    setDT(true_blocks)
 
-    eval_g1_cl <- igraph::make_clusters(eval_g1, membership = igraph::components(eval_g1, "weak")$membership)
-    eval_g2_cl <- igraph::make_clusters(eval_g2, membership = igraph::components(eval_g2, "weak")$membership)
+    pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")]
+    pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE]
+    pairs_to_eval[is.na(both), both := FALSE]
 
-    eval_metrics <- base::sapply(c("vi", "nmi", "split.join", "rand", "adjusted.rand"),
-                                 igraph::compare, comm1=eval_g1_cl, comm2=eval_g2_cl)
+    true_blocks[pairs_to_eval, on = c("x", "y"), both := TRUE]
+    true_blocks[is.na(both), both := FALSE]
+    true_blocks[, block:=block+max(pairs_to_eval$block)]
+
+    pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == FALSE, .(x,y,block)], fill = TRUE)
+
+    if (!deduplication) {
+
+      pairs_to_eval[, x2:=x+max(y)]
+      pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, block, both)], id.vars = c("block", "both"))
+      pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
+      block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
+      pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
+      pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
+      true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
+      pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]
+
+    } else {
+
+      pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, block, both)], id.vars = c("block", "both"))
+      pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
+      block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
+      pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
+      pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
+      true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
+      pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]
+
+    }
+
+    ## consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2,  nThreads=n_threads)
+    candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2)
+
+    same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]]
+    same_truth <- pairs_to_eval_long$true_id[candidate_pairs[1, ]] == pairs_to_eval_long$true_id[candidate_pairs[2, ]]
 
-    ## standard metrics based on klsh::confusion.from.blocking
-    block_ids <- eval_g1_cl$membership
-    true_ids <- eval_g2_cl$membership
-    candidate_pairs <- utils::combn(length(block_ids), 2)
-    same_block <- block_ids[candidate_pairs[1, ]] == block_ids[candidate_pairs[2, ]]
-    same_truth <- true_ids[candidate_pairs[1, ]] == true_ids[candidate_pairs[2, ]]
     confusion <- table(same_block, same_truth)
 
     fp <- confusion[2, 1]
@@ -295,11 +320,11 @@ blocking <- function(x,
     tn <- confusion[1, 1]
     recall <- tp/(fn + tp)
 
-    eval_metrics2 <- c(recall = tp/(fn + tp), precision = tp/(tp + fp),
-                       fpr = fp/(fp + tn), fnr = fn/(fn + tp),
-                       accuracy = (tp + tn)/(tp + tn + fn + fp),
-                       specificity = tn/(tn + fp))
-    eval_metrics <- c(eval_metrics, eval_metrics2)
+    eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp),
+                      fpr = fp / (fp + tn), fnr = fn / (fn + tp),
+                      accuracy = (tp + tn) / (tp + tn + fn + fp),
+                      specificity = tn / (tn + fp))
+
   }
 
   setorderv(x_df, c("x", "y", "block"))

diff --git a/R/methods.R b/R/methods.R
@@ -2,13 +2,8 @@
 #' @exportS3Method
 print.blocking <- function(x,...) {
 
-  block_ids <- x$result$block
-
-  if (x$deduplication) {
-    blocks_tab <- table(block_ids)
-    block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1)
-  }
-
+  blocks_tab <- table(x$result$block)
+  block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1)
 
   rr <- 1 - sum(choose(table(block_ids), 2))/choose(length(block_ids), 2)
   cat("========================================================\n")
@@ -25,9 +20,7 @@ print.blocking <- function(x,...) {
   if (!is.null(x$metrics)) {
     cat("========================================================\n")
     cat("Evaluation metrics (standard):\n" )
-    print(x$metrics[6:11])
-    cat("\nEvaluation metrics (graph-based):\n" )
-    print(x$metrics[1:5])
+    print(round(x$metrics*100, 4))
 
   }
   invisible(x)

diff --git a/R/reclin2_pair_ann.R b/R/reclin2_pair_ann.R
@@ -6,7 +6,7 @@
 #' @author Maciej Beręsewicz
 #'
 #' @description
-#' Function for the integration with the reclin2 package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code.
+#' Function for the integration with the `reclin2` package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code.
 #'
 #' @param x reference data (a data.frame or a data.table),
 #' @param y query data  (a data.frame or a data.table, default NULL),

diff --git a/README.Rmd b/README.Rmd
@@ -20,24 +20,24 @@ knitr::opts_chunk$set(
 
 ## Description
 
-An R package that aims to block records for data deduplication and record linkage (a.k.a. entity resolution) based on [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package).
+This R package is designed to block records for data deduplication and record linkage (also known as entity resolution) using [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package).
 
-Currently supports the following R packages that binds to specific ANN algorithms:
+It supports the following R packages that bind to specific ANN algorithms:
 
 + [rnndescent](https://cran.r-project.org/package=rnndescent) (default, very powerful, supports sparse matrices),
 + [RcppHNSW](https://cran.r-project.org/package=RcppHNSW) (powerful but does not support sparse matrices), 
 + [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy), 
 + [mlpack](https://cran.r-project.org/package=mlpack) (see `mlpack::lsh` and `mlpack::knn`).
 
-The package also supports integration with the [reclin2](https://cran.r-project.org/package=reclin2) package via `blocking::pair_ann` function.
+The package can be used with the [reclin2](https://cran.r-project.org/package=reclin2) package via the `blocking::pair_ann` function.
 
 ## Funding
 
 Work on this package is supported by the National Science Centre, OPUS 22 grant no. 2020/39/B/HS4/00941.
 
 ## Installation
 
-You can install the development version of the `blocking` package from GitHub with:
+Install the development version of the `blocking` package from GitHub with:
 
 ```{r, eval=FALSE}
 # install.packages("remotes") # uncomment if needed
@@ -53,7 +53,7 @@ library(blocking)
 library(reclin2)
 ```
 
-Generate simple data with two groups (`df_example`) and reference data (`df_base`).
+Generate simple data with three groups (`df_example`) and reference data (`df_base`).
 
 ```{r}
 df_example <- data.frame(txt = c(
@@ -73,7 +73,7 @@ df_example
 df_base
 ```
 
-Deduplication using `blocking` function. Output contains information about:
+Deduplication using the `blocking` function. Output contains information about:
 
   + the method used (where `nnd` refers to the NN descent algorithm), 
   + number of blocks created (here 2 blocks),
@@ -86,17 +86,17 @@ blocking_result <- blocking(x = df_example$txt)
 blocking_result
 ```
 
-Table with blocking which contains:
+Table with blocking results contains:
 
-+ row numbers from the original data
++ row numbers from the original data,
 + block number (integers),
 + distance (from the ANN algorithm).
 
 ```{r}
 blocking_result$result
 ```
 
-Deduplication using the `pair_ann` function for integration with the `reclin2` package. Here I use the pipeline that can be used with the `reclin2` package.
+Deduplication using the `pair_ann` function for integration with the `reclin2` package. Use the pipeline with the `reclin2` package.
 
 ```{r}
 pair_ann(x = df_example, on = "txt") |>
@@ -105,7 +105,8 @@ pair_ann(x = df_example, on = "txt") |>
   select_threshold("threshold", score = "score", threshold = 0.55) |>
   link(selection = "threshold")
 ```
-Record linkage using the same function where `df_base` is the "register" and `df_example` is the reference (query data).
+
+Record linkage using the same function, where `df_base` is the "register" (reference data) and `df_example` is the query data.
 
 ```{r}
 pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |>

diff --git a/README.md b/README.md
@@ -11,13 +11,13 @@ coverage](https://codecov.io/gh/ncn-foreigners/blocking/branch/main/graph/badge.
 
 ## Description
 
-An R package that aims to block records for data deduplication and
-record linkage (a.k.a. entity resolution) based on [approximate nearest
-neighbours algorithms
+This R package is designed to block records for data deduplication and
+record linkage (also known as entity resolution) using [approximate
+nearest neighbours algorithms
 (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs
 (via the `igraph` package).
 
-Currently supports the following R packages that binds to specific ANN
+It supports the following R packages that bind to specific ANN
 algorithms:
 
 - [rnndescent](https://cran.r-project.org/package=rnndescent) (default,
@@ -28,8 +28,8 @@ algorithms:
 - [mlpack](https://cran.r-project.org/package=mlpack) (see
   `mlpack::lsh` and `mlpack::knn`).
 
-The package also supports integration with the
-[reclin2](https://cran.r-project.org/package=reclin2) package via
+The package can be used with the
+[reclin2](https://cran.r-project.org/package=reclin2) package via the
 `blocking::pair_ann` function.
 
 ## Funding
@@ -39,8 +39,7 @@ Work on this package is supported by the National Science Centre, OPUS
 
 ## Installation
 
-You can install the development version of the `blocking` package from
-GitHub with:
+Install the development version of the `blocking` package from GitHub
+with:
 
 ``` r
 # install.packages("remotes") # uncomment if needed
@@ -57,7 +56,7 @@ library(reclin2)
 #> Loading required package: data.table
 ```
 
-Generate simple data with two groups (`df_example`) and reference data
+Generate simple data with three groups (`df_example`) and reference data
 (`df_base`).
 
 ``` r
@@ -91,8 +90,8 @@ df_base
 #> 3       other
 ```
 
-Deduplication using `blocking` function. Output contains information
-about:
+Deduplication using the `blocking` function. Output contains
+information about:
 
 - the method used (where `nnd` refers to the NN descent
   algorithm),
@@ -117,9 +116,9 @@ blocking_result
 #> 2
 ```
 
-Table with blocking which contains:
+Table with blocking results contains:
 
-- row numbers from the original data
+- row numbers from the original data,
 - block number (integers),
 - distance (from the ANN algorithm).
 
@@ -136,8 +135,7 @@ blocking_result$result
 ```
 
 Deduplication using the `pair_ann` function for integration with the
-`reclin2` package. Here I use the pipeline that can be used with the
-`reclin2` package.
+`reclin2` package. Use the pipeline with the `reclin2` package.
 
 ``` r
 pair_ann(x = df_example, on = "txt") |>
@@ -158,8 +156,8 @@ pair_ann(x = df_example, on = "txt") |>
 #> 6:     8     5 montypython           monty
 ```
 
-Record linkage using the same function where `df_base` is the “register”
-and `df_example` is the reference (query data).
+Record linkage using the same function, where `df_base` is the
+“register” (reference data) and `df_example` is the query data.
 
 ``` r
 pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |>

diff --git a/inst/tinytest/test_blocking.R b/inst/tinytest/test_blocking.R
@@ -59,18 +59,16 @@ expect_error(
 expect_equal(
   blocking(x = df_example$txt,
            true_blocks = result$result[, c("x", "y", "block")])$metrics,
-  c(vi = 0, nmi = 1, split.join = 0, rand = 1, adjusted.rand = 1,
-    recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1
-  )
+  c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1)
 )
 
 # check if true_block is a vector
 
-expect_silent(
-  blocking(x = df_example$txt,
-           #true_blocks = result$result$block)
-           true_blocks = result$result[, c("x", "y", "block")])
-)
+# expect_silent(
+#   blocking(x = df_example$txt,
+#            #true_blocks = result$result$block)
+#            true_blocks = result$result[, c("x", "y", "block")])
+# )
 
 
 ## printing

diff --git a/man/blocking.Rd b/man/blocking.Rd
diff --git a/man/pair_ann.Rd b/man/pair_ann.Rd
diff --git a/vignettes/v1-deduplication.Rmd b/vignettes/v1-deduplication.Rmd
@@ -33,7 +33,7 @@ library(reclin2)
 library(data.table)
 ```
 
-Read the `RLdata500` data used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository.
+Read the `RLdata500` data, used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package, from the [dblink](https://github.com/cleanzr/dblink) GitHub repository.
 
 ```{r}
 df <- read.csv("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv")