Skip to content

Commit

Permalink
quality metrics finally works as expected
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed May 1, 2024
1 parent c500f2e commit 9f461d4
Show file tree
Hide file tree
Showing 11 changed files with 231 additions and 68 deletions.
3 changes: 3 additions & 0 deletions .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E"
Expand All @@ -10,7 +11,9 @@
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F"
63 changes: 44 additions & 19 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#' @importFrom utils combn
#'
#'
#' @title Main function for blocking records given text data
#' @title Block records based on text data.
#'
#' @author Maciej Beręsewicz
#'
Expand Down Expand Up @@ -271,22 +271,47 @@ blocking <- function(x,
## if true blocks are given
if (!is.null(true_blocks)) {

## Graph metrics
eval_g1 <- igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE)
eval_g2 <- igraph::graph_from_data_frame(true_blocks[, c("x", "y")], directed = FALSE)
setDT(true_blocks)

eval_g1_cl <- igraph::make_clusters(eval_g1, membership = igraph::components(eval_g1, "weak")$membership)
eval_g2_cl <- igraph::make_clusters(eval_g2, membership = igraph::components(eval_g2, "weak")$membership)
pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")]
pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE]
pairs_to_eval[is.na(both), both := FALSE]

eval_metrics <- base::sapply(c("vi", "nmi", "split.join", "rand", "adjusted.rand"),
igraph::compare, comm1=eval_g1_cl, comm2=eval_g2_cl)
true_blocks[pairs_to_eval, on = c("x", "y"), both := TRUE]
true_blocks[is.na(both), both := FALSE]
true_blocks[, block:=block+max(pairs_to_eval$block)]

pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == FALSE, .(x,y,block)], fill = TRUE)

if (!deduplication) {

pairs_to_eval[, x2:=x+max(y)]
pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, block, both)], id.vars = c("block", "both"))
pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]

} else {

pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, block, both)], id.vars = c("block", "both"))
pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]

}

## consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads)
candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2)

same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]]
same_truth <- pairs_to_eval_long$true_id[candidate_pairs[1, ]] == pairs_to_eval_long$true_id[candidate_pairs[2, ]]

## standard metrics based on klsh::confusion.from.blocking
block_ids <- eval_g1_cl$membership
true_ids <- eval_g2_cl$membership
candidate_pairs <- utils::combn(length(block_ids), 2)
same_block <- block_ids[candidate_pairs[1, ]] == block_ids[candidate_pairs[2, ]]
same_truth <- true_ids[candidate_pairs[1, ]] == true_ids[candidate_pairs[2, ]]
confusion <- table(same_block, same_truth)

fp <- confusion[2, 1]
Expand All @@ -295,11 +320,11 @@ blocking <- function(x,
tn <- confusion[1, 1]
recall <- tp/(fn + tp)

eval_metrics2 <- c(recall = tp/(fn + tp), precision = tp/(tp + fp),
fpr = fp/(fp + tn), fnr = fn/(fn + tp),
accuracy = (tp + tn)/(tp + tn + fn + fp),
specificity = tn/(tn + fp))
eval_metrics <- c(eval_metrics, eval_metrics2)
eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp),
fpr = fp / (fp + tn), fnr = fn / (fn + tp),
accuracy = (tp + tn) / (tp + tn + fn + fp),
specificity = tn / (tn + fp))

}

setorderv(x_df, c("x", "y", "block"))
Expand Down
13 changes: 3 additions & 10 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,8 @@
#' @exportS3Method
print.blocking <- function(x,...) {

block_ids <- x$result$block

if (x$deduplication) {
blocks_tab <- table(block_ids)
block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1)
}

blocks_tab <- table(x$result$block)
block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1)

rr <- 1 - sum(choose(table(block_ids), 2))/choose(length(block_ids), 2)
cat("========================================================\n")
Expand All @@ -25,9 +20,7 @@ print.blocking <- function(x,...) {
if (!is.null(x$metrics)) {
cat("========================================================\n")
cat("Evaluation metrics (standard):\n" )
print(x$metrics[6:11])
cat("\nEvaluation metrics (graph-based):\n" )
print(x$metrics[1:5])
print(round(x$metrics*100, 4))

}
invisible(x)
Expand Down
2 changes: 1 addition & 1 deletion R/reclin2_pair_ann.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' @author Maciej Beręsewicz
#'
#' @description
#' Function for the integration with the reclin2 package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code.
#' Function for the integration with the `reclin2` package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code.
#'
#' @param x reference data (a data.frame or a data.table),
#' @param y query data (a data.frame or a data.table, default NULL),
Expand Down
21 changes: 11 additions & 10 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,24 @@ knitr::opts_chunk$set(

## Description

An R package that aims to block records for data deduplication and record linkage (a.k.a. entity resolution) based on [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package).
This R package is designed to block records for data deduplication and record linkage (also known as entity resolution) using [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package).

Currently supports the following R packages that binds to specific ANN algorithms:
It supports the following R packages that bind to specific ANN algorithms:

+ [rnndescent](https://cran.r-project.org/package=rnndescent) (default, very powerful, supports sparse matrices),
+ [RcppHNSW](https://cran.r-project.org/package=RcppHNSW) (powerful but does not support sparse matrices),
+ [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy),
+ [mlpack](https://cran.r-project.org/package=mlpack) (see `mlpack::lsh` and `mlpack::knn`).

The package also supports integration with the [reclin2](https://cran.r-project.org/package=reclin2) package via `blocking::pair_ann` function.
The package can be used with the [reclin2](https://cran.r-project.org/package=reclin2) package via the `blocking::pair_ann` function.

## Funding

Work on this package is supported by the National Science Centre, OPUS 22 grant no. 2020/39/B/HS4/00941.

## Installation

You can install the development version of the `blocking` package from GitHub with:
Install the development version of the `blocking` package from GitHub with:

```{r, eval=FALSE}
# install.packages("remotes") # uncomment if needed
Expand All @@ -53,7 +53,7 @@ library(blocking)
library(reclin2)
```

Generate simple data with two groups (`df_example`) and reference data (`df_base`).
Generate simple data with three groups (`df_example`) and reference data (`df_base`).

```{r}
df_example <- data.frame(txt = c(
Expand All @@ -73,7 +73,7 @@ df_example
df_base
```

Deduplication using `blocking` function. Output contains information about:
Deduplication using the `blocking` function. The output contains information about:

+ the method used (here `nnd`, which refers to the NN descent algorithm),
+ number of blocks created (here 2 blocks),
Expand All @@ -86,17 +86,17 @@ blocking_result <- blocking(x = df_example$txt)
blocking_result
```

Table with blocking which contains:
Table with blocking results contains:

+ row numbers from the original data
+ row numbers from the original data,
+ block number (integers),
+ distance (from the ANN algorithm).

```{r}
blocking_result$result
```

Deduplication using the `pair_ann` function for integration with the `reclin2` package. Here I use the pipeline that can be used with the `reclin2` package.
Deduplication using the `pair_ann` function for integration with the `reclin2` package. The example below uses a standard `reclin2` pipeline.

```{r}
pair_ann(x = df_example, on = "txt") |>
Expand All @@ -105,7 +105,8 @@ pair_ann(x = df_example, on = "txt") |>
select_threshold("threshold", score = "score", threshold = 0.55) |>
link(selection = "threshold")
```
Record linkage using the same function where `df_base` is the "register" and `df_example` is the reference (query data).

Record linkage using the same function, where `df_base` is the "register" (reference data) and `df_example` is the query data.

```{r}
pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |>
Expand Down
32 changes: 15 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ coverage](https://codecov.io/gh/ncn-foreigners/blocking/branch/main/graph/badge.

## Description

An R package that aims to block records for data deduplication and
record linkage (a.k.a. entity resolution) based on [approximate nearest
neighbours algorithms
This R package is designed to block records for data deduplication and
record linkage (also known as entity resolution) using [approximate
nearest neighbours algorithms
(ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs
(via the `igraph` package).

Currently supports the following R packages that binds to specific ANN
It supports the following R packages that bind to specific ANN
algorithms:

- [rnndescent](https://cran.r-project.org/package=rnndescent) (default,
Expand All @@ -28,8 +28,8 @@ algorithms:
- [mlpack](https://cran.r-project.org/package=mlpack) (see
`mlpack::lsh` and `mlpack::knn`).

The package also supports integration with the
[reclin2](https://cran.r-project.org/package=reclin2) package via
The package can be used with the
[reclin2](https://cran.r-project.org/package=reclin2) package via the
`blocking::pair_ann` function.

## Funding
Expand All @@ -39,8 +39,7 @@ Work on this package is supported by the National Science Centre, OPUS

## Installation

You can install the development version of the `blocking` package from
GitHub with:
Install the development version of the `blocking` package from GitHub
with:

``` r
# install.packages("remotes") # uncomment if needed
Expand All @@ -57,7 +56,7 @@ library(reclin2)
#> Loading required package: data.table
```

Generate simple data with two groups (`df_example`) and reference data
Generate simple data with three groups (`df_example`) and reference data
(`df_base`).

``` r
Expand Down Expand Up @@ -91,8 +90,8 @@ df_base
#> 3 other
```

Deduplication using `blocking` function. Output contains information
about:
Deduplication using the `blocking` function. The output contains
information about:

- the method used (here `nnd`, which refers to the NN descent
algorithm),
Expand All @@ -117,9 +116,9 @@ blocking_result
#> 2
```

Table with blocking which contains:
Table with blocking results contains:

- row numbers from the original data
- row numbers from the original data,
- block number (integers),
- distance (from the ANN algorithm).

Expand All @@ -136,8 +135,7 @@ blocking_result$result
```

Deduplication using the `pair_ann` function for integration with the
`reclin2` package. Here I use the pipeline that can be used with the
`reclin2` package.
`reclin2` package. The example below uses a standard `reclin2` pipeline.

``` r
pair_ann(x = df_example, on = "txt") |>
Expand All @@ -158,8 +156,8 @@ pair_ann(x = df_example, on = "txt") |>
#> 6: 8 5 montypython monty
```

Record linkage using the same function where `df_base` is the “register”
and `df_example` is the reference (query data).
Record linkage using the same function, where `df_base` is the
“register” (reference data) and `df_example` is the query data.

``` r
pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |>
Expand Down
14 changes: 6 additions & 8 deletions inst/tinytest/test_blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,16 @@ expect_error(
expect_equal(
blocking(x = df_example$txt,
true_blocks = result$result[, c("x", "y", "block")])$metrics,
c(vi = 0, nmi = 1, split.join = 0, rand = 1, adjusted.rand = 1,
recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1
)
c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1)
)

# check if true_block is a vector

expect_silent(
blocking(x = df_example$txt,
#true_blocks = result$result$block)
true_blocks = result$result[, c("x", "y", "block")])
)
# expect_silent(
# blocking(x = df_example$txt,
# #true_blocks = result$result$block)
# true_blocks = result$result[, c("x", "y", "block")])
# )


## printing
Expand Down
2 changes: 1 addition & 1 deletion man/blocking.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/pair_ann.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vignettes/v1-deduplication.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ library(reclin2)
library(data.table)
```

Read the `RLdata500` data used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository.
Read the `RLdata500` data (originally from the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package) from the [dblink](https://github.com/cleanzr/dblink) GitHub repository.

```{r}
df <- read.csv("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv")
Expand Down
Loading

0 comments on commit 9f461d4

Please sign in to comment.