fix for #16, #17 and #18

ncn-foreigners · Apr 29, 2024 · aaa22d9 · aaa22d9
1 parent 86bf79f
commit aaa22d9
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 23 deletions.
diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths
@@ -1,11 +1,16 @@
 /Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/README.Rmd="CBB944CE"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/index-colnames.txt="0350B51E"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_annoy.R="4302FC18"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07"
 /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
+/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
diff --git a/R/blocking.R b/R/blocking.R
@@ -272,7 +272,7 @@ blocking <- function(x,
   if (!is.null(true_blocks)) {
 
     ## Graph metrics
-    eval_g1 <- igraph::graph_from_data_frame(eval_blocks[, c("x", "y")], directed = FALSE)
+    eval_g1 <- igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE)
     eval_g2 <- igraph::graph_from_data_frame(true_blocks[, c("x", "y")], directed = FALSE)
 
     eval_g1_cl <- igraph::make_clusters(eval_g1, membership = igraph::components(eval_g1, "weak")$membership)

diff --git a/R/reclin2_pair_ann.R b/R/reclin2_pair_ann.R
@@ -51,7 +51,7 @@ pair_ann <- function(x,
                      add_xy = TRUE,
                      ...) {
 
-  stopifnot("Only one `on` is possible" = length(on) == 1)
+  stopifnot("Only one `on` is currently supported" = NROW(on) == 1)
 
   if (!is.null(y)) deduplication <- FALSE
 
@@ -62,19 +62,17 @@ pair_ann <- function(x,
                                       deduplication = deduplication,
                                       ...)
 
-  block_ann <- data.table::as.data.table(block_result$result)
-
   x <- data.table::as.data.table(x)
   y <- data.table::as.data.table(y)
 
   a <- x[, ..on]
   a[, `:=`(".x", .I)]
-  a <- a[unique(block_ann[,.(".x"=x, block)]), on = ".x"]
+  a <- a[unique(block_result$result[,.(".x"=x, block)]), on = ".x"]
   a[, `:=`((on), NULL)]
 
   b <- y[, `..on`]
   b[, `:=`(".y", .I)]
-  b <- b[unique(block_ann[,.(".y"=y, block)]), on = ".y"]
+  b <- b[unique(block_result$result[,.(".y"=y, block)]), on = ".y"]
   b[, `:=`((on), NULL)]
 
   pairs <- merge(a, b,

diff --git a/inst/tinytest/test_reclin2.R b/inst/tinytest/test_reclin2.R
@@ -1,7 +1,7 @@
 source("test_data.R")
 
 expect_silent(
-  pair_ann(x = df_example, on = "txt", ann = "hnsw")
+  pair_ann(x = df_example, on = "txt")
 )
 
 expect_equal(

diff --git a/vignettes/v1-deduplication.Rmd b/vignettes/v1-deduplication.Rmd
@@ -59,22 +59,24 @@ head(df)
 In the next step we use the newly created column in the `blocking` function. If we specify verbose, we get information about the progress.
 
 ```{r}
-df_blocks <- blocking(x = df$txt, ann = "nnd", verbose = 1, graph = TRUE)
+df_blocks <- blocking(x = df$txt, ann = "nnd", verbose = 1, graph = TRUE, seed = 2024)
 ```
 
 Results are as follows:
 
-+ based in `rnndescent` we created . blocks,
-+ it was based on 429 columns (2 character shingles),
-+ we have 48 blocks of 2 elements, 34 blocks of 3 elements, ..., 1 block of 15 elements.
++ based on `rnndescent` we have created `r NROW(unique(df_blocks$result$block))` blocks,
++ it was based on `r NROW(unique(df_blocks$colnames))` columns (2 character shingles),
++ we have 93 blocks of 2 elements, 38 blocks of 3 elements, ..., 3 blocks of 6 elements.
 
 ```{r}
 df_blocks
 ```
+
 Structure of the object is as follows:
 
 + `result` - a data.table with identifiers and block IDs,
 + `method` - the method used,
++ `deduplication` -- whether deduplication was applied,
 + `metrics` - standard metrics and metrics based on the `igraph::compare` methods for comparing graphs (here NULL),
 + `colnames` - column names used for the comparison,
 + `graph` -- an `igraph` object mainly for visualisation.
@@ -90,37 +92,51 @@ plot(df_blocks$graph, vertex.size=1, vertex.label = NA)
 
 The resulting data.table has three columns:
 
-+ `x` - Reference dataset (i.e. `df`) -- this may not contain all units of `df`,
-+ `y` - query (each row of `df`) -- this will return all units of `df`,
-+ `block` -- the block ID.
++ `x` -- reference dataset (i.e. `df`) -- this may not contain all units of `df`,
++ `y` - query (each row of `df`) -- this may not contain all units of `df`,
++ `block` -- the block ID,
++ `dist` -- distance between objects.
 
 ```{r}
 head(df_blocks$result)
 ```
 
+Create a long `data.table` with information on blocks and units from the original dataset.
+
+```{r}
+df_block_melted <- melt(df_blocks$result, id.vars = c("block", "dist"))
+df_block_melted_rec_block <- unique(df_block_melted[, .(rec_id=value, block)])
+head(df_block_melted_rec_block)
+```
+
 We add block information to the final dataset.
 
 ```{r}
-df_block_result <- copy(df_blocks$result[order(y),])
-df[, block_id := df_block_result$block]
-df[, block_dist := df_block_result$dist]
+df[df_block_melted_rec_block, on = "rec_id", block_id := i.block]
 head(df)
 ```
 
-Finally, we can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks. 
+We can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks. 
 
 ```{r}
 df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
 ```
-Compare distances for block that contains different (1) and the same (2) units. 
+
+We can visualise the distances between units stored in the `df_blocks$result` data set. Clearly we have a mixture of two groups: matches (close to 0) and non-matches (close to 1). 
 
 ```{r}
-boxplot(block_dist ~ id_count, data=df, xlab = "Block type", ylab = "Distances")
+hist(df_blocks$result$dist, xlab = "Distances", ylab = "Frequency", breaks = "fd",
+     main = "Distances calculated between units")
 ```
 
+Finally, we can visualise the result based on whether a block contains matches or not.
+
 ```{r}
-plot(density(df[id_count==2]$block_dist), col = "blue", xlim = c(0, 0.8), 
-     main = "Distribution of distances between\nclusters type (1=red, 2=blue)")
-lines(density(df[id_count==1]$block_dist), col = "red", xlim = c(0, 0.8))
+df_for_density <- copy(df_block_melted[block %in% df$block_id])
+df_for_density[, match:= block %in% df[id_count == 2]$block_id]
+
+plot(density(df_for_density[match==FALSE]$dist), col = "blue", xlim = c(0, 0.8), 
+     main = "Distribution of distances between\nclusters type (match=red, non-match=blue)")
+lines(density(df_for_density[match==TRUE]$dist), col = "red", xlim = c(0, 0.8))
 ```