Skip to content

Commit

Permalink
fix for #16, #17 and #18
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed Apr 29, 2024
1 parent 86bf79f commit aaa22d9
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 23 deletions.
5 changes: 5 additions & 0 deletions .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/README.Rmd="CBB944CE"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/index-colnames.txt="0350B51E"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_annoy.R="4302FC18"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
2 changes: 1 addition & 1 deletion R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ blocking <- function(x,
if (!is.null(true_blocks)) {

## Graph metrics
eval_g1 <- igraph::graph_from_data_frame(eval_blocks[, c("x", "y")], directed = FALSE)
eval_g1 <- igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE)
eval_g2 <- igraph::graph_from_data_frame(true_blocks[, c("x", "y")], directed = FALSE)

eval_g1_cl <- igraph::make_clusters(eval_g1, membership = igraph::components(eval_g1, "weak")$membership)
Expand Down
8 changes: 3 additions & 5 deletions R/reclin2_pair_ann.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pair_ann <- function(x,
add_xy = TRUE,
...) {

stopifnot("Only one `on` is possible" = length(on) == 1)
stopifnot("Only one `on` is currently supported" = NROW(on) == 1)

if (!is.null(y)) deduplication <- FALSE

Expand All @@ -62,19 +62,17 @@ pair_ann <- function(x,
deduplication = deduplication,
...)

block_ann <- data.table::as.data.table(block_result$result)

x <- data.table::as.data.table(x)
y <- data.table::as.data.table(y)

a <- x[, ..on]
a[, `:=`(".x", .I)]
a <- a[unique(block_ann[,.(".x"=x, block)]), on = ".x"]
a <- a[unique(block_result$result[,.(".x"=x, block)]), on = ".x"]
a[, `:=`((on), NULL)]

b <- y[, `..on`]
b[, `:=`(".y", .I)]
b <- b[unique(block_ann[,.(".y"=y, block)]), on = ".y"]
b <- b[unique(block_result$result[,.(".y"=y, block)]), on = ".y"]
b[, `:=`((on), NULL)]

pairs <- merge(a, b,
Expand Down
2 changes: 1 addition & 1 deletion inst/tinytest/test_reclin2.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
source("test_data.R")

expect_silent(
pair_ann(x = df_example, on = "txt", ann = "hnsw")
pair_ann(x = df_example, on = "txt")
)

expect_equal(
Expand Down
48 changes: 32 additions & 16 deletions vignettes/v1-deduplication.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,24 @@ head(df)
In the next step we use the newly created column in the `blocking` function. If we specify verbose, we get information about the progress.

```{r}
df_blocks <- blocking(x = df$txt, ann = "nnd", verbose = 1, graph = TRUE)
df_blocks <- blocking(x = df$txt, ann = "nnd", verbose = 1, graph = TRUE, seed = 2024)
```

Results are as follows:

+ based in `rnndescent` we created . blocks,
+ it was based on 429 columns (2 character shingles),
+ we have 48 blocks of 2 elements, 34 blocks of 3 elements, ..., 1 block of 15 elements.
+ based on `rnndescent` we have created `r NROW(unique(df_blocks$result$block))` blocks,
+ it was based on `r NROW(unique(df_blocks$colnames))` columns (2 character shingles),
+ we have 93 blocks of 2 elements, 38 blocks of 3 elements, ..., 3 blocks of 6 elements.

```{r}
df_blocks
```

Structure of the object is as follows:

+ `result` - a data.table with identifiers and block IDs,
+ `method` - the method used,
+ `deduplication` -- whether deduplication was applied,
+ `metrics` - standard metrics and metrics based on the `igraph::compare` methods for comparing graphs (here NULL),
+ `colnames` - column names used for the comparison,
+ `graph` -- an `igraph` object mainly for visualisation.
Expand All @@ -90,37 +92,51 @@ plot(df_blocks$graph, vertex.size=1, vertex.label = NA)

The resulting data.table has three columns:

+ `x` - Reference dataset (i.e. `df`) -- this may not contain all units of `df`,
+ `y` - query (each row of `df`) -- this will return all units of `df`,
+ `block` -- the block ID.
+ `x` -- reference dataset (i.e. `df`) -- this may not contain all units of `df`,
+ `y` - query (each row of `df`) -- this may not contain all units of `df`,
+ `block` -- the block ID,
+ `dist` -- distance between objects.

```{r}
head(df_blocks$result)
```

Create a long `data.table` with information on blocks and units from the original dataset.

```{r}
df_block_melted <- melt(df_blocks$result, id.vars = c("block", "dist"))
df_block_melted_rec_block <- unique(df_block_melted[, .(rec_id=value, block)])
head(df_block_melted_rec_block)
```

We add block information to the final dataset.

```{r}
df_block_result <- copy(df_blocks$result[order(y),])
df[, block_id := df_block_result$block]
df[, block_dist := df_block_result$dist]
df[df_block_melted_rec_block, on = "rec_id", block_id := i.block]
head(df)
```

Finally, we can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks.
We can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks.

```{r}
df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
```
Compare distances for block that contains different (1) and the same (2) units.

We can visualise the distances between units stored in the `df_blocks$result` data set. Clearly we have a mixture of two groups: matches (close to 0) and non-matches (close to 1).

```{r}
boxplot(block_dist ~ id_count, data=df, xlab = "Block type", ylab = "Distances")
hist(df_blocks$result$dist, xlab = "Distances", ylab = "Frequency", breaks = "fd",
main = "Distances calculated between units")
```

Finally, we can visualise the result based on the information whether a block contains matches or not.

```{r}
plot(density(df[id_count==2]$block_dist), col = "blue", xlim = c(0, 0.8),
main = "Distribution of distances between\nclusters type (1=red, 2=blue)")
lines(density(df[id_count==1]$block_dist), col = "red", xlim = c(0, 0.8))
df_for_density <- copy(df_block_melted[block %in% df$block_id])
df_for_density[, match:= block %in% df[id_count == 2]$block_id]
plot(density(df_for_density[match==FALSE]$dist), col = "blue", xlim = c(0, 0.8),
main = "Distribution of distances between\nclusters type (match=red, non-match=blue)")
lines(density(df_for_density[match==TRUE]$dist), col = "red", xlim = c(0, 0.8))
```

0 comments on commit aaa22d9

Please sign in to comment.