You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
library(gutenbergr)
#> Warning: package 'gutenbergr' was built under R version 3.5.3
library(tidytext)
library(tidyverse)
#> -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 3.1.0 v purrr 0.3.1
#> v tibble 2.0.1 v dplyr 0.8.0.1
#> v tidyr 0.8.3 v stringr 1.3.1
#> v readr 1.3.1 v forcats 0.3.0
#> -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
library(widyr)
#> Warning: package 'widyr' was built under R version 3.5.3
## Word counts for "Treasure Island": find the work's Gutenberg id by exact
## title, download the text, split it into one word per row, count each word
## (most frequent first) and tag every row with the source label "T.I."
TI <- gutenberg_works(title == "Treasure Island") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "T.I.")
#> Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
#> Using mirror http://aleph.gutenberg.org
## Word counts for "The Wonderful Wizard of Oz", built the same way:
## id lookup by title -> download -> one word per row -> counts, labelled "Wiz"
Wi <- gutenberg_works(title == "The Wonderful Wizard of Oz") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "Wiz")
## Word counts for "The United States Constitution":
## id lookup by title -> download -> one word per row -> counts, labelled "Con"
Co <- gutenberg_works(title == "The United States Constitution") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "Con")
## Word counts for "John F. Kennedy's Inaugural Address":
## id lookup by title -> download -> one word per row -> counts, labelled "JFK"
JFK <- gutenberg_works(title == "John F. Kennedy's Inaugural Address") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "JFK")
## Combine: stack the four per-source word-count tables into one long table
df <- bind_rows(TI, Wi, Co, JFK)
## Do similarity: weight every word by tf-idf within its source, order rows by
## descending weight, then compare the sources pairwise on those weights.
## upper = FALSE reports each pair once; sort = TRUE puts the most similar first.
df %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE)
#> # A tibble: 6 x 3
#> item1 item2 similarity
#> <chr> <chr> <dbl>
#> 1 Wiz T.I. 0.349
#> 2 Con JFK 0.0513
#> 3 T.I. JFK 0.0483
#> 4 Con T.I. 0.0314
#> 5 Wiz JFK 0.0301
#> 6 Con Wiz 0.0155
## So far so good, but which source is most likely to say "I love you"?
## Build a pseudo-document labelled "TEST": the phrase repeated ten times,
## split into words and counted per source/word.
Love <- tibble(word = rep("I love you", times = 10), source = "TEST") %>%
  unnest_tokens(word, word) %>%
  count(source, word, sort = TRUE)
## With four sources it's possible: put TEST in front of the four-source
## table, recompute tf-idf and the pairwise similarities, then keep only the
## pairs whose first item is TEST and drop that now-constant column.
bind_rows(Love, df) %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
  filter(item1 == "TEST") %>%
  select(-item1)
#> # A tibble: 4 x 2
#> item2 similarity
#> <chr> <dbl>
#> 1 T.I. 0.0654
#> 2 Wiz 0.0526
#> 3 JFK 0.0267
#> 4 Con 0
## But with only two sources (plus TEST), it errors out:
## same pipeline as the four-source query, run on Treasure Island + Wizard only.
## NOTE(review): with three documents in total, any word present in all of
## them gets idf = ln(3/3) = 0. If "i", "love" and "you" each occur in both
## novels, every TEST weight is zero, which may be what breaks the similarity
## step -- confirm.
df2 <- bind_rows(TI, Wi)
bind_rows(Love, df2) %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
  filter(item1 == "TEST") %>%
  select(-item1)
#> Error in `colnames<-`(`*tmp*`, value = c("item1", "item2", "value")): attempt to set 'colnames' on an object with less than two dimensions
## How come?
The limit should probably be documented, no?
The text was updated successfully, but these errors were encountered:
A bit of a long-winded reprex, but:
The limit should probably be documented, no?
The text was updated successfully, but these errors were encountered: