
Create keywords using RAKE, adapt token and headtoken ids, and merge remaining fields. #102

Open
ernestaigner opened this issue Mar 14, 2022 · 1 comment


@ernestaigner

Hi,

This function is an adaptation of `udpipe::keyword_rake()`. Unlike the original, its output is an annotation table with corrected token and head-token IDs, and with the remaining fields merged (where a keyword spans several tokens).

best,
Ernest
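The core grouping trick the function relies on is `data.table::rleid()`: consecutive relevant tokens get the same run id, so each run forms one keyword candidate. A minimal sketch (toy data, not from the function itself):

```r
library(data.table)

dt <- data.table(word     = c("deep","learning","is","neural","network","fun"),
                 relevant = c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE))
## consecutive TRUE tokens share a run id
dt[, keyword_id := rleid(relevant)]
## collapse each relevant run into one keyword
dt[relevant == TRUE, keyword := paste(word, collapse = " "), by = keyword_id]
## "deep learning" and "neural network" each form one keyword
```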

## function that creates keywords, adapts the annotation table, and overwrites the old information
fun.kw.short <- function(at,UPOS=c("NOUN", "NOUN"),LRF=50,LR=1.5) {
  # Calculate RAKE while keeping the full information of the annotation table. Based on udpipe::keyword_rake()
  library(udpipe)
  library(stringr)
  library(data.table)  # needed for :=, rleid(), setorder(), setnames()
  ##  create one common ID on the level of sentences.
  at[,group:=unique_identifier(at, fields = c("doc_id","paragraph_id","sentence_id"))]
  ## copy lemma to 'word' to merge lemma to keywords. 
  at[,word:=lemma]
  
  # Rake 
  ## code relevant terms
  at[,.relevant:=upos %in% UPOS]
  ## Add Keyword_id
  at[,keyword_id:=data.table::rleid(group,.relevant)]
  ## Calculate degree of each keyword (number of words minus 1).
  at[.relevant != FALSE,degree:=.N-1L,keyword_id]
  ## create keywords
  at[.relevant != FALSE,keyword:=paste(word, collapse = " "),keyword_id]
  ## Calculate degree of each word (sum of degrees of a word).
  at[.relevant != FALSE,word_degree:=sum(degree),word]
  ## Calculate frequency of a word
  at[.relevant != FALSE,word_freq:=.N,word]
  ## Calculate rake of a word
  at[.relevant != FALSE,word_rake:=word_degree/word_freq]
  # Keywords
  ## Number of times the keyword is used. 
  at[.relevant != FALSE, keyword_freq:=length(unique(keyword_id)), .(keyword, word)]
  ## Ngram
  at[.relevant != FALSE, keyword_ngram:=length(unique(word)), .(keyword)]
  ## Rake score of keywords (sum of rake of the words).
  at[.relevant != FALSE, keyword_rake:=sum(word_rake),keyword_id]
  
  ##replace lemma and add ngram of lemma
  at[,lemma:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword,lemma)]
  at[,lemma_ng:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword_ngram,1)]
  
  # Remove added variables
  at[,(c("keyword","keyword_ngram","keyword_freq","keyword_rake",".relevant","word_freq","word_rake","word_degree","degree","keyword_id","group","word")):=NULL]
  
  # merge head_token_ids variables
  ## create new token_id
  setorder(at,doc_id,paragraph_id,sentence_id,token_id)
  at[,token_id2:=data.table::rleid(lemma),.(doc_id,paragraph_id,sentence_id)]
  ## recode id cols: convert to numeric first, then left-pad so all ids have equal width
  id_cols <- c("doc_id","paragraph_id","sentence_id","token_id","head_token_id","token_id2")
  fun.ext.int <- function (x) {x+10^max(nchar(x))}
  at[,(id_cols):=lapply(.SD,as.numeric),.SDcols=id_cols]
  at[,(id_cols):=lapply(.SD,fun.ext.int),.SDcols=id_cols]
  ## create new id cols
  at[,id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id))]
  at[,id2:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id2))]
  at[,head_id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,head_token_id))]
  
  # create new head ids
  at[,head_id2:=at[,.(id1,id2)][,unique(.SD)][match(at$head_id1,id1),id2]]
  ## add dependency relation to root
  at[dep_rel=="root",head_id2:=head_id1]
  ## create new head_token_id
  at[,head_token_id2:=head_id2 %% 10000]
  at[,head_token_id2:=ifelse(head_token_id2==token_id2,NA,head_token_id2)]
  at[,c("token_id","head_id1","head_id2","id1","id2","head_token_id"):=NULL]
  setnames(at,gsub("id2","id",names(at)))
 
  # merge entries in upos, xpos, feats, dep_rel, token and misc. 
  cols <- c("doc_id","paragraph_id","sentence_id","token_id")
  cols1 <- c("upos","xpos","feats","dep_rel","token","misc")
  at[,deps:=NULL] ## column not needed; if kept, as.character() might be required
  #at[,(cols1):=lapply(.SD,as.character),.SDcols=cols1]
  my.fun <- function(x) paste0(x[!is.na(x)&!duplicated(x)],collapse="_")
  at[,(cols1):=lapply(.SD,my.fun),by=cols,.SDcols=cols1]
  
  # export
  at[,unique(.SD)]  
  
}
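A hypothetical usage sketch (not tested on a real corpus): annotate a document with udpipe, coerce the result to a data.table, and pass it to the function. Note that the ID arithmetic inside the function coerces `doc_id` with `as.numeric()`, so document names should be numeric-coercible (e.g. `"1"` rather than `"doc1"`). The thresholds `LRF` and `LR` are lowered here for a tiny example; the defaults (`LRF = 50`) suit large corpora.

```r
library(udpipe)
library(data.table)

## download and load an English model (assumption: default udpipe English model)
model <- udpipe_download_model(language = "english")
ud    <- udpipe_load_model(model$file_model)

## doc_id must be numeric-coercible for the ID recoding step
txt <- c("1" = "Natural language processing extracts candidate keywords from raw text.")
at  <- as.data.table(udpipe_annotate(ud, x = txt))

## low thresholds so even this one-sentence example yields keywords
res <- fun.kw.short(at, UPOS = c("NOUN", "ADJ"), LRF = 1, LR = 1)
head(res)
```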
@jwijffels
Contributor

Thank you. Could you provide an example of how to use the function?
