You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
This function is an adaptation for RAKE function. In difference to the original function the output is an annotation table with corrected IDs and merged fields (in case of other fields).
best,
Ernest
## function that creates keywords and adapts annotation table and overwrites old information
fun.kw.short <- function(at,UPOS=c("NOUN", "NOUN"),LRF=50,LR=1.5) {
# Calculate the Rake. Keep the full information of the annotation table. This is based on udpipe::keyword_rake()
library(udpipe)
library(stringr)
## create one common ID on the level of sentences.
at[,group:=unique_identifier(at, fields = c("doc_id","paragraph_id","sentence_id"))]
## copy lemma to 'word' to merge lemma to keywords.
at[,word:=lemma]
# Rake
## code relevant terms
at[,.relevant:=upos %in% UPOS]
## Add Keyword_id
at[,keyword_id:=data.table::rleid(group,.relevant)]
## Calculate degree of each keyword (number of words minues 1).
at[.relevant != FALSE,degree:=.N-1L,keyword_id]
## create keyowrds
at[.relevant != FALSE,keyword:=paste(word, collapse = " "),keyword_id]
## Calculate degree of each word (sum of degrees of a word).
at[.relevant != FALSE,word_degree:=sum(degree),word]
## Calculate frequency of a word
at[.relevant != FALSE,word_freq:=.N,word]
## Calculate rake of a word
at[.relevant != FALSE,word_rake:=word_degree/word_freq]
# Keywords
## Number of times the keyword is used.
at[.relevant != FALSE, keyword_freq:=length(unique(keyword_id)), .(keyword, word)]
## Ngram
at[.relevant != FALSE, keyword_ngram:=length(unique(word)), .(keyword)]
## Rake score of keywords (sum of rake of the words).
at[.relevant != FALSE, keyword_rake:=sum(word_rake),keyword_id]
##replace lemma and add ngram of lemma
at[,lemma:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword,lemma)]
at[,lemma_ng:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword_ngram,1)]
# Remove added variables
at[,(c("keyword","keyword_ngram","keyword_freq","keyword_rake",".relevant","word_freq","word_rake","word_degree","degree","keyword_id","group","word")):=NULL]
# merge head_token_ids variables
## create new token_id
setorder(at,doc_id,paragraph_id,sentence_id,token_id)
at[,token_id2:=data.table::rleid(lemma),.(doc_id,paragraph_id,sentence_id)]
## recode id cols
id_cols <- c("doc_id","paragraph_id","sentence_id","token_id","head_token_id","token_id2")
fun.ext.int <- function (x) {x+10^max(nchar(x))}
at[,(id_cols):=lapply(.SD,fun.ext.int),.SDcols=id_cols]
at[,(id_cols):=lapply(.SD,as.numeric),.SDcols=id_cols]
## create new id cols
at[,id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id))]
at[,id2:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id2))]
at[,head_id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,head_token_id))]
# create new head ids
at[,head_id2:=at[,.(id1,id2)][,unique(.SD)][match(at$head_id1,id1),id2]]
## add dependency relation to root
at[dep_rel=="root",head_id2:=head_id1]
## create new head_token_id
at[,head_token_id2:=head_id2 %% 10000]
at[,head_token_id2:=ifelse(head_token_id2==token_id2,NA,head_token_id2)]
at[,c("token_id","head_id1","head_id2","id1","id2","head_token_id"):=NULL]
setnames(at,gsub("id2","id",names(at)))
# merge entries in upos, xpos, feats, dep_rel, token and misc.
cols <- c("doc_id","paragraph_id","sentence_id","token_id")
cols1 <- c("upos","xpos","feats","dep_rel","token","misc")
at[,deps:=NULL] ##column not needed, when used, as.character might be needed
#at[,(cols1):=lapply(.SD,as.character),.SDcols=cols1]
my.fun <- function(x) paste0(x[!is.na(x)&!duplicated(x)],collapse="_")
at[,(cols1):=lapply(.SD,my.fun),by=cols,.SDcols=cols1]
# export
at[,unique(.SD)]
}
The text was updated successfully, but these errors were encountered:
Hi,
This function is an adaptation for RAKE function. In difference to the original function the output is an annotation table with corrected IDs and merged fields (in case of other fields).
best,
Ernest
The text was updated successfully, but these errors were encountered: