diff --git a/src/LM/api.jl b/src/LM/api.jl
index 55a542c7..8a0ef80d 100644
--- a/src/LM/api.jl
+++ b/src/LM/api.jl
@@ -1,23 +1,34 @@
 #TO DO
 # Doc string
-function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
-    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+"""
+$(TYPEDSIGNATURES)
+"""
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
+    score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
 end

-function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+"""
+$(TYPEDSIGNATURES)
+"""
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
     log2(maskedscore(m, temp_lm, word, context))
 end

-function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
-    local log_set=Float64[]
+"""
+$(TYPEDSIGNATURES)
+"""
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    log_sum::Float64 = 0.0
     for ngram in text_ngram
         ngram = split(ngram)
-        push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
-        #println(logscore(m,lm,ngram[end],ngram[1:end-1]))
+        log_sum += logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
     end
-    return(sum(log_set)/length(log_set))
+    # cross-entropy is the negative mean log2 probability
+    return -log_sum / length(text_ngram)
 end

-function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
-    return(2^(entropy(m, lm, text_ngram)))
+"""
+$(TYPEDSIGNATURES)
+"""
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    return 2^(entropy(m, lm, text_ngram))
 end
diff --git a/src/LM/counter.jl b/src/LM/counter.jl
index 2ae548fd..f6843340 100644
--- a/src/LM/counter.jl
+++ b/src/LM/counter.jl
@@ -1,17 +1,18 @@
 using DataStructures

 """
-    counter is used to make conditional distribution, which is used by score functions to
-    calculate conditional frequency distribution
+$(TYPEDSIGNATURES)
+
+Build the conditional frequency distribution that the score functions use to
+compute conditional probabilities.
 """
 function counter2(data, min::Integer, max::Integer)
     data = everygram(data, min_len=min, max_len=max)
     data = split.(data)
-    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
-    for i in 1:length(data)
-        history,word = data[i][1:end-1], data[i][end]
+    temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
+    for i in eachindex(data)
+        history, word = data[i][begin:end-1], data[i][end]
         temp_lm[join(history, " ")][word] += 1
     end
     return temp_lm
 end
-
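For reviewers, a quick smoke test of the retyped scoring API. This is a sketch, not package code: it assumes the `MLE` model and the LM names (`maskedscore`, `perplexity`) are exported as in the package docs (qualify with `TextAnalysis.` otherwise), and the tiny corpus is illustrative only:

```julia
using TextAnalysis

# Toy vocabulary and training tokens.
voc   = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "khan"]
train = ["khan", "is", "my", "good", "friend"]

model = MLE(voc)           # maximum-likelihood Langmodel
fit   = model(train, 2, 2) # min_len = max_len = 2: bigram counts via counter2

# maskedscore maps out-of-vocabulary words to the unk label before scoring;
# entropy is the negative mean log2 score over the test n-grams, and
# perplexity is 2^entropy.
maskedscore(model, fit, "is", "khan")
perplexity(model, fit, ["khan is", "is my"])
```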
diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl
index e240e53f..d831911d 100644
--- a/src/LM/preprocessing.jl
+++ b/src/LM/preprocessing.jl
@@ -21,8 +21,8 @@ julia> a = everygram(seq,min_len=1, max_len=-1)

 ```
 """
-function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
-    ngram = []
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
+    ngram = String[]
     if max_len == -1
         max_len = length(seq)
     end
@@ -30,7 +30,7 @@
         temp = ngramizenew(seq, n)
         ngram = append!(ngram, temp)
     end
-    return(ngram)
+    return ngram
 end

 """
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
  "5 </s>"
 ```
 """
-function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
+function padding_ngram(
+    word::Vector{T}, n=1;
+    pad_left=false, pad_right=false,
+    left_pad_symbol="<s>", right_pad_symbol="</s>"
+) where {T<:AbstractString}
     local seq
     seq = word
-    if pad_left == true
-        prepend!(seq, [left_pad_symbol])
-    end
-    if pad_right == true
-        push!(seq, right_pad_symbol)
-    end
-    return ngramizenew(seq, n)
+
+    pad_left && prepend!(seq, [left_pad_symbol])
+    pad_right && push!(seq, right_pad_symbol)
+
+    return ngramizenew(seq, n)
 end

 """
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
 "To not"
 ```
 """
-function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
     n_words = length(words)
-    tokens = []
-
-    for n in nlist
-        for index in 1:(n_words - n + 1)
-            token = join(words[index:(index + n - 1)], " ")
-            push!(tokens,token)
-        end
+    tokens = String[]
+
+    for n in nlist,
+        index in 1:(n_words-n+1)
+
+        token = join(words[index:(index+n-1)], " ")
+        push!(tokens, token)
     end
     return tokens
 end
diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl
index 949a469d..ad8f94f6 100644
--- a/src/LM/vocab.jl
+++ b/src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]

 ```
 """
 mutable struct Vocabulary
-vocab::Dict{String, Int64}
-unk_cutoff::Int
-unk_label::String
-allword::Array{String, 1}
+    vocab::Dict{String,Int64}
+    unk_cutoff::Int
+    unk_label::String
+    allword::Vector{String}
 end

-function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+"""
+$(TYPEDSIGNATURES)
+"""
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
     if unk_label in word
         error("unk_label is in vocab")
     else
-        word= push!(word, unk_label)
+        word = push!(word, unk_label)
     end
     vocab = countmap(word)
     for value in vocab
-        if value[2]<unk_cutoff && value[1] != unk_label
+        if value[2] < unk_cutoff && value[1] != unk_label
             delete!(vocab, value[1])
         end
     end
     Vocabulary(vocab, unk_cutoff, unk_label, word)
 end
diff --git a/src/bayes.jl b/src/bayes.jl
--- a/src/bayes.jl
+++ b/src/bayes.jl
@@ -7,7 +7,9 @@ simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))
 """
+$(TYPEDSIGNATURES)
+
 Create a dict that maps elements in input array to their frequencies.
 """
-function frequencies(xs)
+function frequencies(xs::AbstractVector{T})::Dict{T,Int} where {T<:Any}
     frequencies = Dict{eltype(xs),Int}()
     for x in xs
         frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@
 end

 """
-    features(::AbstractDict, dict)
+$(TYPEDSIGNATURES)

 Compute an Array, mapping the value corresponding to elements of `dict` to the input `AbstractDict`.
 """
-function features(fs::AbstractDict, dict)
-    bag = zeros(Int, size(dict))
-    for i = 1:length(dict)
+function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
+    bag = Vector{Int}(undef, size(dict))
+    for i = eachindex(dict)
         bag[i] = get(fs, dict[i], 0)
     end
     return bag
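A REPL pass over the touched LM preprocessing helpers and the bayes.jl utilities; expected values are taken from the existing docstrings. This assumes the LM helpers are exported (qualify with `TextAnalysis.` otherwise); `frequencies` and `features` are internal, so they are module-qualified here:

```julia
using TextAnalysis

seq = ["To", "be", "or", "not"]
everygram(seq, min_len=1, max_len=-1)
# 10 n-grams, from "To" up to "To be or not"

# Note: padding_ngram mutates its argument via prepend!/push!.
example = ["1", "2", "3", "4", "5"]
padding_ngram(example, 2, pad_left=true, pad_right=true)
# ["<s> 1", "1 2", "2 3", "3 4", "4 5", "5 </s>"]

fs = TextAnalysis.frequencies(["a", "b", "b"])  # Dict("b" => 2, "a" => 1)
TextAnalysis.features(fs, ["a", "b", "c"])      # [1, 2, 0]
```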
diff --git a/src/corpus.jl b/src/corpus.jl
index b4a31a10..df8b5f79 100644
--- a/src/corpus.jl
+++ b/src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
         ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
         ngram_arr = convert(Array{String,1}, ngram_arr)
         for ngram in ngram_arr
-            if haskey(idx, ngram)
-                push!(idx[ngram], i)
-            else
-                idx[ngram] = [i]
-            end
+            entries = get!(() -> [], idx, ngram)
+            push!(entries, i)
         end
     end
     for key in keys(idx)
diff --git a/src/dtm.jl b/src/dtm.jl
index 4bfa3b3d..fcbf4eda 100644
--- a/src/dtm.jl
+++ b/src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)'

 #'
 function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
     ngs = ngrams(d)
-    indices = Array{Int}(undef, 0)
-    values = Array{Int}(undef, 0)
-    terms = sort(collect(keys(lex)))
+    indices = Int[]
+    values = Int[]
+    terms = sort!(collect(keys(lex)))
     column_indices = columnindices(terms)

     for ngram in keys(ngs)
-        if haskey(column_indices, ngram)
-            push!(indices, column_indices[ngram])
+        column = get(column_indices, ngram, nothing)
+        if !isnothing(column)
+            push!(indices, column)
             push!(values, ngs[ngram])
         end
     end
diff --git a/src/lda.jl b/src/lda.jl
index 342b9645..cbdd55f1 100644
--- a/src/lda.jl
+++ b/src/lda.jl
@@ -80,7 +80,8 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int,
             for target_topicid in 1:ntopics
                 topicprob = (doc.topicidcount[target_topicid] + beta) / (document_lenth + beta * ntopics)
-                wordprob = (get(topics[target_topicid].wordcount, word, 0)+ alpha) / (topics[target_topicid].count + alpha * number_of_words)
+                topic = topics[target_topicid]
+                wordprob = (get(topic.wordcount, word, 0) + alpha) / (topic.count + alpha * number_of_words)
                 probs[target_topicid] = topicprob * wordprob
             end
             normalize_probs = sum(probs)
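The recurring change in corpus.jl and dtm.jl (and in the preprocessing and stemmer hunks below) swaps a `haskey` branch for a single `get!`/`get` lookup. A standalone toy, not package code, showing the two forms are equivalent:

```julia
# Toy dictionary standing in for the inverse index in corpus.jl.
idx = Dict{String, Vector{Int}}()

# Before: two hash lookups on the hit path.
# if haskey(idx, "term")
#     push!(idx["term"], 1)
# else
#     idx["term"] = [1]
# end

# After: a single lookup; the closure builds the default on a miss.
push!(get!(() -> Int[], idx, "term"), 1)
push!(get!(() -> Int[], idx, "term"), 2)
@assert idx["term"] == [1, 2]
```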
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index e71051ad..e9dd5653 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -69,11 +69,8 @@ function remove_corrupt_utf8!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_corrupt_utf8(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -130,11 +127,8 @@ function remove_case!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_case(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -474,24 +468,22 @@ function remove_patterns(s::AbstractString, rex::Regex)
     return replace(s, rex => "")
 end

-function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
+function remove_patterns(s::SubString{T}, rex::Regex) where {T<:String}
     iob = IOBuffer()
-    ioffset = s.offset
-    data = codeunits(s.string)
     ibegin = 1
     for m in eachmatch(rex, s)
-        len = m.match.offset-ibegin
-        next = nextind(s, lastindex(m.match)+m.match.offset)
+        len = m.match.offset - ibegin
+        next = nextind(s, lastindex(m.match) + m.match.offset)
         if len > 0
-            write(iob, SubString(s, ibegin, ibegin+len))
-            if next != length(s)+1
-                write(iob, ' ')
-            end
+            write(iob, SubString(s, ibegin, ibegin + len))
+            if next != length(s) + 1
+                write(iob, ' ')
+            end
         end
         ibegin = next
     end
-    len = lastindex(s) - ibegin
-    (len > 0) && write(iob, SubString(s, ibegin, ibegin+len))
+    len = lastindex(s) - ibegin
+    (len > 0) && write(iob, SubString(s, ibegin, ibegin + len))
     String(take!(iob))
 end
@@ -519,11 +511,8 @@ function remove_patterns!(d::NGramDocument, rex::Regex)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_patterns(token, rex)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
     nothing
diff --git a/src/stemmer.jl b/src/stemmer.jl
index bf18b367..e402a912 100644
--- a/src/stemmer.jl
+++ b/src/stemmer.jl
@@ -37,11 +37,8 @@ function stem!(stemmer::Stemmer, d::NGramDocument)
     for token in keys(d.ngrams)
         new_token = stem(stemmer, token)
         if new_token != token
-            if haskey(d.ngrams, new_token)
-                d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
-            else
-                d.ngrams[new_token] = d.ngrams[token]
-            end
+            count = get(d.ngrams, new_token, 0)
+            d.ngrams[new_token] = count + d.ngrams[token]
             delete!(d.ngrams, token)
         end
     end
diff --git a/src/tf_idf.jl b/src/tf_idf.jl
index 26666136..202cb871 100644
--- a/src/tf_idf.jl
+++ b/src/tf_idf.jl
@@ -113,10 +113,10 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
     idf = log.(n ./ documents_containing_term)

     # TF-IDF is the product of TF and IDF
-    for i in 1:n
-        for j in 1:p
-            tfidf[i, j] = tfidf[i, j] * idf[j]
-        end
+    for i in 1:n,
+        j in 1:p
+
+        tfidf[i, j] *= idf[j]
     end

     return tfidf
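Finally, a minimal check of the rewritten `tf_idf!` loop through the public wrapper — a sketch assuming the usual `Corpus` → `DocumentTermMatrix` flow from the package docs:

```julia
using TextAnalysis

crps = Corpus([StringDocument("to be or not to be"),
               StringDocument("to become or not to become")])
update_lexicon!(crps)
m = DocumentTermMatrix(crps)

# tf_idf allocates a floating-point result and delegates to tf_idf!, whose
# loop now scales entry (i, j) of the TF matrix by
# idf[j] = log(n_docs / n_docs_containing_term_j).
tfidf = tf_idf(m)
```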