diff --git a/src/LM/api.jl b/src/LM/api.jl
index 55a542c7..8a0ef80d 100644
--- a/src/LM/api.jl
+++ b/src/LM/api.jl
@@ -1,23 +1,34 @@
-#TO DO
-# Doc string
-function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
- score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+"""
+$(TYPEDSIGNATURES)
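+
+Probability of `word` given `context`, with out-of-vocabulary tokens mapped to
+the vocabulary's `unk_label` via `lookup` before scoring.
+
+# Example
+
+A sketch of typical usage, assuming the module's `MLE` model and a bigram fit
+`model(train, 2, 2)`; the printed values depend on this exact setup:
+
+```julia-repl
+julia> voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"];
+
+julia> train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"];
+
+julia> model = MLE(voc);
+
+julia> fit = model(train, 2, 2);
+
+julia> maskedscore(model, fit, "is", "alien")  # "alien" is out of vocabulary
+0.3333333333333333
+```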
+"""
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
+ score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
end
-function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+"""
+$(TYPEDSIGNATURES)
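+
+`log2` of [`maskedscore`](@ref): the log-probability of `word` given `context`.
+
+# Example
+
+With `model` and `fit` as in the [`maskedscore`](@ref) sketch:
+
+```julia-repl
+julia> logscore(model, fit, "my", "is")  # P("my" | "is") == 1, so log2 gives 0
+0.0
+```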
+"""
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
log2(maskedscore(m, temp_lm, word, context))
end
-function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
- local log_set=Float64[]
+"""
+$(TYPEDSIGNATURES)
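+
+Cross-entropy of the model over `text_ngram`: the negated mean [`logscore`](@ref)
+of each ngram's last word given the words before it. Each element of
+`text_ngram` is a space-joined ngram string.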
+"""
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+ log_sum = 0.0
for ngram in text_ngram
ngram = split(ngram)
- push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
- #println(logscore(m,lm,ngram[end],ngram[1:end-1]))
+ log_sum += logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
end
- return(sum(log_set)/length(log_set))
+ # cross-entropy is the negated mean log2 score
+ return -log_sum / length(text_ngram)
end
-function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
- return(2^(entropy(m, lm, text_ngram)))
+"""
+$(TYPEDSIGNATURES)
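+
+Perplexity of the model over `text_ngram`, i.e. `2^entropy(m, lm, text_ngram)`.
+
+# Example
+
+With `model` and `fit` as in the [`maskedscore`](@ref) sketch (both bigrams
+below have probability 1, so perplexity is minimal):
+
+```julia-repl
+julia> perplexity(model, fit, ["khan is", "is my"])
+1.0
+```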
+"""
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+ return 2^(entropy(m, lm, text_ngram))
end
diff --git a/src/LM/counter.jl b/src/LM/counter.jl
index 2ae548fd..f6843340 100644
--- a/src/LM/counter.jl
+++ b/src/LM/counter.jl
@@ -1,17 +1,18 @@
using DataStructures
"""
- counter is used to make conditional distribution, which is used by score functions to
- calculate conditional frequency distribution
+$(TYPEDSIGNATURES)
+
+`counter2` builds the conditional frequency distribution that the score
+functions consume: it maps every ngram history to an `Accumulator` of the
+counts of the words that follow it.
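+
+# Example
+
+A minimal sketch on a toy token sequence; the history "to" is followed by
+"be" twice:
+
+```julia-repl
+julia> counter2(["to", "be", "or", "not", "to", "be"], 2, 2)["to"]["be"]
+2
+```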
"""
function counter2(data, min::Integer, max::Integer)
data = everygram(data, min_len=min, max_len=max)
data = split.(data)
- temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
- for i in 1:length(data)
- history,word = data[i][1:end-1], data[i][end]
+ temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
+ for i in eachindex(data)
+ history, word = data[i][begin:end-1], data[i][end]
temp_lm[join(history, " ")][word] += 1
end
return temp_lm
end
-
diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl
index e240e53f..d831911d 100644
--- a/src/LM/preprocessing.jl
+++ b/src/LM/preprocessing.jl
@@ -21,8 +21,8 @@ julia> a = everygram(seq,min_len=1, max_len=-1)
```
"""
-function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
- ngram = []
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
+ ngram = String[]
if max_len == -1
max_len = length(seq)
end
@@ -30,7 +30,7 @@ function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
temp = ngramizenew(seq, n)
ngram = append!(ngram, temp)
end
- return(ngram)
+ return ngram
end
"""
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
"5 "
```
"""
-function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
+function padding_ngram(
+ word::Vector{T}, n=1;
+ pad_left=false, pad_right=false,
+ left_pad_symbol="<s>", right_pad_symbol="</s>"
+) where {T<:AbstractString}
local seq
seq = word
- if pad_left == true
- prepend!(seq, [left_pad_symbol])
- end
- if pad_right == true
- push!(seq, right_pad_symbol)
- end
- return ngramizenew(seq, n)
+
+ pad_left && prepend!(seq, [left_pad_symbol])
+ pad_right && push!(seq, right_pad_symbol)
+
+ return ngramizenew(seq, n)
end
"""
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
"To not"
```
"""
-function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
n_words = length(words)
- tokens = []
- for n in nlist
- for index in 1:(n_words - n + 1)
- token = join(words[index:(index + n - 1)], " ")
- push!(tokens,token)
- end
+ tokens = String[]
+
+ for n in nlist, index in 1:(n_words-n+1)
+ token = join(words[index:(index+n-1)], " ")
+ push!(tokens, token)
end
return tokens
end
diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl
index 949a469d..ad8f94f6 100644
--- a/src/LM/vocab.jl
+++ b/src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]
```
"""
mutable struct Vocabulary
-vocab::Dict{String, Int64}
-unk_cutoff::Int
-unk_label::String
-allword::Array{String, 1}
+ vocab::Dict{String,Int64}
+ unk_cutoff::Int
+ unk_label::String
+ allword::Vector{String}
end
-function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+"""
+$(TYPEDSIGNATURES)
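+
+Build a `Vocabulary` from `word`: `unk_label` is appended to the tokens, every
+token is counted, and tokens seen fewer than `unk_cutoff` times are dropped.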
+"""
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
if unk_label in word
error("unk_label is in vocab")
else
- word= push!(word, unk_label)
+ word = push!(word, unk_label)
end
vocab = countmap(word)
for value in vocab
- if value[2]<unk_cutoff && value[1] != unk_label
+ if value[2] < unk_cutoff && value[1] != unk_label
 delete!(vocab, value[1])
 end
 end
 Vocabulary(vocab, unk_cutoff, unk_label, word)
 end
diff --git a/src/bayes.jl b/src/bayes.jl
--- a/src/bayes.jl
+++ b/src/bayes.jl
@@ -5,9 +5,11 @@
 simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))
"""
+$(TYPEDSIGNATURES)
+
Create a dict that maps elements in input array to their frequencies.
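+
+# Example
+
+A minimal illustration:
+
+```julia-repl
+julia> frequencies(["a", "b", "a"])["a"]
+2
+```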
"""
-function frequencies(xs)
+function frequencies(xs::AbstractVector{T})::Dict{T,Int} where T
frequencies = Dict{eltype(xs),Int}()
for x in xs
frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@ function frequencies(xs)
end
"""
- features(::AbstractDict, dict)
+$(TYPEDSIGNATURES)
+
Compute an Array, mapping the value corresponding to elements of `dict` to the input `AbstractDict`.
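+
+# Example
+
+A minimal illustration, with the `Dict` standing in for precomputed frequencies:
+
+```julia-repl
+julia> features(Dict("a" => 2, "b" => 1), ["a", "c"])
+2-element Vector{Int64}:
+ 2
+ 0
+```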
"""
-function features(fs::AbstractDict, dict)
- bag = zeros(Int, size(dict))
- for i = 1:length(dict)
+function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
+ bag = Vector{Int}(undef, size(dict))
+ for i in eachindex(dict)
bag[i] = get(fs, dict[i], 0)
end
return bag
diff --git a/src/corpus.jl b/src/corpus.jl
index b4a31a10..df8b5f79 100644
--- a/src/corpus.jl
+++ b/src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
ngram_arr = convert(Array{String,1}, ngram_arr)
for ngram in ngram_arr
- if haskey(idx, ngram)
- push!(idx[ngram], i)
- else
- idx[ngram] = [i]
- end
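+ # fetch (or create) this ngram's posting list, then record the document index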
+ entries = get!(() -> Int[], idx, ngram)
+ push!(entries, i)
end
end
for key in keys(idx)
diff --git a/src/dtm.jl b/src/dtm.jl
index 4bfa3b3d..fcbf4eda 100644
--- a/src/dtm.jl
+++ b/src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)' #'
function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
ngs = ngrams(d)
- indices = Array{Int}(undef, 0)
- values = Array{Int}(undef, 0)
- terms = sort(collect(keys(lex)))
+ indices = Int[]
+ values = Int[]
+ terms = sort!(collect(keys(lex)))
column_indices = columnindices(terms)
for ngram in keys(ngs)
- if haskey(column_indices, ngram)
- push!(indices, column_indices[ngram])
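+ # a single lookup replaces the haskey-then-getindex pair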
+ col = get(column_indices, ngram, nothing)
+ if !isnothing(col)
+ push!(indices, col)
push!(values, ngs[ngram])
end
end
diff --git a/src/lda.jl b/src/lda.jl
index 342b9645..cbdd55f1 100644
--- a/src/lda.jl
+++ b/src/lda.jl
@@ -80,7 +80,8 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int,
for target_topicid in 1:ntopics
topicprob = (doc.topicidcount[target_topicid] + beta) / (document_lenth + beta * ntopics)
- wordprob = (get(topics[target_topicid].wordcount, word, 0)+ alpha) / (topics[target_topicid].count + alpha * number_of_words)
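+ # hoist the topic once instead of indexing `topics` twice per iteration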
+ topic = topics[target_topicid]
+ wordprob = (get(topic.wordcount, word, 0) + alpha) / (topic.count + alpha * number_of_words)
probs[target_topicid] = topicprob * wordprob
end
normalize_probs = sum(probs)
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index e71051ad..e9dd5653 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -69,11 +69,8 @@ function remove_corrupt_utf8!(d::NGramDocument)
new_ngrams = Dict{AbstractString, Int}()
for token in keys(d.ngrams)
new_token = remove_corrupt_utf8(token)
- if haskey(new_ngrams, new_token)
- new_ngrams[new_token] = new_ngrams[new_token] + 1
- else
- new_ngrams[new_token] = 1
- end
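+ # distinct tokens can map to the same cleaned form, so accumulate counts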
+ count = get(new_ngrams, new_token, 0)
+ new_ngrams[new_token] = count + 1
end
d.ngrams = new_ngrams
end
@@ -130,11 +127,8 @@ function remove_case!(d::NGramDocument)
new_ngrams = Dict{AbstractString, Int}()
for token in keys(d.ngrams)
new_token = remove_case(token)
- if haskey(new_ngrams, new_token)
- new_ngrams[new_token] = new_ngrams[new_token] + 1
- else
- new_ngrams[new_token] = 1
- end
+ count = get(new_ngrams, new_token, 0)
+ new_ngrams[new_token] = count + 1
end
d.ngrams = new_ngrams
end
@@ -474,24 +468,22 @@ function remove_patterns(s::AbstractString, rex::Regex)
return replace(s, rex => "")
end
-function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
+function remove_patterns(s::SubString{T}, rex::Regex) where {T<:String}
iob = IOBuffer()
- ioffset = s.offset
- data = codeunits(s.string)
ibegin = 1
for m in eachmatch(rex, s)
- len = m.match.offset-ibegin
- next = nextind(s, lastindex(m.match)+m.match.offset)
+ len = m.match.offset - ibegin
+ next = nextind(s, lastindex(m.match) + m.match.offset)
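+ # keep the text preceding the match, space-separated unless at the end of the string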
if len > 0
- write(iob, SubString(s, ibegin, ibegin+len))
- if next != length(s)+1
- write(iob, ' ')
- end
+ write(iob, SubString(s, ibegin, ibegin + len))
+ if next != length(s) + 1
+ write(iob, ' ')
+ end
end
ibegin = next
end
- len = lastindex(s) - ibegin
- (len > 0) && write(iob, SubString(s, ibegin, ibegin+len))
+ len = lastindex(s) - ibegin
+ (len > 0) && write(iob, SubString(s, ibegin, ibegin + len))
String(take!(iob))
end
@@ -519,11 +511,8 @@ function remove_patterns!(d::NGramDocument, rex::Regex)
new_ngrams = Dict{AbstractString, Int}()
for token in keys(d.ngrams)
new_token = remove_patterns(token, rex)
- if haskey(new_ngrams, new_token)
- new_ngrams[new_token] = new_ngrams[new_token] + 1
- else
- new_ngrams[new_token] = 1
- end
+ count = get(new_ngrams, new_token, 0)
+ new_ngrams[new_token] = count + 1
end
d.ngrams = new_ngrams
nothing
diff --git a/src/stemmer.jl b/src/stemmer.jl
index bf18b367..e402a912 100644
--- a/src/stemmer.jl
+++ b/src/stemmer.jl
@@ -37,11 +37,8 @@ function stem!(stemmer::Stemmer, d::NGramDocument)
for token in keys(d.ngrams)
new_token = stem(stemmer, token)
if new_token != token
- if haskey(d.ngrams, new_token)
- d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
- else
- d.ngrams[new_token] = d.ngrams[token]
- end
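+ # merge this token's count into its stemmed form before deleting it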
+ count = get(d.ngrams, new_token, 0)
+ d.ngrams[new_token] = count + d.ngrams[token]
delete!(d.ngrams, token)
end
end
diff --git a/src/tf_idf.jl b/src/tf_idf.jl
index 26666136..202cb871 100644
--- a/src/tf_idf.jl
+++ b/src/tf_idf.jl
@@ -113,10 +113,10 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
idf = log.(n ./ documents_containing_term)
# TF-IDF is the product of TF and IDF
- for i in 1:n
- for j in 1:p
- tfidf[i, j] = tfidf[i, j] * idf[j]
- end
+ for i in 1:n, j in 1:p
+ tfidf[i, j] *= idf[j]
end
return tfidf