minor performance and style fixes. No logic change
rssdev10 committed Oct 24, 2023
1 parent 071d228 commit 696ce4e
Showing 11 changed files with 111 additions and 101 deletions.
31 changes: 21 additions & 10 deletions src/LM/api.jl
@@ -1,23 +1,34 @@
 #TO DO
 # Doc string
-function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
-    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+"""
+$(TYPEDSIGNATURES)
+"""
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
+    score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
 end

-function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+"""
+$(TYPEDSIGNATURES)
+"""
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
     log2(maskedscore(m, temp_lm, word, context))
 end

-function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
-    local log_set=Float64[]
+"""
+$(TYPEDSIGNATURES)
+"""
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    sum::Float64 = 0.0
     for ngram in text_ngram
         ngram = split(ngram)
-        push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
-        #println(logscore(m,lm,ngram[end],ngram[1:end-1]))
+        sum += logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
     end
-    return(sum(log_set)/length(log_set))
+    return sum/length(text_ngram)
 end

-function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
-    return(2^(entropy(m, lm, text_ngram)))
+"""
+$(TYPEDSIGNATURES)
+"""
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    return 2^(entropy(m, lm, text_ngram))
 end
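
A standalone sanity check of the entropy rewrite, using hypothetical log scores in place of real logscore(m, lm, ...) values: the running sum produces the same mean as the old Float64[] buffer, just without the temporary array.

    logscores = [-1.5, -2.0, -2.5]     # stand-ins for logscore(m, lm, ...) results
    buffered = Float64[]               # old style: collect each score, average at the end
    foreach(s -> push!(buffered, s), logscores)
    entropy_old = sum(buffered) / length(buffered)
    entropy_new = sum(logscores) / length(logscores)   # what the running sum computes
    @assert entropy_old == entropy_new
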
13 changes: 7 additions & 6 deletions src/LM/counter.jl
@@ -1,17 +1,18 @@
 using DataStructures

 """
-counter is used to make conditional distribution, which is used by score functions to
-calculate conditional frequency distribution
+$(TYPEDSIGNATURES)
+counter is used to make conditional distribution, which is used by score functions to
+calculate conditional frequency distribution
 """
 function counter2(data, min::Integer, max::Integer)
     data = everygram(data, min_len=min, max_len=max)
     data = split.(data)
-    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
-    for i in 1:length(data)
-        history,word = data[i][1:end-1], data[i][end]
+    temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
+    for i in eachindex(data)
+        history, word = data[i][begin:end-1], data[i][end]
         temp_lm[join(history, " ")][word] += 1
     end
     return temp_lm
 end
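
The move to `eachindex` and `begin:end-1` keeps the history/word split in counter2 correct for any vector, not just 1-based ones. A minimal sketch with hypothetical pre-split ngrams (plain vectors, no DefaultDict):

    data = split.(["a b", "a b c"])    # hypothetical everygram output, already split
    hist_word = [(join(data[i][begin:end-1], " "), data[i][end]) for i in eachindex(data)]
    @assert hist_word == [("a", "b"), ("a b", "c")]
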

38 changes: 20 additions & 18 deletions src/LM/preprocessing.jl
@@ -21,16 +21,16 @@ julia> a = everygram(seq,min_len=1, max_len=-1)
 ```
 """
-function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
-    ngram = []
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
+    ngram = String[]
     if max_len == -1
         max_len = length(seq)
     end
     for n in range(min_len, stop=max_len)
         temp = ngramizenew(seq, n)
         ngram = append!(ngram, temp)
     end
-    return(ngram)
+    return (ngram)
 end

 """
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
"5 </s>"
```
"""
function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
function padding_ngram(
word::Vector{T}, n=1;
pad_left=false, pad_right=false,
left_pad_symbol="<s>", right_pad_symbol="</s>"
) where {T<:AbstractString}
local seq
seq = word
if pad_left == true
prepend!(seq, [left_pad_symbol])
end
if pad_right == true
push!(seq, right_pad_symbol)
end
return ngramizenew(seq, n)

pad_left == true && prepend!(seq, [left_pad_symbol])
pad_right == true && push!(seq, right_pad_symbol)

return ngramizenew(seq, n)
end

"""
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
"To not"
```
"""
function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
n_words = length(words)

tokens = []
tokens = String[]

for n in nlist,
index in 1:(n_words-n+1)

for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
push!(tokens,token)
end
token = join(words[index:(index+n-1)], " ")
push!(tokens, token)
end
return tokens
end
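
One note on the `tokens = []` to `tokens = String[]` change, sketched with hypothetical values: an untyped literal builds a Vector{Any}, which boxes its elements and could never satisfy the ::Vector{String} return annotations added above.

    untyped = []
    append!(untyped, ["a b", "b c"])        # eltype stays Any
    typed = String[]
    append!(typed, ["a b", "b c"])          # concrete Vector{String}
    @assert eltype(untyped) == Any && eltype(typed) == String
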
41 changes: 25 additions & 16 deletions src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]
 ```
 """
 mutable struct Vocabulary
-    vocab::Dict{String, Int64}
-    unk_cutoff::Int
-    unk_label::String
-    allword::Array{String, 1}
+    vocab::Dict{String,Int64}
+    unk_cutoff::Int
+    unk_label::String
+    allword::Vector{String}
 end

-function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+"""
+$(TYPEDSIGNATURES)
+"""
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
     if unk_label in word
         error("unk_label is in vocab")
     else
-        word= push!(word, unk_label)
+        word = push!(word, unk_label)
     end
     vocab = countmap(word)
     for value in vocab
-        if value[2]<unk_cutoff && value[1] != unk_label
+        if value[2] < unk_cutoff && value[1] != unk_label
             delete!(vocab, value[1])
         end
     end
     Vocabulary(vocab, unk_cutoff, unk_label, word)
 end

+"""
+$(TYPEDSIGNATURES)
+See [`Vocabulary`](@ref)
+"""
 function update(vocab::Vocabulary, words)
     vocab.allword = append!(vocab.allword, words)
-    vocab.vocab=addcounts!(vocab.vocab, words)
+    vocab.vocab = addcounts!(vocab.vocab, words)
 end

 """
+$(TYPEDSIGNATURES)
 lookup a sequence or words in the vocabulary
 Return an Array of String
+See [`Vocabulary`](@ref)
 """
-function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
-    look = []
-    for w in word
+function lookup(voc::Vocabulary, word::AbstractVector{T})::Vector{T} where {T<:AbstractString}
+    return map(word) do w
         if w in keys(voc.vocab)
-            push!(look, w)
-        else
-            push!(look, voc.unk_label)
+            w
+        else
+            voc.unk_label
         end
     end
-    return look
 end
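
The lookup rewrite trades a push-into-`[]` loop (a Vector{Any}) for `map ... do`, which returns a properly typed vector. A standalone sketch against a hypothetical vocabulary dict, not the package's Vocabulary struct:

    vocab = Dict("the" => 3, "cat" => 2)    # hypothetical word counts
    words = ["the", "dog", "cat"]
    looked_up = map(words) do w
        w in keys(vocab) ? w : "<unk>"      # the same branch lookup uses, as an expression
    end
    @assert looked_up == ["the", "<unk>", "cat"]
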

12 changes: 7 additions & 5 deletions src/bayes.jl
@@ -5,9 +5,11 @@ export NaiveBayesClassifier
 simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))

 """
+$(TYPEDSIGNATURES)
 Create a dict that maps elements in input array to their frequencies.
 """
-function frequencies(xs)
+function frequencies(xs::AbstractVector{T})::Dict{T,Int} where {T<:Any}
     frequencies = Dict{eltype(xs),Int}()
     for x in xs
         frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@ function frequencies(xs)
 end

 """
-    features(::AbstractDict, dict)
+$(TYPEDSIGNATURES)
 Compute an Array, mapping the value corresponding to elements of `dict` to the input `AbstractDict`.
 """
-function features(fs::AbstractDict, dict)
-    bag = zeros(Int, size(dict))
-    for i = 1:length(dict)
+function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
+    bag = Vector{Int}(undef, size(dict))
+    for i = eachindex(dict)
         bag[i] = get(fs, dict[i], 0)
     end
     return bag
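
frequencies already relied on `get` with a default; the new signature only names the Dict the loop was producing. A usage sketch with hypothetical tokens:

    xs = ["spam", "ham", "spam"]
    freqs = Dict{eltype(xs),Int}()
    for x in xs
        freqs[x] = get(freqs, x, 0) + 1     # one lookup, 0 on a miss
    end
    @assert freqs == Dict("spam" => 2, "ham" => 1)
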
7 changes: 2 additions & 5 deletions src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
         ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
         ngram_arr = convert(Array{String,1}, ngram_arr)
         for ngram in ngram_arr
-            if haskey(idx, ngram)
-                push!(idx[ngram], i)
-            else
-                idx[ngram] = [i]
-            end
+            key = get!(() -> [], idx, ngram)
+            push!(key, i)
         end
     end
     for key in keys(idx)
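
A sketch of the `get!` idiom adopted here, on a hypothetical inverted index: `get!` returns the stored posting list, and only on a miss calls the closure to insert a fresh default, so the haskey branch disappears and the key is hashed once.

    idx = Dict{String,Vector{Int}}()
    for (i, ngram) in enumerate(["to", "be", "to"])   # hypothetical tokens with doc ids
        postings = get!(() -> Int[], idx, ngram)
        push!(postings, i)
    end
    @assert idx["to"] == [1, 3] && idx["be"] == [2]
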
11 changes: 6 additions & 5 deletions src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)' #'

 function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
     ngs = ngrams(d)
-    indices = Array{Int}(undef, 0)
-    values = Array{Int}(undef, 0)
-    terms = sort(collect(keys(lex)))
+    indices = Int[]
+    values = Int[]
+    terms = sort!(collect(keys(lex)))
     column_indices = columnindices(terms)

     for ngram in keys(ngs)
-        if haskey(column_indices, ngram)
-            push!(indices, column_indices[ngram])
+        key = get(column_indices, ngram, nothing)
+        if !isnothing(key)
+            push!(indices, key)
             push!(values, ngs[ngram])
         end
     end
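
`get(..., nothing)` collapses the previous two hash lookups (haskey, then indexing) into one. A sketch with a hypothetical column map:

    column_indices = Dict("cat" => 1, "dog" => 2)
    indices = Int[]
    for ngram in ["cat", "fish", "dog"]
        key = get(column_indices, ngram, nothing)
        isnothing(key) || push!(indices, key)   # terms outside the lexicon are skipped
    end
    @assert indices == [1, 2]
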
3 changes: 2 additions & 1 deletion src/lda.jl
@@ -80,7 +80,8 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int,

         for target_topicid in 1:ntopics
             topicprob = (doc.topicidcount[target_topicid] + beta) / (document_lenth + beta * ntopics)
-            wordprob = (get(topics[target_topicid].wordcount, word, 0)+ alpha) / (topics[target_topicid].count + alpha * number_of_words)
+            topic = topics[target_topicid]
+            wordprob = (get(topic.wordcount, word, 0)+ alpha) / (topic.count + alpha * number_of_words)
             probs[target_topicid] = topicprob * wordprob
         end
         normalize_probs = sum(probs)
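
Hoisting `topic = topics[target_topicid]` indexes the array once instead of twice inside the wordprob expression. A self-contained sketch with a hypothetical topic type standing in for the package's (which this diff does not show):

    struct TopicSketch                       # hypothetical stand-in
        wordcount::Dict{String,Int}
        count::Int
    end
    topics = [TopicSketch(Dict("river" => 2), 5)]
    alpha, number_of_words = 0.1, 10
    topic = topics[1]                        # hoisted binding
    wordprob = (get(topic.wordcount, "river", 0) + alpha) / (topic.count + alpha * number_of_words)
    @assert wordprob ≈ 2.1 / 6.0
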
41 changes: 15 additions & 26 deletions src/preprocessing.jl
@@ -69,11 +69,8 @@ function remove_corrupt_utf8!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_corrupt_utf8(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -130,11 +127,8 @@ function remove_case!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_case(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -474,24 +468,22 @@ function remove_patterns(s::AbstractString, rex::Regex)
     return replace(s, rex => "")
 end

-function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
+function remove_patterns(s::SubString{T}, rex::Regex) where {T<:String}
     iob = IOBuffer()
     ioffset = s.offset
     data = codeunits(s.string)
     ibegin = 1
     for m in eachmatch(rex, s)
-        len = m.match.offset-ibegin
-        next = nextind(s, lastindex(m.match)+m.match.offset)
+        len = m.match.offset - ibegin
+        next = nextind(s, lastindex(m.match) + m.match.offset)
         if len > 0
-            write(iob, SubString(s, ibegin, ibegin+len))
-            if next != length(s)+1
-                write(iob, ' ')
-            end
+            write(iob, SubString(s, ibegin, ibegin + len))
+            if next != length(s) + 1
+                write(iob, ' ')
+            end
         end
         ibegin = next
     end
-    len = lastindex(s) - ibegin
-    (len > 0) && write(iob, SubString(s, ibegin, ibegin+len))
+    len = lastindex(s) - ibegin
+    (len > 0) && write(iob, SubString(s, ibegin, ibegin + len))
     String(take!(iob))
 end

@@ -519,11 +511,8 @@ function remove_patterns!(d::NGramDocument, rex::Regex)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_patterns(token, rex)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
     nothing
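
Three of the four hunks above are the same haskey/else to `get(..., 0)` rewrite. For remove_patterns itself, here is a deliberately simplified standalone sketch of the observable behavior (split on the pattern, rejoin with single spaces); the package version instead streams SubStrings through an IOBuffer to avoid the intermediate pieces:

    strip_pattern(s, rex) = join(filter(!isempty, split(s, rex)), ' ')
    @assert strip_pattern("foo123bar456", r"[0-9]+") == "foo bar"
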
7 changes: 2 additions & 5 deletions src/stemmer.jl
@@ -37,11 +37,8 @@ function stem!(stemmer::Stemmer, d::NGramDocument)
     for token in keys(d.ngrams)
         new_token = stem(stemmer, token)
         if new_token != token
-            if haskey(d.ngrams, new_token)
-                d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
-            else
-                d.ngrams[new_token] = d.ngrams[token]
-            end
+            count = get(d.ngrams, new_token, 0)
+            d.ngrams[new_token] = count + d.ngrams[token]
             delete!(d.ngrams, token)
         end
     end
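
The stem! change differs slightly from the other count merges: when two distinct tokens stem to the same key, `get(..., 0) + d.ngrams[token]` accumulates both counts rather than overwriting. Sketch with hypothetical counts and hand-picked stems, no Stemmer involved:

    ngrams = Dict("running" => 2, "runs" => 1, "run" => 1)
    for (token, stemmed) in [("running", "run"), ("runs", "run")]   # pretend stem() results
        count = get(ngrams, stemmed, 0)
        ngrams[stemmed] = count + ngrams[token]
        delete!(ngrams, token)
    end
    @assert ngrams == Dict("run" => 4)
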
8 changes: 4 additions & 4 deletions src/tf_idf.jl
@@ -113,10 +113,10 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <
     idf = log.(n ./ documents_containing_term)

     # TF-IDF is the product of TF and IDF
-    for i in 1:n
-        for j in 1:p
-            tfidf[i, j] = tfidf[i, j] * idf[j]
-        end
+    for i in 1:n,
+        j in 1:p
+
+        tfidf[i, j] *= idf[j]
     end

     return tfidf
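
The folded header `for i in 1:n, j in 1:p` iterates exactly like the nested pair it replaces, and `*=` is the in-place spelling of the same product. A check on a hypothetical 2x2 matrix:

    tfidf = [1.0 2.0; 3.0 4.0]
    idf = [0.5, 2.0]
    n, p = size(tfidf)
    for i in 1:n, j in 1:p
        tfidf[i, j] *= idf[j]
    end
    @assert tfidf == [0.5 4.0; 1.5 8.0]
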
