minor performance and style fixes. No logic change
rssdev10 committed Oct 24, 2023
1 parent 071d228 commit 696ce4e
Showing 11 changed files with 111 additions and 101 deletions.
31 changes: 21 additions & 10 deletions src/LM/api.jl
@@ -1,23 +1,34 @@
 #TO DO
 # Doc string
-function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
-    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+"""
+$(TYPEDSIGNATURES)
+"""
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
+    score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
 end

-function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+"""
+$(TYPEDSIGNATURES)
+"""
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
     log2(maskedscore(m, temp_lm, word, context))
 end

-function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
-    local log_set=Float64[]
+"""
+$(TYPEDSIGNATURES)
+"""
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    sum::Float64 = 0.0
     for ngram in text_ngram
         ngram = split(ngram)
-        push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
-        #println(logscore(m,lm,ngram[end],ngram[1:end-1]))
+        sum += logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
     end
-    return(sum(log_set)/length(log_set))
+    return sum/length(text_ngram)
 end

-function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
-    return(2^(entropy(m, lm, text_ngram)))
+"""
+$(TYPEDSIGNATURES)
+"""
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
+    return 2^(entropy(m, lm, text_ngram))
 end
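
A standalone sanity check of the entropy rewrite, using hypothetical log scores in place of real logscore(m, lm, ...) values: the running sum produces the same mean as the old Float64[] buffer, just without the temporary array.

    logscores = [-1.5, -2.0, -2.5]     # stand-ins for logscore(m, lm, ...) results
    buffered = Float64[]               # old style: collect each score, average at the end
    foreach(s -> push!(buffered, s), logscores)
    entropy_old = sum(buffered) / length(buffered)
    entropy_new = sum(logscores) / length(logscores)   # what the running sum computes
    @assert entropy_old == entropy_new
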
13 changes: 7 additions & 6 deletions src/LM/counter.jl
@@ -1,17 +1,18 @@
 using DataStructures

 """
-counter is used to make conditional distribution, which is used by score functions to
-calculate conditional frequency distribution
+$(TYPEDSIGNATURES)
+counter is used to make conditional distribution, which is used by score functions to
+calculate conditional frequency distribution
 """
 function counter2(data, min::Integer, max::Integer)
     data = everygram(data, min_len=min, max_len=max)
     data = split.(data)
-    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
-    for i in 1:length(data)
-        history,word = data[i][1:end-1], data[i][end]
+    temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
+    for i in eachindex(data)
+        history, word = data[i][begin:end-1], data[i][end]
         temp_lm[join(history, " ")][word] += 1
     end
     return temp_lm
 end
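
The move to `eachindex` and `begin:end-1` keeps the history/word split in counter2 correct for any vector, not just 1-based ones. A minimal sketch with hypothetical pre-split ngrams (plain vectors, no DefaultDict):

    data = split.(["a b", "a b c"])    # hypothetical everygram output, already split
    hist_word = [(join(data[i][begin:end-1], " "), data[i][end]) for i in eachindex(data)]
    @assert hist_word == [("a", "b"), ("a b", "c")]
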

38 changes: 20 additions & 18 deletions src/LM/preprocessing.jl
@@ -21,16 +21,16 @@ julia> a = everygram(seq,min_len=1, max_len=-1)
 ```
 """
-function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
-    ngram = []
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
+    ngram = String[]
     if max_len == -1
         max_len = length(seq)
     end
     for n in range(min_len, stop=max_len)
         temp = ngramizenew(seq, n)
         ngram = append!(ngram, temp)
     end
-    return(ngram)
+    return (ngram)
 end

 """
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
"5 </s>"
```
"""
function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
function padding_ngram(
word::Vector{T}, n=1;
pad_left=false, pad_right=false,
left_pad_symbol="<s>", right_pad_symbol="</s>"
) where {T<:AbstractString}
local seq
seq = word
if pad_left == true
prepend!(seq, [left_pad_symbol])
end
if pad_right == true
push!(seq, right_pad_symbol)
end
return ngramizenew(seq, n)

pad_left == true && prepend!(seq, [left_pad_symbol])
pad_right == true && push!(seq, right_pad_symbol)

return ngramizenew(seq, n)
end

"""
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
"To not"
```
"""
function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
n_words = length(words)

tokens = []
tokens = String[]

for n in nlist,
index in 1:(n_words-n+1)

for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
push!(tokens,token)
end
token = join(words[index:(index+n-1)], " ")
push!(tokens, token)
end
return tokens
end
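
One note on the `tokens = []` to `tokens = String[]` change, sketched with hypothetical values: an untyped literal builds a Vector{Any}, which boxes its elements and could never satisfy the ::Vector{String} return annotations added above.

    untyped = []
    append!(untyped, ["a b", "b c"])        # eltype stays Any
    typed = String[]
    append!(typed, ["a b", "b c"])          # concrete Vector{String}
    @assert eltype(untyped) == Any && eltype(typed) == String
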
41 changes: 25 additions & 16 deletions src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]
 ```
 """
 mutable struct Vocabulary
-    vocab::Dict{String, Int64}
-    unk_cutoff::Int
-    unk_label::String
-    allword::Array{String, 1}
+    vocab::Dict{String,Int64}
+    unk_cutoff::Int
+    unk_label::String
+    allword::Vector{String}
 end

-function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+"""
+$(TYPEDSIGNATURES)
+"""
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
     if unk_label in word
         error("unk_label is in vocab")
     else
-        word= push!(word, unk_label)
+        word = push!(word, unk_label)
     end
     vocab = countmap(word)
     for value in vocab
-        if value[2]<unk_cutoff && value[1] != unk_label
+        if value[2] < unk_cutoff && value[1] != unk_label
             delete!(vocab, value[1])
         end
     end
     Vocabulary(vocab, unk_cutoff, unk_label, word)
 end

+"""
+$(TYPEDSIGNATURES)
+See [`Vocabulary`](@ref)
+"""
 function update(vocab::Vocabulary, words)
     vocab.allword = append!(vocab.allword, words)
-    vocab.vocab=addcounts!(vocab.vocab, words)
+    vocab.vocab = addcounts!(vocab.vocab, words)
 end

 """
+$(TYPEDSIGNATURES)
 lookup a sequence or words in the vocabulary
 Return an Array of String
+See [`Vocabulary`](@ref)
 """
-function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
-    look = []
-    for w in word
+function lookup(voc::Vocabulary, word::AbstractVector{T})::Vector{T} where {T<:AbstractString}
+    return map(word) do w
         if w in keys(voc.vocab)
-            push!(look, w)
-        else
-            push!(look, voc.unk_label)
+            w
+        else
+            voc.unk_label
         end
     end
-    return look
 end
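
The lookup rewrite trades a push-into-`[]` loop (a Vector{Any}) for `map ... do`, which returns a properly typed vector. A standalone sketch against a hypothetical vocabulary dict, not the package's Vocabulary struct:

    vocab = Dict("the" => 3, "cat" => 2)    # hypothetical word counts
    words = ["the", "dog", "cat"]
    looked_up = map(words) do w
        w in keys(vocab) ? w : "<unk>"      # the same branch lookup uses, as an expression
    end
    @assert looked_up == ["the", "<unk>", "cat"]
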

12 changes: 7 additions & 5 deletions src/bayes.jl
@@ -5,9 +5,11 @@ export NaiveBayesClassifier
 simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))

 """
+$(TYPEDSIGNATURES)
 Create a dict that maps elements in input array to their frequencies.
 """
-function frequencies(xs)
+function frequencies(xs::AbstractVector{T})::Dict{T,Int} where {T<:Any}
     frequencies = Dict{eltype(xs),Int}()
     for x in xs
         frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@ function frequencies(xs)
 end

 """
-    features(::AbstractDict, dict)
+$(TYPEDSIGNATURES)
 Compute an Array, mapping the value corresponding to elements of `dict` to the input `AbstractDict`.
 """
-function features(fs::AbstractDict, dict)
-    bag = zeros(Int, size(dict))
-    for i = 1:length(dict)
+function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
+    bag = Vector{Int}(undef, size(dict))
+    for i = eachindex(dict)
         bag[i] = get(fs, dict[i], 0)
     end
     return bag
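
frequencies already relied on `get` with a default; the new signature only names the Dict the loop was producing. A usage sketch with hypothetical tokens:

    xs = ["spam", "ham", "spam"]
    freqs = Dict{eltype(xs),Int}()
    for x in xs
        freqs[x] = get(freqs, x, 0) + 1     # one lookup, 0 on a miss
    end
    @assert freqs == Dict("spam" => 2, "ham" => 1)
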
7 changes: 2 additions & 5 deletions src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
         ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
         ngram_arr = convert(Array{String,1}, ngram_arr)
         for ngram in ngram_arr
-            if haskey(idx, ngram)
-                push!(idx[ngram], i)
-            else
-                idx[ngram] = [i]
-            end
+            key = get!(() -> [], idx, ngram)
+            push!(key, i)
         end
     end
     for key in keys(idx)
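
A sketch of the `get!` idiom adopted here, on a hypothetical inverted index: `get!` returns the stored posting list, and only on a miss calls the closure to insert a fresh default, so the haskey branch disappears and the key is hashed once.

    idx = Dict{String,Vector{Int}}()
    for (i, ngram) in enumerate(["to", "be", "to"])   # hypothetical tokens with doc ids
        postings = get!(() -> Int[], idx, ngram)
        push!(postings, i)
    end
    @assert idx["to"] == [1, 3] && idx["be"] == [2]
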
11 changes: 6 additions & 5 deletions src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)' #'

 function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
     ngs = ngrams(d)
-    indices = Array{Int}(undef, 0)
-    values = Array{Int}(undef, 0)
-    terms = sort(collect(keys(lex)))
+    indices = Int[]
+    values = Int[]
+    terms = sort!(collect(keys(lex)))
     column_indices = columnindices(terms)

     for ngram in keys(ngs)
-        if haskey(column_indices, ngram)
-            push!(indices, column_indices[ngram])
+        key = get(column_indices, ngram, nothing)
+        if !isnothing(key)
+            push!(indices, key)
             push!(values, ngs[ngram])
         end
     end
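
`get(..., nothing)` collapses the previous two hash lookups (haskey, then indexing) into one. A sketch with a hypothetical column map:

    column_indices = Dict("cat" => 1, "dog" => 2)
    indices = Int[]
    for ngram in ["cat", "fish", "dog"]
        key = get(column_indices, ngram, nothing)
        isnothing(key) || push!(indices, key)   # terms outside the lexicon are skipped
    end
    @assert indices == [1, 2]
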
3 changes: 2 additions & 1 deletion src/lda.jl
@@ -80,7 +80,8 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int,

         for target_topicid in 1:ntopics
             topicprob = (doc.topicidcount[target_topicid] + beta) / (document_lenth + beta * ntopics)
-            wordprob = (get(topics[target_topicid].wordcount, word, 0)+ alpha) / (topics[target_topicid].count + alpha * number_of_words)
+            topic = topics[target_topicid]
+            wordprob = (get(topic.wordcount, word, 0)+ alpha) / (topic.count + alpha * number_of_words)
             probs[target_topicid] = topicprob * wordprob
         end
         normalize_probs = sum(probs)
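
Hoisting `topic = topics[target_topicid]` indexes the array once instead of twice inside the wordprob expression. A self-contained sketch with a hypothetical topic type standing in for the package's (which this diff does not show):

    struct TopicSketch                       # hypothetical stand-in
        wordcount::Dict{String,Int}
        count::Int
    end
    topics = [TopicSketch(Dict("river" => 2), 5)]
    alpha, number_of_words = 0.1, 10
    topic = topics[1]                        # hoisted binding
    wordprob = (get(topic.wordcount, "river", 0) + alpha) / (topic.count + alpha * number_of_words)
    @assert wordprob ≈ 2.1 / 6.0
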
41 changes: 15 additions & 26 deletions src/preprocessing.jl
@@ -69,11 +69,8 @@ function remove_corrupt_utf8!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_corrupt_utf8(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -130,11 +127,8 @@ function remove_case!(d::NGramDocument)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_case(token)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
 end
@@ -474,24 +468,22 @@ function remove_patterns(s::AbstractString, rex::Regex)
     return replace(s, rex => "")
 end

-function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
+function remove_patterns(s::SubString{T}, rex::Regex) where {T<:String}
     iob = IOBuffer()
     ioffset = s.offset
     data = codeunits(s.string)
     ibegin = 1
     for m in eachmatch(rex, s)
-        len = m.match.offset-ibegin
-        next = nextind(s, lastindex(m.match)+m.match.offset)
+        len = m.match.offset - ibegin
+        next = nextind(s, lastindex(m.match) + m.match.offset)
         if len > 0
-            write(iob, SubString(s, ibegin, ibegin+len))
-            if next != length(s)+1
-                write(iob, ' ')
-            end
+            write(iob, SubString(s, ibegin, ibegin + len))
+            if next != length(s) + 1
+                write(iob, ' ')
+            end
         end
         ibegin = next
     end
-    len = lastindex(s) - ibegin
-    (len > 0) && write(iob, SubString(s, ibegin, ibegin+len))
+    len = lastindex(s) - ibegin
+    (len > 0) && write(iob, SubString(s, ibegin, ibegin + len))
     String(take!(iob))
 end

@@ -519,11 +511,8 @@ function remove_patterns!(d::NGramDocument, rex::Regex)
     new_ngrams = Dict{AbstractString, Int}()
     for token in keys(d.ngrams)
         new_token = remove_patterns(token, rex)
-        if haskey(new_ngrams, new_token)
-            new_ngrams[new_token] = new_ngrams[new_token] + 1
-        else
-            new_ngrams[new_token] = 1
-        end
+        count = get(new_ngrams, new_token, 0)
+        new_ngrams[new_token] = count + 1
     end
     d.ngrams = new_ngrams
     nothing
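
Three of the four hunks above are the same haskey/else to `get(..., 0)` rewrite. For remove_patterns itself, here is a deliberately simplified standalone sketch of the observable behavior (split on the pattern, rejoin with single spaces); the package version instead streams SubStrings through an IOBuffer to avoid the intermediate pieces:

    strip_pattern(s, rex) = join(filter(!isempty, split(s, rex)), ' ')
    @assert strip_pattern("foo123bar456", r"[0-9]+") == "foo bar"
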
7 changes: 2 additions & 5 deletions src/stemmer.jl
@@ -37,11 +37,8 @@ function stem!(stemmer::Stemmer, d::NGramDocument)
     for token in keys(d.ngrams)
         new_token = stem(stemmer, token)
         if new_token != token
-            if haskey(d.ngrams, new_token)
-                d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
-            else
-                d.ngrams[new_token] = d.ngrams[token]
-            end
+            count = get(d.ngrams, new_token, 0)
+            d.ngrams[new_token] = count + d.ngrams[token]
             delete!(d.ngrams, token)
         end
     end
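
The stem! change differs slightly from the other count merges: when two distinct tokens stem to the same key, `get(..., 0) + d.ngrams[token]` accumulates both counts rather than overwriting. Sketch with hypothetical counts and hand-picked stems, no Stemmer involved:

    ngrams = Dict("running" => 2, "runs" => 1, "run" => 1)
    for (token, stemmed) in [("running", "run"), ("runs", "run")]   # pretend stem() results
        count = get(ngrams, stemmed, 0)
        ngrams[stemmed] = count + ngrams[token]
        delete!(ngrams, token)
    end
    @assert ngrams == Dict("run" => 4)
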
8 changes: 4 additions & 4 deletions src/tf_idf.jl
@@ -113,10 +113,10 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <
     idf = log.(n ./ documents_containing_term)

     # TF-IDF is the product of TF and IDF
-    for i in 1:n
-        for j in 1:p
-            tfidf[i, j] = tfidf[i, j] * idf[j]
-        end
+    for i in 1:n,
+        j in 1:p
+
+        tfidf[i, j] *= idf[j]
     end

     return tfidf
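
The folded header `for i in 1:n, j in 1:p` iterates exactly like the nested pair it replaces, and `*=` is the in-place spelling of the same product. A check on a hypothetical 2x2 matrix:

    tfidf = [1.0 2.0; 3.0 4.0]
    idf = [0.5, 2.0]
    n, p = size(tfidf)
    for i in 1:n, j in 1:p
        tfidf[i, j] *= idf[j]
    end
    @assert tfidf == [0.5 4.0; 1.5 8.0]
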
