From ee99e857b99c97f77b5cf7c3bc79d496aa872872 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 13:39:21 +0200 Subject: [PATCH 1/5] edited the .gitignore file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e845b0fd..a92ef0bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store docs/build Manifest.toml +.vscode/ From d65986445c3f6c022bd1d37d63c86b34d2d7d65d Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 15:20:16 +0200 Subject: [PATCH 2/5] Added vocabulary extraction function. --- src/TextAnalysis.jl | 226 ++++++++++++++++++++++---------------------- src/coom.jl | 25 +++-- src/document.jl | 88 ++++++++++++++--- test/document.jl | 21 ++-- 4 files changed, 217 insertions(+), 143 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 53c1470d..2d6b0fad 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,115 +1,115 @@ module TextAnalysis - using SparseArrays - using Printf - using LinearAlgebra - using StatsBase: countmap,addcounts! - using Languages - using WordTokenizers - using Snowball - - using Tables - using DataStructures - using Statistics - using Serialization - using ProgressMeter - using DocStringExtensions - - import Base: depwarn, merge! - import Serialization: serialize, deserialize - - export AbstractDocument, Document - export FileDocument, StringDocument, TokenDocument, NGramDocument - export GenericDocument - export Corpus, DirectoryCorpus - export stemmer_types, Stemmer - export DocumentTermMatrix - export text, tokens, ngrams - export text!, tokens!, ngrams! - export documents - export language, title, author, timestamp - export languages, titles, authors, timestamps - export language!, title!, author!, timestamp! - export languages!, titles!, authors!, timestamps! - export ngram_complexity - export lexicon, update_lexicon!, lexical_frequency, lexicon_size - export inverse_index, update_inverse_index!, index_size - export remove_corrupt_utf8 - export remove_corrupt_utf8! - export remove_case - export remove_case! - export remove_words, remove_stop_words - export remove_words!, remove_stop_words! - export stem, tag_pos - export stem!, tag_pos! - export remove_html_tags, remove_html_tags! - export prepare! - export frequent_terms, sparse_terms - export remove_frequent_terms!, remove_sparse_terms! - export dtv, each_dtv, dtm, tdm - export TextHashFunction, index_hash, cardinality, hash_function, hash_function! - export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm - export CooMatrix, coom - export standardize! - export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity - export tf!, tf_idf!, bm_25!, lda! - export remove_patterns!, remove_patterns - export prune! - - export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation - export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles - export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags - - export NaiveBayesClassifier - export tag_scheme! 
- - export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax - export bleu_score - - export PerceptronTagger, fit!, predict - - export Vocabulary, lookup, update - export everygram, padding_ngram - export maskedscore, logscore, entropy, perplexity - export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score - - export tokenize #imported from WordTokenizers - - include("tokenizer.jl") - include("ngramizer.jl") - include("document.jl") - include("hash.jl") - include("corpus.jl") - include("metadata.jl") - include("preprocessing.jl") - - include("stemmer.jl") - include("dtm.jl") - include("tf_idf.jl") - include("lsa.jl") - include("lda.jl") - include("summarizer.jl") - include("show.jl") - include("bayes.jl") - include("deprecations.jl") - include("tagging_schemes.jl") - include("utils.jl") - - include("evaluation_metrics.jl") - include("translate_evaluation/bleu_score.jl") - include("coom.jl") - - - - # Lang_model - include("LM/vocab.jl") - include("LM/langmodel.jl") - include("LM/api.jl") - include("LM/counter.jl") - include("LM/preprocessing.jl") - - - - function __init__() - - end +using SparseArrays +using Printf +using LinearAlgebra +using StatsBase: countmap, addcounts! +using Languages +using WordTokenizers +using Snowball + +using Tables +using DataStructures +using Statistics +using Serialization +using ProgressMeter +using DocStringExtensions + +import Base: depwarn, merge! +import Serialization: serialize, deserialize + +export AbstractDocument, Document +export FileDocument, StringDocument, TokenDocument, NGramDocument +export GenericDocument +export Corpus, DirectoryCorpus +export stemmer_types, Stemmer +export DocumentTermMatrix +export text, tokens, ngrams, vocab +export text!, tokens!, ngrams! +export documents +export language, title, author, timestamp +export languages, titles, authors, timestamps +export language!, title!, author!, timestamp! +export languages!, titles!, authors!, timestamps! +export ngram_complexity +export lexicon, update_lexicon!, lexical_frequency, lexicon_size +export inverse_index, update_inverse_index!, index_size +export remove_corrupt_utf8 +export remove_corrupt_utf8! +export remove_case +export remove_case! +export remove_words, remove_stop_words +export remove_words!, remove_stop_words! +export stem, tag_pos +export stem!, tag_pos! +export remove_html_tags, remove_html_tags! +export prepare! +export frequent_terms, sparse_terms +export remove_frequent_terms!, remove_sparse_terms! +export dtv, each_dtv, dtm, tdm +export TextHashFunction, index_hash, cardinality, hash_function, hash_function! +export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm +export CooMatrix, coom +export standardize! +export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity +export tf!, tf_idf!, bm_25!, lda! +export remove_patterns!, remove_patterns +export prune! + +export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation +export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles +export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags + +export NaiveBayesClassifier +export tag_scheme! 
+ +export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax +export bleu_score + +export PerceptronTagger, fit!, predict + +export Vocabulary, lookup, update +export everygram, padding_ngram +export maskedscore, logscore, entropy, perplexity +export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score + +export tokenize #imported from WordTokenizers + +include("tokenizer.jl") +include("ngramizer.jl") +include("document.jl") +include("hash.jl") +include("corpus.jl") +include("metadata.jl") +include("preprocessing.jl") + +include("stemmer.jl") +include("dtm.jl") +include("tf_idf.jl") +include("lsa.jl") +include("lda.jl") +include("summarizer.jl") +include("show.jl") +include("bayes.jl") +include("deprecations.jl") +include("tagging_schemes.jl") +include("utils.jl") + +include("evaluation_metrics.jl") +include("translate_evaluation/bleu_score.jl") +include("coom.jl") + + + +# Lang_model +include("LM/vocab.jl") +include("LM/langmodel.jl") +include("LM/api.jl") +include("LM/counter.jl") +include("LM/preprocessing.jl") + + + +function __init__() + +end end diff --git a/src/coom.jl b/src/coom.jl index e76cb151..031d5bc6 100644 --- a/src/coom.jl +++ b/src/coom.jl @@ -26,22 +26,27 @@ julia> using TextAnalysis, DataStructures TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true) 3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: - [2, 1] = 2.0 - [1, 2] = 2.0 - [3, 2] = 0.3999 - [2, 3] = 0.3999 +13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: + ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ + ⋮ ⋮ ⋮ + ⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0 + ⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅ julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3) + vocab = vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional) -3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: - [2, 1] = 1.0 - [1, 2] = 1.0 - [3, 2] = 0.1999 - [2, 3] = 0.1999 +13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: + ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ + ⋮ ⋮ ⋮ + ⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5 + ⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅ ``` """ function coo_matrix(::Type{T}, diff --git a/src/document.jl b/src/document.jl index 0d05c19a..5163fde5 100644 --- a/src/document.jl +++ b/src/document.jl @@ -46,7 +46,7 @@ end # ############################################################################## -abstract type AbstractDocument; end +abstract type AbstractDocument end mutable struct FileDocument <: AbstractDocument @@ -142,7 +142,7 @@ A TokenDocument{String} function TokenDocument(txt::AbstractString, dm::DocumentMetadata) TokenDocument(tokenize(dm.language, String(txt)), dm) end -function TokenDocument(tkns::Vector{T}) where T <: AbstractString +function TokenDocument(tkns::Vector{T}) where {T<:AbstractString} TokenDocument(tkns, DocumentMetadata()) end TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata()) @@ -189,7 +189,7 @@ end function NGramDocument(txt::AbstractString, n::Integer...=1) NGramDocument(txt, DocumentMetadata(), n...) 
end -function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString +function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString} NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata()) end @@ -270,17 +270,83 @@ julia> tokens(sd) "." ``` """ -tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d)) +tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d)) tokens(d::TokenDocument) = d.tokens function tokens(d::NGramDocument) error("The tokens of an NGramDocument cannot be reconstructed") end -tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens) -function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString +tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens) +function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString} error("The tokens of a $(typeof(d)) cannot be directly edited") end + +############################################################################## +# +# vocab() / vocab!(): Access to document text as a vocabulary +# +# to_string_vector(): Helper function for creating a vocabulary from a StringDocument or a Vector{String} +# +############################################################################## +# Converts a StringDocument to Vector{String} +to_string_vector(doc::StringDocument) = tokens(doc) +# Identity function for Vector{String} +to_string_vector(vec::Vector{String}) = vec + +""" + vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} + +Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for creating cooccurrence matrices with coo_matrix() (cf. example below). The dictionary maps each unique string to its corresponding index. + +# Arguments +- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`. + For `StringDocument`, the tokens are extracted and used. For `Vector{String}`, the vector itself is used. + +# Returns +- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input, + and the value is the index of that string in the original input. + +# Examples +```julia +julia> doc = StringDocument("This is a sample sentence of a sample document."); + vocab(doc) + +OrderedDict{String, Int64} with 8 entries: + "This" => 1 + "is" => 2 + "a" => 3 + "sample" => 4 + "sentence" => 5 + ⋮ => ⋮ + +julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"]; + vocab(str_vec) + +OrderedDict{String, Int64} with 7 entries: + "This" => 1 + "is" => 2 + "a" => 3 + "sample" => 4 + "sentence" => 5 + ⋮ => ⋮ +""" +function vocab(input::Union{StringDocument,Vector{String}}) + string_vector = to_string_vector(input) + string_vector = length(string_vector) != length(unique(string_vector)) ? unique(string_vector) : string_vector + + # preallocating the ordered dictionary with the size of the string_vector + ordered_dict = OrderedDict{String,Int}() + sizehint!(ordered_dict, length(string_vector)) + + # reverse the order of the keys and values in the enumerate iterator to get an ordered dict. 
+ for (index, key) in enumerate(string_vector) + ordered_dict[key] = index + end + return ordered_dict +end + + ############################################################################## # # ngrams() / ngrams!(): Access to document text as n-gram counts @@ -322,7 +388,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n. ngrams(d::NGramDocument) = d.ngrams ngrams(d::AbstractDocument) = ngrams(d, 1) -ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams) +ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams) function ngrams!(d::AbstractDocument, new_ngrams::Dict) error("The n-grams of $(typeof(d)) cannot be directly edited") end @@ -371,8 +437,8 @@ const GenericDocument = Union{ ############################################################################## Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str) -Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns) -Document(ng::Dict{String, Int}) = NGramDocument(ng) +Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns) +Document(ng::Dict{String,Int}) = NGramDocument(ng) ############################################################################## # @@ -383,11 +449,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng) function Base.convert(::Type{StringDocument}, d::FileDocument) StringDocument(text(d), d.metadata) end -function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument})) +function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument})) TokenDocument(tokens(d), d.metadata) end function Base.convert(::Type{NGramDocument}, - d::(Union{FileDocument, StringDocument, TokenDocument})) + d::(Union{FileDocument,StringDocument,TokenDocument})) NGramDocument(ngrams(d), 1, d.metadata) end Base.convert(::Type{TokenDocument}, d::TokenDocument) = d diff --git a/test/document.jl b/test/document.jl index e080f841..ec9aaa97 100644 --- a/test/document.jl +++ b/test/document.jl @@ -1,13 +1,13 @@ @testset "Document" begin - dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2")) - @test (dmeta.language == Languages.English()) && - (dmeta.title == "test title") && - (dmeta.author == "test author") && - (dmeta.timestamp == "test time") && - (get(dmeta.custom, :k1, "") == "v1") && - (get(dmeta.custom, :k2, "") == "v2") + dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2")) + @test (dmeta.language == Languages.English()) && + (dmeta.title == "test title") && + (dmeta.author == "test author") && + (dmeta.timestamp == "test time") && + (get(dmeta.custom, :k1, "") == "v1") && + (get(dmeta.custom, :k2, "") == "v2") # mutability dmeta.custom = nothing @@ -34,6 +34,9 @@ @test "a" in keys(ngrams(sd, 1)) @test "string" in keys(ngrams(sd, 1)) + @test vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test length(sd) == 16 hamlet_text = "To be or not to be..." 
@@ -79,8 +82,8 @@ @test isequal(length(Document("this is text")), 12) # NGramDocument creation with multiple ngram complexity - let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7) - for (n,c,l) in zip(N,C,L) + let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7) + for (n, c, l) in zip(N, C, L) ngd = NGramDocument(sample_text1, n...) @test ngram_complexity(ngd) == c @test length(ngd.ngrams) == l From 8bbbbcc6273953ba1faf5935a7768e8c7c491a83 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 20:38:30 +0200 Subject: [PATCH 3/5] Integrated suggestions made by @rssdev10 (except for where ordered_vocab will be stored) --- src/TextAnalysis.jl | 228 ++++++++++++++++++++++---------------------- src/coom.jl | 4 +- src/document.jl | 13 ++- test/document.jl | 1 + 4 files changed, 123 insertions(+), 123 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 2d6b0fad..c28f1589 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,115 +1,115 @@ module TextAnalysis -using SparseArrays -using Printf -using LinearAlgebra -using StatsBase: countmap, addcounts! -using Languages -using WordTokenizers -using Snowball - -using Tables -using DataStructures -using Statistics -using Serialization -using ProgressMeter -using DocStringExtensions - -import Base: depwarn, merge! -import Serialization: serialize, deserialize - -export AbstractDocument, Document -export FileDocument, StringDocument, TokenDocument, NGramDocument -export GenericDocument -export Corpus, DirectoryCorpus -export stemmer_types, Stemmer -export DocumentTermMatrix -export text, tokens, ngrams, vocab -export text!, tokens!, ngrams! -export documents -export language, title, author, timestamp -export languages, titles, authors, timestamps -export language!, title!, author!, timestamp! -export languages!, titles!, authors!, timestamps! -export ngram_complexity -export lexicon, update_lexicon!, lexical_frequency, lexicon_size -export inverse_index, update_inverse_index!, index_size -export remove_corrupt_utf8 -export remove_corrupt_utf8! -export remove_case -export remove_case! -export remove_words, remove_stop_words -export remove_words!, remove_stop_words! -export stem, tag_pos -export stem!, tag_pos! -export remove_html_tags, remove_html_tags! -export prepare! -export frequent_terms, sparse_terms -export remove_frequent_terms!, remove_sparse_terms! -export dtv, each_dtv, dtm, tdm -export TextHashFunction, index_hash, cardinality, hash_function, hash_function! -export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm -export CooMatrix, coom -export standardize! -export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity -export tf!, tf_idf!, bm_25!, lda! -export remove_patterns!, remove_patterns -export prune! - -export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation -export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles -export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags - -export NaiveBayesClassifier -export tag_scheme! 
- -export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax -export bleu_score - -export PerceptronTagger, fit!, predict - -export Vocabulary, lookup, update -export everygram, padding_ngram -export maskedscore, logscore, entropy, perplexity -export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score - -export tokenize #imported from WordTokenizers - -include("tokenizer.jl") -include("ngramizer.jl") -include("document.jl") -include("hash.jl") -include("corpus.jl") -include("metadata.jl") -include("preprocessing.jl") - -include("stemmer.jl") -include("dtm.jl") -include("tf_idf.jl") -include("lsa.jl") -include("lda.jl") -include("summarizer.jl") -include("show.jl") -include("bayes.jl") -include("deprecations.jl") -include("tagging_schemes.jl") -include("utils.jl") - -include("evaluation_metrics.jl") -include("translate_evaluation/bleu_score.jl") -include("coom.jl") - - - -# Lang_model -include("LM/vocab.jl") -include("LM/langmodel.jl") -include("LM/api.jl") -include("LM/counter.jl") -include("LM/preprocessing.jl") - - - -function __init__() - -end -end + using SparseArrays + using Printf + using LinearAlgebra + using StatsBase: countmap,addcounts! + using Languages + using WordTokenizers + using Snowball + + using Tables + using DataStructures + using Statistics + using Serialization + using ProgressMeter + using DocStringExtensions + + import Base: depwarn, merge! + import Serialization: serialize, deserialize + + export AbstractDocument, Document + export FileDocument, StringDocument, TokenDocument, NGramDocument + export GenericDocument + export Corpus, DirectoryCorpus + export stemmer_types, Stemmer + export DocumentTermMatrix + export text, tokens, ngrams, ordered_vocab + export text!, tokens!, ngrams! + export documents + export language, title, author, timestamp + export languages, titles, authors, timestamps + export language!, title!, author!, timestamp! + export languages!, titles!, authors!, timestamps! + export ngram_complexity + export lexicon, update_lexicon!, lexical_frequency, lexicon_size + export inverse_index, update_inverse_index!, index_size + export remove_corrupt_utf8 + export remove_corrupt_utf8! + export remove_case + export remove_case! + export remove_words, remove_stop_words + export remove_words!, remove_stop_words! + export stem, tag_pos + export stem!, tag_pos! + export remove_html_tags, remove_html_tags! + export prepare! + export frequent_terms, sparse_terms + export remove_frequent_terms!, remove_sparse_terms! + export dtv, each_dtv, dtm, tdm + export TextHashFunction, index_hash, cardinality, hash_function, hash_function! + export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm + export CooMatrix, coom + export standardize! + export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity + export tf!, tf_idf!, bm_25!, lda! + export remove_patterns!, remove_patterns + export prune! + + export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation + export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles + export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags + + export NaiveBayesClassifier + export tag_scheme! 
+ + export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax + export bleu_score + + export PerceptronTagger, fit!, predict + + export Vocabulary, lookup, update + export everygram, padding_ngram + export maskedscore, logscore, entropy, perplexity + export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score + + export tokenize #imported from WordTokenizers + + include("tokenizer.jl") + include("ngramizer.jl") + include("document.jl") + include("hash.jl") + include("corpus.jl") + include("metadata.jl") + include("preprocessing.jl") + + include("stemmer.jl") + include("dtm.jl") + include("tf_idf.jl") + include("lsa.jl") + include("lda.jl") + include("summarizer.jl") + include("show.jl") + include("bayes.jl") + include("deprecations.jl") + include("tagging_schemes.jl") + include("utils.jl") + + include("evaluation_metrics.jl") + include("translate_evaluation/bleu_score.jl") + include("coom.jl") + + + + # Lang_model + include("LM/vocab.jl") + include("LM/langmodel.jl") + include("LM/api.jl") + include("LM/counter.jl") + include("LM/preprocessing.jl") + + + + function __init__() + + end +end \ No newline at end of file diff --git a/src/coom.jl b/src/coom.jl index 031d5bc6..7d320a73 100644 --- a/src/coom.jl +++ b/src/coom.jl @@ -22,7 +22,7 @@ of not the counts by the distance between word positions. The `mode` keyword can julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3) + vocab = ordered_vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true) 3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: @@ -37,7 +37,7 @@ julia> using TextAnalysis, DataStructures julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = vocab(doc) + vocab = ordered_vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional) 13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: diff --git a/src/document.jl b/src/document.jl index 5163fde5..7223765e 100644 --- a/src/document.jl +++ b/src/document.jl @@ -295,7 +295,7 @@ to_string_vector(doc::StringDocument) = tokens(doc) to_string_vector(vec::Vector{String}) = vec """ - vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} + ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for creating cooccurrence matrices with coo_matrix() (cf. example below). The dictionary maps each unique string to its corresponding index. 
@@ -310,7 +310,7 @@ Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (u # Examples ```julia julia> doc = StringDocument("This is a sample sentence of a sample document."); - vocab(doc) + ordered_vocab(doc) OrderedDict{String, Int64} with 8 entries: "This" => 1 @@ -321,8 +321,8 @@ OrderedDict{String, Int64} with 8 entries: ⋮ => ⋮ julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"]; - vocab(str_vec) - + ordered_vocab(str_vec) + OrderedDict{String, Int64} with 7 entries: "This" => 1 "is" => 2 @@ -332,14 +332,13 @@ OrderedDict{String, Int64} with 7 entries: ⋮ => ⋮ """ function vocab(input::Union{StringDocument,Vector{String}}) - string_vector = to_string_vector(input) - string_vector = length(string_vector) != length(unique(string_vector)) ? unique(string_vector) : string_vector + string_vector = to_string_vector(input) |> unique # preallocating the ordered dictionary with the size of the string_vector ordered_dict = OrderedDict{String,Int}() sizehint!(ordered_dict, length(string_vector)) - # reverse the order of the keys and values in the enumerate iterator to get an ordered dict. + # populating the ordered dictionary for (index, key) in enumerate(string_vector) ordered_dict[key] = index end diff --git a/test/document.jl b/test/document.jl index ec9aaa97..ea5ecafd 100644 --- a/test/document.jl +++ b/test/document.jl @@ -1,3 +1,4 @@ +using DataStructures: OrderedDict @testset "Document" begin From 29a2558f1bf5a8824a9d3f5c4a2a1152200fbdb0 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 20:41:12 +0200 Subject: [PATCH 4/5] Added julia-repl on the ordered_vocab() documentation example. --- src/document.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/document.jl b/src/document.jl index 7223765e..84f75d6c 100644 --- a/src/document.jl +++ b/src/document.jl @@ -308,7 +308,7 @@ Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (u and the value is the index of that string in the original input. # Examples -```julia +```julia-repl julia> doc = StringDocument("This is a sample sentence of a sample document."); ordered_vocab(doc) From 0e53fc89b7e59a03e7c920ccf875bf97927796c8 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 21:02:45 +0200 Subject: [PATCH 5/5] All tests are passed locally. 
--- src/document.jl | 2 +- test/document.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/document.jl b/src/document.jl index 84f75d6c..e432ce9a 100644 --- a/src/document.jl +++ b/src/document.jl @@ -331,7 +331,7 @@ OrderedDict{String, Int64} with 7 entries: "sentence" => 5 ⋮ => ⋮ """ -function vocab(input::Union{StringDocument,Vector{String}}) +function ordered_vocab(input::Union{StringDocument,Vector{String}}) string_vector = to_string_vector(input) |> unique # preallocating the ordered dictionary with the size of the string_vector diff --git a/test/document.jl b/test/document.jl index ea5ecafd..7eca0df5 100644 --- a/test/document.jl +++ b/test/document.jl @@ -35,8 +35,8 @@ using DataStructures: OrderedDict @test "a" in keys(ngrams(sd, 1)) @test "string" in keys(ngrams(sd, 1)) - @test vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) - @test vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) @test length(sd) == 16
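
---

Taken together, the series adds `ordered_vocab` (introduced as `vocab` in PATCH 2/5 and renamed in PATCH 5/5): it maps each unique token of a `StringDocument` or `Vector{String}` to its first-occurrence index in an `OrderedDict{String, Int}`. A minimal usage sketch of the branch's final state, mirroring the docstring and test expectations above (it assumes `TextAnalysis` at this branch plus `DataStructures` are installed):

```julia
using TextAnalysis
using DataStructures: OrderedDict

# From a StringDocument: tokens are extracted first, then deduplicated in order.
sd = StringDocument("This is a string")
ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)  # true

# From a Vector{String}: duplicates keep the index of their first occurrence.
v = ordered_vocab(["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"])
length(v)    # 7 unique entries, per the docstring example
v["sample"]  # 4
```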
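The motivating use case, per the updated `coo_matrix` docstring in PATCH 2/5–3/5, is building a co-occurrence matrix over a document's full vocabulary instead of a hand-written `OrderedDict`. A sketch following those docstring examples (window of 5, counts normalized by word distance, with `:directional` passed positionally as the examples do; per the docstring output the sentence tokenizes to 13 unique tokens, hence the 13×13 matrices shown above):

```julia
using TextAnalysis

doc   = StringDocument("This is a text about an apple. There are many texts about apples.")
docv  = TextAnalysis.tokenize(language(doc), text(doc))
vocab = ordered_vocab(doc)

# Symmetric co-occurrence weights within a 5-token window, distance-normalized.
C = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

# Directional variant, as in the second docstring example.
Cd = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)

size(C)  # (13, 13)
```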