From ee99e857b99c97f77b5cf7c3bc79d496aa872872 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 13:39:21 +0200 Subject: [PATCH 1/5] edited the .gitignore file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e845b0fd..a92ef0bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store docs/build Manifest.toml +.vscode/ From d65986445c3f6c022bd1d37d63c86b34d2d7d65d Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 15:20:16 +0200 Subject: [PATCH 2/5] Added vocabulary extraction function. --- src/TextAnalysis.jl | 226 ++++++++++++++++++++++---------------------- src/coom.jl | 25 +++-- src/document.jl | 88 ++++++++++++++--- test/document.jl | 21 ++-- 4 files changed, 217 insertions(+), 143 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 53c1470d..2d6b0fad 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,115 +1,115 @@ module TextAnalysis - using SparseArrays - using Printf - using LinearAlgebra - using StatsBase: countmap,addcounts! - using Languages - using WordTokenizers - using Snowball - - using Tables - using DataStructures - using Statistics - using Serialization - using ProgressMeter - using DocStringExtensions - - import Base: depwarn, merge! - import Serialization: serialize, deserialize - - export AbstractDocument, Document - export FileDocument, StringDocument, TokenDocument, NGramDocument - export GenericDocument - export Corpus, DirectoryCorpus - export stemmer_types, Stemmer - export DocumentTermMatrix - export text, tokens, ngrams - export text!, tokens!, ngrams! - export documents - export language, title, author, timestamp - export languages, titles, authors, timestamps - export language!, title!, author!, timestamp! - export languages!, titles!, authors!, timestamps! - export ngram_complexity - export lexicon, update_lexicon!, lexical_frequency, lexicon_size - export inverse_index, update_inverse_index!, index_size - export remove_corrupt_utf8 - export remove_corrupt_utf8! - export remove_case - export remove_case! - export remove_words, remove_stop_words - export remove_words!, remove_stop_words! - export stem, tag_pos - export stem!, tag_pos! - export remove_html_tags, remove_html_tags! - export prepare! - export frequent_terms, sparse_terms - export remove_frequent_terms!, remove_sparse_terms! - export dtv, each_dtv, dtm, tdm - export TextHashFunction, index_hash, cardinality, hash_function, hash_function! - export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm - export CooMatrix, coom - export standardize! - export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity - export tf!, tf_idf!, bm_25!, lda! - export remove_patterns!, remove_patterns - export prune! - - export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation - export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles - export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags - - export NaiveBayesClassifier - export tag_scheme! 
- - export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax - export bleu_score - - export PerceptronTagger, fit!, predict - - export Vocabulary, lookup, update - export everygram, padding_ngram - export maskedscore, logscore, entropy, perplexity - export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score - - export tokenize #imported from WordTokenizers - - include("tokenizer.jl") - include("ngramizer.jl") - include("document.jl") - include("hash.jl") - include("corpus.jl") - include("metadata.jl") - include("preprocessing.jl") - - include("stemmer.jl") - include("dtm.jl") - include("tf_idf.jl") - include("lsa.jl") - include("lda.jl") - include("summarizer.jl") - include("show.jl") - include("bayes.jl") - include("deprecations.jl") - include("tagging_schemes.jl") - include("utils.jl") - - include("evaluation_metrics.jl") - include("translate_evaluation/bleu_score.jl") - include("coom.jl") - - - - # Lang_model - include("LM/vocab.jl") - include("LM/langmodel.jl") - include("LM/api.jl") - include("LM/counter.jl") - include("LM/preprocessing.jl") - - - - function __init__() - - end +using SparseArrays +using Printf +using LinearAlgebra +using StatsBase: countmap, addcounts! +using Languages +using WordTokenizers +using Snowball + +using Tables +using DataStructures +using Statistics +using Serialization +using ProgressMeter +using DocStringExtensions + +import Base: depwarn, merge! +import Serialization: serialize, deserialize + +export AbstractDocument, Document +export FileDocument, StringDocument, TokenDocument, NGramDocument +export GenericDocument +export Corpus, DirectoryCorpus +export stemmer_types, Stemmer +export DocumentTermMatrix +export text, tokens, ngrams, vocab +export text!, tokens!, ngrams! +export documents +export language, title, author, timestamp +export languages, titles, authors, timestamps +export language!, title!, author!, timestamp! +export languages!, titles!, authors!, timestamps! +export ngram_complexity +export lexicon, update_lexicon!, lexical_frequency, lexicon_size +export inverse_index, update_inverse_index!, index_size +export remove_corrupt_utf8 +export remove_corrupt_utf8! +export remove_case +export remove_case! +export remove_words, remove_stop_words +export remove_words!, remove_stop_words! +export stem, tag_pos +export stem!, tag_pos! +export remove_html_tags, remove_html_tags! +export prepare! +export frequent_terms, sparse_terms +export remove_frequent_terms!, remove_sparse_terms! +export dtv, each_dtv, dtm, tdm +export TextHashFunction, index_hash, cardinality, hash_function, hash_function! +export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm +export CooMatrix, coom +export standardize! +export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity +export tf!, tf_idf!, bm_25!, lda! +export remove_patterns!, remove_patterns +export prune! + +export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation +export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles +export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags + +export NaiveBayesClassifier +export tag_scheme! 
+ +export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax +export bleu_score + +export PerceptronTagger, fit!, predict + +export Vocabulary, lookup, update +export everygram, padding_ngram +export maskedscore, logscore, entropy, perplexity +export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score + +export tokenize #imported from WordTokenizers + +include("tokenizer.jl") +include("ngramizer.jl") +include("document.jl") +include("hash.jl") +include("corpus.jl") +include("metadata.jl") +include("preprocessing.jl") + +include("stemmer.jl") +include("dtm.jl") +include("tf_idf.jl") +include("lsa.jl") +include("lda.jl") +include("summarizer.jl") +include("show.jl") +include("bayes.jl") +include("deprecations.jl") +include("tagging_schemes.jl") +include("utils.jl") + +include("evaluation_metrics.jl") +include("translate_evaluation/bleu_score.jl") +include("coom.jl") + + + +# Lang_model +include("LM/vocab.jl") +include("LM/langmodel.jl") +include("LM/api.jl") +include("LM/counter.jl") +include("LM/preprocessing.jl") + + + +function __init__() + +end end diff --git a/src/coom.jl b/src/coom.jl index e76cb151..031d5bc6 100644 --- a/src/coom.jl +++ b/src/coom.jl @@ -26,22 +26,27 @@ julia> using TextAnalysis, DataStructures TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true) 3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: - [2, 1] = 2.0 - [1, 2] = 2.0 - [3, 2] = 0.3999 - [2, 3] = 0.3999 +13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: + ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ + ⋮ ⋮ ⋮ + ⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0 + ⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅ julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3) + vocab = vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional) -3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: - [2, 1] = 1.0 - [1, 2] = 1.0 - [3, 2] = 0.1999 - [2, 3] = 0.1999 +13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: + ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ + 0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ + ⋮ ⋮ ⋮ + ⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5 + ⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅ ``` """ function coo_matrix(::Type{T}, diff --git a/src/document.jl b/src/document.jl index 0d05c19a..5163fde5 100644 --- a/src/document.jl +++ b/src/document.jl @@ -46,7 +46,7 @@ end # ############################################################################## -abstract type AbstractDocument; end +abstract type AbstractDocument end mutable struct FileDocument <: AbstractDocument @@ -142,7 +142,7 @@ A TokenDocument{String} function TokenDocument(txt::AbstractString, dm::DocumentMetadata) TokenDocument(tokenize(dm.language, String(txt)), dm) end -function TokenDocument(tkns::Vector{T}) where T <: AbstractString +function TokenDocument(tkns::Vector{T}) where {T<:AbstractString} TokenDocument(tkns, DocumentMetadata()) end TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata()) @@ -189,7 +189,7 @@ end function NGramDocument(txt::AbstractString, n::Integer...=1) NGramDocument(txt, DocumentMetadata(), n...) 
end -function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString +function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString} NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata()) end @@ -270,17 +270,83 @@ julia> tokens(sd) "." ``` """ -tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d)) +tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d)) tokens(d::TokenDocument) = d.tokens function tokens(d::NGramDocument) error("The tokens of an NGramDocument cannot be reconstructed") end -tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens) -function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString +tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens) +function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString} error("The tokens of a $(typeof(d)) cannot be directly edited") end + +############################################################################## +# +# vocab() / vocab!(): Access to document text as a vocabulary +# +# to_string_vector(): Helper function for creating a vocabulary from a StringDocument or a Vector{String} +# +############################################################################## +# Converts a StringDocument to Vector{String} +to_string_vector(doc::StringDocument) = tokens(doc) +# Identity function for Vector{String} +to_string_vector(vec::Vector{String}) = vec + +""" + vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} + +Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for creating cooccurrence matrices with coo_matrix() (cf. example below). The dictionary maps each unique string to its corresponding index. + +# Arguments +- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`. + For `StringDocument`, the tokens are extracted and used. For `Vector{String}`, the vector itself is used. + +# Returns +- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input, + and the value is the index of that string in the original input. + +# Examples +```julia +julia> doc = StringDocument("This is a sample sentence of a sample document."); + vocab(doc) + +OrderedDict{String, Int64} with 8 entries: + "This" => 1 + "is" => 2 + "a" => 3 + "sample" => 4 + "sentence" => 5 + ⋮ => ⋮ + +julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"]; + vocab(str_vec) + +OrderedDict{String, Int64} with 7 entries: + "This" => 1 + "is" => 2 + "a" => 3 + "sample" => 4 + "sentence" => 5 + ⋮ => ⋮ +""" +function vocab(input::Union{StringDocument,Vector{String}}) + string_vector = to_string_vector(input) + string_vector = length(string_vector) != length(unique(string_vector)) ? unique(string_vector) : string_vector + + # preallocating the ordered dictionary with the size of the string_vector + ordered_dict = OrderedDict{String,Int}() + sizehint!(ordered_dict, length(string_vector)) + + # reverse the order of the keys and values in the enumerate iterator to get an ordered dict. 
+ for (index, key) in enumerate(string_vector) + ordered_dict[key] = index + end + return ordered_dict +end + + ############################################################################## # # ngrams() / ngrams!(): Access to document text as n-gram counts @@ -322,7 +388,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n. ngrams(d::NGramDocument) = d.ngrams ngrams(d::AbstractDocument) = ngrams(d, 1) -ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams) +ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams) function ngrams!(d::AbstractDocument, new_ngrams::Dict) error("The n-grams of $(typeof(d)) cannot be directly edited") end @@ -371,8 +437,8 @@ const GenericDocument = Union{ ############################################################################## Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str) -Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns) -Document(ng::Dict{String, Int}) = NGramDocument(ng) +Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns) +Document(ng::Dict{String,Int}) = NGramDocument(ng) ############################################################################## # @@ -383,11 +449,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng) function Base.convert(::Type{StringDocument}, d::FileDocument) StringDocument(text(d), d.metadata) end -function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument})) +function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument})) TokenDocument(tokens(d), d.metadata) end function Base.convert(::Type{NGramDocument}, - d::(Union{FileDocument, StringDocument, TokenDocument})) + d::(Union{FileDocument,StringDocument,TokenDocument})) NGramDocument(ngrams(d), 1, d.metadata) end Base.convert(::Type{TokenDocument}, d::TokenDocument) = d diff --git a/test/document.jl b/test/document.jl index e080f841..ec9aaa97 100644 --- a/test/document.jl +++ b/test/document.jl @@ -1,13 +1,13 @@ @testset "Document" begin - dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2")) - @test (dmeta.language == Languages.English()) && - (dmeta.title == "test title") && - (dmeta.author == "test author") && - (dmeta.timestamp == "test time") && - (get(dmeta.custom, :k1, "") == "v1") && - (get(dmeta.custom, :k2, "") == "v2") + dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2")) + @test (dmeta.language == Languages.English()) && + (dmeta.title == "test title") && + (dmeta.author == "test author") && + (dmeta.timestamp == "test time") && + (get(dmeta.custom, :k1, "") == "v1") && + (get(dmeta.custom, :k2, "") == "v2") # mutability dmeta.custom = nothing @@ -34,6 +34,9 @@ @test "a" in keys(ngrams(sd, 1)) @test "string" in keys(ngrams(sd, 1)) + @test vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test length(sd) == 16 hamlet_text = "To be or not to be..." 
@@ -79,8 +82,8 @@ @test isequal(length(Document("this is text")), 12) # NGramDocument creation with multiple ngram complexity - let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7) - for (n,c,l) in zip(N,C,L) + let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7) + for (n, c, l) in zip(N, C, L) ngd = NGramDocument(sample_text1, n...) @test ngram_complexity(ngd) == c @test length(ngd.ngrams) == l From 8bbbbcc6273953ba1faf5935a7768e8c7c491a83 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 20:38:30 +0200 Subject: [PATCH 3/5] Integrated suggestions made by @rssdev10 (except for where ordered_vocab will be stored) --- src/TextAnalysis.jl | 228 ++++++++++++++++++++++---------------------- src/coom.jl | 4 +- src/document.jl | 13 ++- test/document.jl | 1 + 4 files changed, 123 insertions(+), 123 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 2d6b0fad..c28f1589 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,115 +1,115 @@ module TextAnalysis -using SparseArrays -using Printf -using LinearAlgebra -using StatsBase: countmap, addcounts! -using Languages -using WordTokenizers -using Snowball - -using Tables -using DataStructures -using Statistics -using Serialization -using ProgressMeter -using DocStringExtensions - -import Base: depwarn, merge! -import Serialization: serialize, deserialize - -export AbstractDocument, Document -export FileDocument, StringDocument, TokenDocument, NGramDocument -export GenericDocument -export Corpus, DirectoryCorpus -export stemmer_types, Stemmer -export DocumentTermMatrix -export text, tokens, ngrams, vocab -export text!, tokens!, ngrams! -export documents -export language, title, author, timestamp -export languages, titles, authors, timestamps -export language!, title!, author!, timestamp! -export languages!, titles!, authors!, timestamps! -export ngram_complexity -export lexicon, update_lexicon!, lexical_frequency, lexicon_size -export inverse_index, update_inverse_index!, index_size -export remove_corrupt_utf8 -export remove_corrupt_utf8! -export remove_case -export remove_case! -export remove_words, remove_stop_words -export remove_words!, remove_stop_words! -export stem, tag_pos -export stem!, tag_pos! -export remove_html_tags, remove_html_tags! -export prepare! -export frequent_terms, sparse_terms -export remove_frequent_terms!, remove_sparse_terms! -export dtv, each_dtv, dtm, tdm -export TextHashFunction, index_hash, cardinality, hash_function, hash_function! -export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm -export CooMatrix, coom -export standardize! -export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity -export tf!, tf_idf!, bm_25!, lda! -export remove_patterns!, remove_patterns -export prune! - -export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation -export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles -export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags - -export NaiveBayesClassifier -export tag_scheme! 
- -export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax -export bleu_score - -export PerceptronTagger, fit!, predict - -export Vocabulary, lookup, update -export everygram, padding_ngram -export maskedscore, logscore, entropy, perplexity -export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score - -export tokenize #imported from WordTokenizers - -include("tokenizer.jl") -include("ngramizer.jl") -include("document.jl") -include("hash.jl") -include("corpus.jl") -include("metadata.jl") -include("preprocessing.jl") - -include("stemmer.jl") -include("dtm.jl") -include("tf_idf.jl") -include("lsa.jl") -include("lda.jl") -include("summarizer.jl") -include("show.jl") -include("bayes.jl") -include("deprecations.jl") -include("tagging_schemes.jl") -include("utils.jl") - -include("evaluation_metrics.jl") -include("translate_evaluation/bleu_score.jl") -include("coom.jl") - - - -# Lang_model -include("LM/vocab.jl") -include("LM/langmodel.jl") -include("LM/api.jl") -include("LM/counter.jl") -include("LM/preprocessing.jl") - - - -function __init__() - -end -end + using SparseArrays + using Printf + using LinearAlgebra + using StatsBase: countmap,addcounts! + using Languages + using WordTokenizers + using Snowball + + using Tables + using DataStructures + using Statistics + using Serialization + using ProgressMeter + using DocStringExtensions + + import Base: depwarn, merge! + import Serialization: serialize, deserialize + + export AbstractDocument, Document + export FileDocument, StringDocument, TokenDocument, NGramDocument + export GenericDocument + export Corpus, DirectoryCorpus + export stemmer_types, Stemmer + export DocumentTermMatrix + export text, tokens, ngrams, ordered_vocab + export text!, tokens!, ngrams! + export documents + export language, title, author, timestamp + export languages, titles, authors, timestamps + export language!, title!, author!, timestamp! + export languages!, titles!, authors!, timestamps! + export ngram_complexity + export lexicon, update_lexicon!, lexical_frequency, lexicon_size + export inverse_index, update_inverse_index!, index_size + export remove_corrupt_utf8 + export remove_corrupt_utf8! + export remove_case + export remove_case! + export remove_words, remove_stop_words + export remove_words!, remove_stop_words! + export stem, tag_pos + export stem!, tag_pos! + export remove_html_tags, remove_html_tags! + export prepare! + export frequent_terms, sparse_terms + export remove_frequent_terms!, remove_sparse_terms! + export dtv, each_dtv, dtm, tdm + export TextHashFunction, index_hash, cardinality, hash_function, hash_function! + export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm + export CooMatrix, coom + export standardize! + export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity + export tf!, tf_idf!, bm_25!, lda! + export remove_patterns!, remove_patterns + export prune! + + export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation + export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles + export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags + + export NaiveBayesClassifier + export tag_scheme! 
+ + export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax + export bleu_score + + export PerceptronTagger, fit!, predict + + export Vocabulary, lookup, update + export everygram, padding_ngram + export maskedscore, logscore, entropy, perplexity + export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score + + export tokenize #imported from WordTokenizers + + include("tokenizer.jl") + include("ngramizer.jl") + include("document.jl") + include("hash.jl") + include("corpus.jl") + include("metadata.jl") + include("preprocessing.jl") + + include("stemmer.jl") + include("dtm.jl") + include("tf_idf.jl") + include("lsa.jl") + include("lda.jl") + include("summarizer.jl") + include("show.jl") + include("bayes.jl") + include("deprecations.jl") + include("tagging_schemes.jl") + include("utils.jl") + + include("evaluation_metrics.jl") + include("translate_evaluation/bleu_score.jl") + include("coom.jl") + + + + # Lang_model + include("LM/vocab.jl") + include("LM/langmodel.jl") + include("LM/api.jl") + include("LM/counter.jl") + include("LM/preprocessing.jl") + + + + function __init__() + + end +end \ No newline at end of file diff --git a/src/coom.jl b/src/coom.jl index 031d5bc6..7d320a73 100644 --- a/src/coom.jl +++ b/src/coom.jl @@ -22,7 +22,7 @@ of not the counts by the distance between word positions. The `mode` keyword can julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3) + vocab = ordered_vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true) 3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: @@ -37,7 +37,7 @@ julia> using TextAnalysis, DataStructures julia> using TextAnalysis, DataStructures doc = StringDocument("This is a text about an apple. There are many texts about apples.") docv = TextAnalysis.tokenize(language(doc), text(doc)) - vocab = vocab(doc) + vocab = ordered_vocab(doc) TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional) 13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries: diff --git a/src/document.jl b/src/document.jl index 5163fde5..7223765e 100644 --- a/src/document.jl +++ b/src/document.jl @@ -295,7 +295,7 @@ to_string_vector(doc::StringDocument) = tokens(doc) to_string_vector(vec::Vector{String}) = vec """ - vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} + ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int} Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for creating cooccurrence matrices with coo_matrix() (cf. example below). The dictionary maps each unique string to its corresponding index. 
@@ -310,7 +310,7 @@ Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (u # Examples ```julia julia> doc = StringDocument("This is a sample sentence of a sample document."); - vocab(doc) + ordered_vocab(doc) OrderedDict{String, Int64} with 8 entries: "This" => 1 @@ -321,8 +321,8 @@ OrderedDict{String, Int64} with 8 entries: ⋮ => ⋮ julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"]; - vocab(str_vec) - + ordered_vocab(str_vec) + OrderedDict{String, Int64} with 7 entries: "This" => 1 "is" => 2 @@ -332,14 +332,13 @@ OrderedDict{String, Int64} with 7 entries: ⋮ => ⋮ """ function vocab(input::Union{StringDocument,Vector{String}}) - string_vector = to_string_vector(input) - string_vector = length(string_vector) != length(unique(string_vector)) ? unique(string_vector) : string_vector + string_vector = to_string_vector(input) |> unique # preallocating the ordered dictionary with the size of the string_vector ordered_dict = OrderedDict{String,Int}() sizehint!(ordered_dict, length(string_vector)) - # reverse the order of the keys and values in the enumerate iterator to get an ordered dict. + # populating the ordered dictionary for (index, key) in enumerate(string_vector) ordered_dict[key] = index end diff --git a/test/document.jl b/test/document.jl index ec9aaa97..ea5ecafd 100644 --- a/test/document.jl +++ b/test/document.jl @@ -1,3 +1,4 @@ +using DataStructures: OrderedDict @testset "Document" begin From 29a2558f1bf5a8824a9d3f5c4a2a1152200fbdb0 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 20:41:12 +0200 Subject: [PATCH 4/5] Added julia-repl on the ordered_vocab() documentation example. --- src/document.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/document.jl b/src/document.jl index 7223765e..84f75d6c 100644 --- a/src/document.jl +++ b/src/document.jl @@ -308,7 +308,7 @@ Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (u and the value is the index of that string in the original input. # Examples -```julia +```julia-repl julia> doc = StringDocument("This is a sample sentence of a sample document."); ordered_vocab(doc) From 0e53fc89b7e59a03e7c920ccf875bf97927796c8 Mon Sep 17 00:00:00 2001 From: atantos Date: Wed, 10 Jan 2024 21:02:45 +0200 Subject: [PATCH 5/5] All tests are passed locally. 
--- src/document.jl | 2 +- test/document.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/document.jl b/src/document.jl index 84f75d6c..e432ce9a 100644 --- a/src/document.jl +++ b/src/document.jl @@ -331,7 +331,7 @@ OrderedDict{String, Int64} with 7 entries: "sentence" => 5 ⋮ => ⋮ """ -function vocab(input::Union{StringDocument,Vector{String}}) +function ordered_vocab(input::Union{StringDocument,Vector{String}}) string_vector = to_string_vector(input) |> unique # preallocating the ordered dictionary with the size of the string_vector diff --git a/test/document.jl b/test/document.jl index ea5ecafd..7eca0df5 100644 --- a/test/document.jl +++ b/test/document.jl @@ -35,8 +35,8 @@ using DataStructures: OrderedDict @test "a" in keys(ngrams(sd, 1)) @test "string" in keys(ngrams(sd, 1)) - @test vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) - @test vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) + @test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4) @test length(sd) == 16
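
---

Taken together, the series adds `ordered_vocab` (introduced as `vocab` in PATCH 2/5 and renamed in PATCH 5/5): it maps each unique token of a `StringDocument` or `Vector{String}` to its first-occurrence index in an `OrderedDict{String, Int}`. A minimal usage sketch of the branch's final state, mirroring the docstring and test expectations above (it assumes `TextAnalysis` at this branch plus `DataStructures` are installed):

```julia
using TextAnalysis
using DataStructures: OrderedDict

# From a StringDocument: tokens are extracted first, then deduplicated in order.
sd = StringDocument("This is a string")
ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)  # true

# From a Vector{String}: duplicates keep the index of their first occurrence.
v = ordered_vocab(["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"])
length(v)    # 7 unique entries, per the docstring example
v["sample"]  # 4
```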
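The motivating use case, per the updated `coo_matrix` docstring in PATCH 2/5–3/5, is building a co-occurrence matrix over a document's full vocabulary instead of a hand-written `OrderedDict`. A sketch following those docstring examples (window of 5, counts normalized by word distance, with `:directional` passed positionally as the examples do; per the docstring output the sentence tokenizes to 13 unique tokens, hence the 13×13 matrices shown above):

```julia
using TextAnalysis

doc   = StringDocument("This is a text about an apple. There are many texts about apples.")
docv  = TextAnalysis.tokenize(language(doc), text(doc))
vocab = ordered_vocab(doc)

# Symmetric co-occurrence weights within a 5-token window, distance-normalized.
C = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

# Directional variant, as in the second docstring example.
Cd = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)

size(C)  # (13, 13)
```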