Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vocabulary extraction #280

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
docs/build
Manifest.toml
.vscode/
4 changes: 2 additions & 2 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ module TextAnalysis
export Corpus, DirectoryCorpus
export stemmer_types, Stemmer
export DocumentTermMatrix
export text, tokens, ngrams
export text, tokens, ngrams, ordered_vocab
export text!, tokens!, ngrams!
export documents
export language, title, author, timestamp
@@ -112,4 +112,4 @@ module TextAnalysis
function __init__()

end
end
end
27 changes: 16 additions & 11 deletions src/coom.jl
Original file line number Diff line number Diff line change
@@ -22,26 +22,31 @@ or not the counts by the distance between word positions. The `mode` keyword can
julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 0.3999
[2, 3] = 0.3999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅

julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 1.0
[1, 2] = 1.0
[3, 2] = 0.1999
[2, 3] = 0.1999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅
```
"""
function coo_matrix(::Type{T},
87 changes: 76 additions & 11 deletions src/document.jl
Original file line number Diff line number Diff line change
@@ -46,7 +46,7 @@ end
#
##############################################################################

abstract type AbstractDocument; end
abstract type AbstractDocument end


mutable struct FileDocument <: AbstractDocument
@@ -142,7 +142,7 @@ A TokenDocument{String}
function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
TokenDocument(tokenize(dm.language, String(txt)), dm)
end
function TokenDocument(tkns::Vector{T}) where T <: AbstractString
function TokenDocument(tkns::Vector{T}) where {T<:AbstractString}
TokenDocument(tkns, DocumentMetadata())
end
TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
@@ -189,7 +189,7 @@ end
function NGramDocument(txt::AbstractString, n::Integer...=1)
NGramDocument(txt, DocumentMetadata(), n...)
end
function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString}
NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
end

@@ -270,17 +270,82 @@ julia> tokens(sd)
"."
```
"""
tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d))
tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d))
tokens(d::TokenDocument) = d.tokens
function tokens(d::NGramDocument)
error("The tokens of an NGramDocument cannot be reconstructed")
end

tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString
tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString}
error("The tokens of a $(typeof(d)) cannot be directly edited")
end


##############################################################################
#
# vocab() / vocab!(): Access to document text as a vocabulary
#
# to_string_vector(): Helper function for creating a vocabulary from a StringDocument or a Vector{String}
#
##############################################################################
# Extract the token stream of a `StringDocument` as a `Vector{String}`
# (helper for building a vocabulary from a document).
function to_string_vector(doc::StringDocument)
    return tokens(doc)
end
# A `Vector{String}` is already in the target form; pass it through unchanged.
function to_string_vector(vec::Vector{String})
    return vec
end

"""
    ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int}

Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for
creating co-occurrence matrices with `coo_matrix()`; cf. example below). The dictionary maps
each unique string to its corresponding index.

# Arguments
- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`.
    For `StringDocument`, the tokens are extracted and used. For `Vector{String}`, the vector itself is used.

# Returns
- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input,
    and the value is the 1-based rank of that string's first occurrence (duplicates keep the
    index assigned at their first appearance).

# Examples
```julia-repl
julia> doc = StringDocument("This is a sample sentence of a sample document.");
ordered_vocab(doc)

OrderedDict{String, Int64} with 8 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮

julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"];
ordered_vocab(str_vec)

OrderedDict{String, Int64} with 7 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮
```
"""
function ordered_vocab(input::Union{StringDocument,Vector{String}})
    # `unique` preserves first-occurrence order, which fixes the index of each key.
    string_vector = unique(to_string_vector(input))

    # Preallocate the ordered dictionary to its final size to avoid rehashing.
    ordered_dict = OrderedDict{String,Int}()
    sizehint!(ordered_dict, length(string_vector))

    # Map each unique string to its 1-based position in first-occurrence order.
    for (index, key) in enumerate(string_vector)
        ordered_dict[key] = index
    end
    return ordered_dict
end


##############################################################################
#
# ngrams() / ngrams!(): Access to document text as n-gram counts
@@ -322,7 +387,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n.
ngrams(d::NGramDocument) = d.ngrams
ngrams(d::AbstractDocument) = ngrams(d, 1)

ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams)
ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams)
function ngrams!(d::AbstractDocument, new_ngrams::Dict)
error("The n-grams of $(typeof(d)) cannot be directly edited")
end
@@ -371,8 +436,8 @@ const GenericDocument = Union{
##############################################################################

Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String, Int}) = NGramDocument(ng)
Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String,Int}) = NGramDocument(ng)

##############################################################################
#
@@ -383,11 +448,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng)
function Base.convert(::Type{StringDocument}, d::FileDocument)
StringDocument(text(d), d.metadata)
end
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument}))
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument}))
TokenDocument(tokens(d), d.metadata)
end
function Base.convert(::Type{NGramDocument},
d::(Union{FileDocument, StringDocument, TokenDocument}))
d::(Union{FileDocument,StringDocument,TokenDocument}))
NGramDocument(ngrams(d), 1, d.metadata)
end
Base.convert(::Type{TokenDocument}, d::TokenDocument) = d
22 changes: 13 additions & 9 deletions test/document.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using DataStructures: OrderedDict

@testset "Document" begin

dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")
dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")

# mutability
dmeta.custom = nothing
@@ -34,6 +35,9 @@
@test "a" in keys(ngrams(sd, 1))
@test "string" in keys(ngrams(sd, 1))

@test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)
@test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)

@test length(sd) == 16

hamlet_text = "To be or not to be..."
@@ -79,8 +83,8 @@
@test isequal(length(Document("this is text")), 12)

# NGramDocument creation with multiple ngram complexity
let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
for (n,c,l) in zip(N,C,L)
let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7)
for (n, c, l) in zip(N, C, L)
ngd = NGramDocument(sample_text1, n...)
@test ngram_complexity(ngd) == c
@test length(ngd.ngrams) == l