Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vocabulary extraction #280

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
docs/build
Manifest.toml
.vscode/
4 changes: 2 additions & 2 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ module TextAnalysis
export Corpus, DirectoryCorpus
export stemmer_types, Stemmer
export DocumentTermMatrix
export text, tokens, ngrams
export text, tokens, ngrams, ordered_vocab
export text!, tokens!, ngrams!
export documents
export language, title, author, timestamp
@@ -112,4 +112,4 @@ module TextAnalysis
function __init__()

end
end
end
27 changes: 16 additions & 11 deletions src/coom.jl
Original file line number Diff line number Diff line change
@@ -22,26 +22,31 @@ or not the counts by the distance between word positions. The `mode` keyword can
julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 0.3999
[2, 3] = 0.3999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅

julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 1.0
[1, 2] = 1.0
[3, 2] = 0.1999
[2, 3] = 0.1999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅
```
"""
function coo_matrix(::Type{T},
87 changes: 76 additions & 11 deletions src/document.jl
Original file line number Diff line number Diff line change
@@ -46,7 +46,7 @@ end
#
##############################################################################

abstract type AbstractDocument; end
abstract type AbstractDocument end


mutable struct FileDocument <: AbstractDocument
@@ -142,7 +142,7 @@ A TokenDocument{String}
function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
TokenDocument(tokenize(dm.language, String(txt)), dm)
end
function TokenDocument(tkns::Vector{T}) where T <: AbstractString
function TokenDocument(tkns::Vector{T}) where {T<:AbstractString}
TokenDocument(tkns, DocumentMetadata())
end
TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
@@ -189,7 +189,7 @@ end
function NGramDocument(txt::AbstractString, n::Integer...=1)
NGramDocument(txt, DocumentMetadata(), n...)
end
function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString}
NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
end

@@ -270,17 +270,82 @@ julia> tokens(sd)
"."
```
"""
tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d))
tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d))
tokens(d::TokenDocument) = d.tokens
function tokens(d::NGramDocument)
error("The tokens of an NGramDocument cannot be reconstructed")
end

tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString
tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString}
error("The tokens of a $(typeof(d)) cannot be directly edited")
end


##############################################################################
#
# vocab() / vocab!(): Access to document text as a vocabulary
#
# to_string_vector(): Helper function for creating a vocabulary from a StringDocument or a Vector{String}
#
##############################################################################
# Extract the token stream of a `StringDocument` as a `Vector{String}`
# (helper for building a vocabulary from a document).
function to_string_vector(doc::StringDocument)
    return tokens(doc)
end
# A `Vector{String}` is already in the target form; pass it through unchanged.
function to_string_vector(vec::Vector{String})
    return vec
end

"""
    ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int}

Create an ordered dictionary from a `StringDocument` or a `Vector` of strings (useful for
creating co-occurrence matrices with `coo_matrix()`; cf. example below). The dictionary maps
each unique string to its corresponding index.

# Arguments
- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`.
    For `StringDocument`, the tokens are extracted and used. For `Vector{String}`, the vector itself is used.

# Returns
- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input,
    and the value is the 1-based rank of that string's first occurrence (duplicates keep the
    index assigned at their first appearance).

# Examples
```julia-repl
julia> doc = StringDocument("This is a sample sentence of a sample document.");
ordered_vocab(doc)

OrderedDict{String, Int64} with 8 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮

julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"];
ordered_vocab(str_vec)

OrderedDict{String, Int64} with 7 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮
```
"""
function ordered_vocab(input::Union{StringDocument,Vector{String}})
    # `unique` preserves first-occurrence order, which fixes the index of each key.
    string_vector = unique(to_string_vector(input))

    # Preallocate the ordered dictionary to its final size to avoid rehashing.
    ordered_dict = OrderedDict{String,Int}()
    sizehint!(ordered_dict, length(string_vector))

    # Map each unique string to its 1-based position in first-occurrence order.
    for (index, key) in enumerate(string_vector)
        ordered_dict[key] = index
    end
    return ordered_dict
end


##############################################################################
#
# ngrams() / ngrams!(): Access to document text as n-gram counts
@@ -322,7 +387,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n.
ngrams(d::NGramDocument) = d.ngrams
ngrams(d::AbstractDocument) = ngrams(d, 1)

ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams)
ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams)
function ngrams!(d::AbstractDocument, new_ngrams::Dict)
error("The n-grams of $(typeof(d)) cannot be directly edited")
end
@@ -371,8 +436,8 @@ const GenericDocument = Union{
##############################################################################

Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String, Int}) = NGramDocument(ng)
Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String,Int}) = NGramDocument(ng)

##############################################################################
#
@@ -383,11 +448,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng)
function Base.convert(::Type{StringDocument}, d::FileDocument)
StringDocument(text(d), d.metadata)
end
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument}))
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument}))
TokenDocument(tokens(d), d.metadata)
end
function Base.convert(::Type{NGramDocument},
d::(Union{FileDocument, StringDocument, TokenDocument}))
d::(Union{FileDocument,StringDocument,TokenDocument}))
NGramDocument(ngrams(d), 1, d.metadata)
end
Base.convert(::Type{TokenDocument}, d::TokenDocument) = d
22 changes: 13 additions & 9 deletions test/document.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using DataStructures: OrderedDict

@testset "Document" begin

dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")
dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")

# mutability
dmeta.custom = nothing
@@ -34,6 +35,9 @@
@test "a" in keys(ngrams(sd, 1))
@test "string" in keys(ngrams(sd, 1))

@test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)
@test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)

@test length(sd) == 16

hamlet_text = "To be or not to be..."
@@ -79,8 +83,8 @@
@test isequal(length(Document("this is text")), 12)

# NGramDocument creation with multiple ngram complexity
let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
for (n,c,l) in zip(N,C,L)
let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7)
for (n, c, l) in zip(N, C, L)
ngd = NGramDocument(sample_text1, n...)
@test ngram_complexity(ngd) == c
@test length(ngd.ngrams) == l