Vocabulary extraction #280

Open
wants to merge 5 commits into base: master
Changes from 2 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.DS_Store
docs/build
Manifest.toml
.vscode/
226 changes: 113 additions & 113 deletions src/TextAnalysis.jl
@@ -1,115 +1,115 @@
module TextAnalysis
using SparseArrays
using Printf
using LinearAlgebra
using StatsBase: countmap,addcounts!
using Languages
using WordTokenizers
using Snowball

using Tables
using DataStructures
using Statistics
using Serialization
using ProgressMeter
using DocStringExtensions

import Base: depwarn, merge!
import Serialization: serialize, deserialize

export AbstractDocument, Document
export FileDocument, StringDocument, TokenDocument, NGramDocument
export GenericDocument
export Corpus, DirectoryCorpus
export stemmer_types, Stemmer
export DocumentTermMatrix
export text, tokens, ngrams
export text!, tokens!, ngrams!
export documents
export language, title, author, timestamp
export languages, titles, authors, timestamps
export language!, title!, author!, timestamp!
export languages!, titles!, authors!, timestamps!
export ngram_complexity
export lexicon, update_lexicon!, lexical_frequency, lexicon_size
export inverse_index, update_inverse_index!, index_size
export remove_corrupt_utf8
export remove_corrupt_utf8!
export remove_case
export remove_case!
export remove_words, remove_stop_words
export remove_words!, remove_stop_words!
export stem, tag_pos
export stem!, tag_pos!
export remove_html_tags, remove_html_tags!
export prepare!
export frequent_terms, sparse_terms
export remove_frequent_terms!, remove_sparse_terms!
export dtv, each_dtv, dtm, tdm
export TextHashFunction, index_hash, cardinality, hash_function, hash_function!
export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm
export CooMatrix, coom
export standardize!
export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
export tf!, tf_idf!, bm_25!, lda!
export remove_patterns!, remove_patterns
export prune!

export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags

export NaiveBayesClassifier
export tag_scheme!

export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
export bleu_score

export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
export everygram, padding_ngram
export maskedscore, logscore, entropy, perplexity
export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score

export tokenize #imported from WordTokenizers

include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
include("hash.jl")
include("corpus.jl")
include("metadata.jl")
include("preprocessing.jl")

include("stemmer.jl")
include("dtm.jl")
include("tf_idf.jl")
include("lsa.jl")
include("lda.jl")
include("summarizer.jl")
include("show.jl")
include("bayes.jl")
include("deprecations.jl")
include("tagging_schemes.jl")
include("utils.jl")

include("evaluation_metrics.jl")
include("translate_evaluation/bleu_score.jl")
include("coom.jl")



# Lang_model
include("LM/vocab.jl")
include("LM/langmodel.jl")
include("LM/api.jl")
include("LM/counter.jl")
include("LM/preprocessing.jl")



function __init__()

end
using SparseArrays
using Printf
using LinearAlgebra
using StatsBase: countmap, addcounts!
using Languages
using WordTokenizers
using Snowball

using Tables
using DataStructures
using Statistics
using Serialization
using ProgressMeter
using DocStringExtensions

import Base: depwarn, merge!
import Serialization: serialize, deserialize

export AbstractDocument, Document
export FileDocument, StringDocument, TokenDocument, NGramDocument
export GenericDocument
export Corpus, DirectoryCorpus
export stemmer_types, Stemmer
export DocumentTermMatrix
export text, tokens, ngrams, vocab
export text!, tokens!, ngrams!
export documents
export language, title, author, timestamp
export languages, titles, authors, timestamps
export language!, title!, author!, timestamp!
export languages!, titles!, authors!, timestamps!
export ngram_complexity
export lexicon, update_lexicon!, lexical_frequency, lexicon_size
export inverse_index, update_inverse_index!, index_size
export remove_corrupt_utf8
export remove_corrupt_utf8!
export remove_case
export remove_case!
export remove_words, remove_stop_words
export remove_words!, remove_stop_words!
export stem, tag_pos
export stem!, tag_pos!
export remove_html_tags, remove_html_tags!
export prepare!
export frequent_terms, sparse_terms
export remove_frequent_terms!, remove_sparse_terms!
export dtv, each_dtv, dtm, tdm
export TextHashFunction, index_hash, cardinality, hash_function, hash_function!
export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm
export CooMatrix, coom
export standardize!
export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
export tf!, tf_idf!, bm_25!, lda!
export remove_patterns!, remove_patterns
export prune!

export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags

export NaiveBayesClassifier
export tag_scheme!

export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
export bleu_score

export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
export everygram, padding_ngram
export maskedscore, logscore, entropy, perplexity
export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score

export tokenize #imported from WordTokenizers

include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
include("hash.jl")
include("corpus.jl")
include("metadata.jl")
include("preprocessing.jl")

include("stemmer.jl")
include("dtm.jl")
include("tf_idf.jl")
include("lsa.jl")
include("lda.jl")
include("summarizer.jl")
include("show.jl")
include("bayes.jl")
include("deprecations.jl")
include("tagging_schemes.jl")
include("utils.jl")

include("evaluation_metrics.jl")
include("translate_evaluation/bleu_score.jl")
include("coom.jl")



# Lang_model
include("LM/vocab.jl")
include("LM/langmodel.jl")
include("LM/api.jl")
include("LM/counter.jl")
include("LM/preprocessing.jl")



function __init__()

end
end
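
The visible change to this file is the new `vocab` entry in the export list. A minimal, hedged usage sketch of that accessor follows; its definition is not part of this hunk, so the assumption that `vocab(doc)` returns an ordered token-to-index mapping accepted by `coo_matrix` is taken from the coom.jl docstring change below, and the names `vocab_dict` and `C` are illustrative only:

# Hedged sketch, not part of the diff above.
using TextAnalysis

doc  = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))

vocab_dict = vocab(doc)                                            # assumed: token => column index
C = TextAnalysis.coo_matrix(Float16, docv, vocab_dict, 5, true)    # 13×13 in the docstring example below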
25 changes: 15 additions & 10 deletions src/coom.jl
@@ -26,22 +26,27 @@ julia> using TextAnalysis, DataStructures
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 0.3999
[2, 3] = 0.3999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅

julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab_dict = vocab(doc)  # bind to a new name: `vocab` itself is the accessor imported from TextAnalysis
TextAnalysis.coo_matrix(Float16, docv, vocab_dict, 5, true, :directional)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 1.0
[1, 2] = 1.0
[3, 2] = 0.1999
[2, 3] = 0.1999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅
```
"""
function coo_matrix(::Type{T},
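
The values in the new example output follow a clear pattern: 2.0, 1.0, 0.6665, 0.5, 0.4 in the symmetric case and 1.0, 0.5, 0.3333, 0.25, 0.2 with :directional, which is consistent with each co-occurrence at token distance d inside the window of 5 contributing 1/d per direction. A short arithmetic sketch of that reading (an assumption inferred from the output above, not a statement of coo_matrix's documented formula):

# Assumption: a co-occurrence at distance d within the window contributes 1/d per direction.
window = 5
both_directions = [2 / d for d in 1:window]   # ≈ [2.0, 1.0, 0.6667, 0.5, 0.4]
directional     = [1 / d for d in 1:window]   # ≈ [1.0, 0.5, 0.3333, 0.25, 0.2]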