diff --git a/.gitignore b/.gitignore
index afd28c1..c7a0281 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,5 @@
 *.swp
 **.ipynb_checkpoints
-/docs/build
-/docs/site
+docs/build
+docs/site
diff --git a/.travis.yml b/.travis.yml
index 928998b..a6be970 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,10 +20,17 @@
 email: false
 # - julia -e 'Pkg.clone(pwd()); Pkg.build("CorpusLoaders"); Pkg.test("CorpusLoaders"; coverage=true)'
 after_success:
-  # Push Documentation
-  - julia -e 'Pkg.add("Documenter")'
-  - julia -e 'cd(Pkg.dir("CorpusLoaders")); include(joinpath("docs", "make.jl"))'
   # push coverage results to Coveralls
   - julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
   # push coverage results to Codecov
   - julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
+
+jobs:
+  include:
+    - stage: "Documentation"
+      julia: 1.0
+      os: linux
+      script:
+        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
+        - julia --project=docs/ docs/make.jl
+      after_success: skip
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 0000000..8cdb670
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
diff --git a/docs/make.jl b/docs/make.jl
index 0ac919a..bf92f8e 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,9 +1,19 @@
 using Documenter
 using CorpusLoaders
 
-makedocs(modules=[CorpusLoaders])
+makedocs(modules = [CorpusLoaders],
+         sitename = "CorpusLoaders",
+         pages = [
+             "Home" => "index.md",
+             "CoNLL" => "CoNLL.md",
+             "IMDB" => "IMDB.md",
+             "SemCor" => "SemCor.md",
+             "Senseval3" => "Senseval3.md",
+             "StanfordSentimentTreebank" => "StanfordSentimentTreebank.md",
+             "Twitter" => "Twitter.md",
+             "WikiCorpus" => "WikiCorpus.md",
+             "API Reference" => "APIReference.md"
+         ])
 
-deploydocs(deps = Deps.pip("mkdocs", "python-markdown-math"),
-           repo = "github.com/oxinabox/CorpusLoaders.jl.git",
-           osname = "linux")
+deploydocs(repo = "github.com/oxinabox/CorpusLoaders.jl.git")
 
diff --git a/docs/src/APIReference.md b/docs/src/APIReference.md
new file mode 100644
index 0000000..69bc727
--- /dev/null
+++ b/docs/src/APIReference.md
@@ -0,0 +1,6 @@
+# API Reference
+
+```@autodocs
+Modules = [CorpusLoaders]
+Order = [:function, :type]
+```
diff --git a/docs/src/CoNLL.md b/docs/src/CoNLL.md
index 0c1d4a4..fec6224 100644
--- a/docs/src/CoNLL.md
+++ b/docs/src/CoNLL.md
@@ -1,4 +1,4 @@
-## CoNLL 2003
+# CoNLL 2003
 
-The CoNLL-2003 shared task data files is made from the the Reuters Corpus, is a collection of news wire articles.
+The CoNLL-2003 shared task data files are made from the Reuters Corpus, a collection of news wire articles.
 
diff --git a/docs/src/IMDB.md b/docs/src/IMDB.md
index eb89993..1caaf80 100644
--- a/docs/src/IMDB.md
+++ b/docs/src/IMDB.md
@@ -1,4 +1,4 @@
-### IMDB
+# IMDB
 
-IMDB movie reviews dataset a standard collection for Binary Sentiment Analysis task. It is used for benchmarking Sentiment Analysis algorithms. It provides a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
+The IMDB movie reviews dataset is a standard collection for the binary sentiment analysis task and is used for benchmarking sentiment analysis algorithms. It provides a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
 Raw text and already processed bag of words formats are provided
@@ -19,7 +19,7 @@
 Example:
 
 #Using "test_neg" keywords for negative test set examples
 
-```
+```julia
 julia> dataset_test_neg = load(IMDB("test_neg"))
 Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)
 
 julia> docs = collect(take(dataset_test_neg, 2))
@@ -32,7 +32,7 @@
 #Using "train_pos" keyword for positive train set examples
 
-```
+```julia
 julia> dataset_train_pos = load(IMDB()) #no need to specify category because "train_pos" is default
 Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)
diff --git a/docs/src/SemCor.md b/docs/src/SemCor.md
index 894b97f..cb9327a 100644
--- a/docs/src/SemCor.md
+++ b/docs/src/SemCor.md
@@ -1,5 +1,4 @@
-
-## SemCor
+# SemCor
 
 The classical Sense Annotated corpus.
 See also [WordNet.jl](https://github.com/JuliaText/WordNet.jl)
diff --git a/docs/src/Senseval3.md b/docs/src/Senseval3.md
index 5f4bc1c..eb1e06b 100644
--- a/docs/src/Senseval3.md
+++ b/docs/src/Senseval3.md
@@ -1,4 +1,5 @@
-## Senseval-3
-Senseval-3 is a sense annotated corpus
-Has a structure of documents, sentences, words.
+# Senseval-3
+
+Senseval-3 is a sense-annotated corpus.
+It has a structure of documents, sentences, words.
 The words are either tagged with part of speech, or tagged with full lemma, part of speech and sensekey.
diff --git a/docs/src/StanfordSentimentTreebank.md b/docs/src/StanfordSentimentTreebank.md
index f098a05..ceebf0b 100644
--- a/docs/src/StanfordSentimentTreebank.md
+++ b/docs/src/StanfordSentimentTreebank.md
@@ -1,4 +1,5 @@
-### StanfordSentimentTreebank
-This contains sentiment part of famous dataset Stanford Sentiment Treebank V1.0 for [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) paper by Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts.
-The dataset gives the phases with their sentiment labels between 0 to 1. This dataset can be used as binary or fine-grained sentiment classification problems.
+# StanfordSentimentTreebank
+
+This contains the sentiment part of the famous Stanford Sentiment Treebank V1.0 dataset, from the paper [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) by Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts.
+The dataset gives the phrases with their sentiment labels between 0 and 1. It can be used for binary or fine-grained sentiment classification problems.
 
@@ -7,11 +8,11 @@
 documents/tweets, sentences, words, characters
 
 To get desired levels, `flatten_levels` function from [MultiResolutionIterators.jl](https://github.com/oxinabox/MultiResolutionIterators.jl) can be used.
 
-## Usage:
+## Usage
 
 The output dataset is a 2-dimensional `Array` with first column as `Vector`s of sentences as tokens and second column as their respective sentiment scores.
-```
+```julia
 julia> dataset = load(StanfordSentimentTreebank())
 239232×2 Array{Any,2}:
  Array{String,1}[["!"]]
 ```
@@ -50,9 +51,9 @@
-# To get phrases from `data`:
+### To get phrases from `dataset`
 
-```
+```julia
-julia> phrases = dataset[1:5, 1] #Here `data1`is a 2-D Array
+julia> phrases = dataset[1:5, 1] #Here `dataset` is a 2-D Array
 5-element Array{Any,1}:
  Array{String,1}[["!"]]
  Array{String,1}[["!"], ["Brilliant"]]
 ```
 
-# To get sentiments values:
+### To get sentiment values
 
-```
+```julia
-julia> values = data[1:5, 2] #Here "data" is a 2-D Array
+julia> values = dataset[1:5, 2] #Here `dataset` is a 2-D Array
 5-element Array{Any,1}:
 0.5
 0.86111
@@ -74,11 +75,11 @@
-# Using `flatten_levels`
+### Using `flatten_levels`
 
 To get an `Array` of all sentences from all the `phrases` (since each phrase can contain more than one sentence):
 
-```
+```julia
 julia> sentences = flatten_levels(phrases, (lvls)(StanfordSentimentTreebank, :documents))|>full_consolidate
 9-element Array{Array{String,1},1}:
 ["!"]
@@ -94,7 +95,7 @@
-To get `Array` of all the from `phrases`:
+To get an `Array` of all the words from `phrases`:
 
-```
+```julia
 julia> words = flatten_levels(phrases, (!lvls)(StanfordSentimentTreebank, :words))|>full_consolidate
 10-element Array{String,1}:
 "!"
diff --git a/docs/src/Twitter.md b/docs/src/Twitter.md
index 3a0734f..7ee4f6a 100644
--- a/docs/src/Twitter.md
+++ b/docs/src/Twitter.md
@@ -1,4 +1,4 @@
-## Twitter
+# Twitter
 
 Twitter sentiment dataset by Nick Sanders. Downloaded from [Sentiment140 site](http://help.sentiment140.com/for-students).
-It is large dataset for the Sentiment Analysis task. Every tweets falls in either three categories positive(4), negative(0) or neutral(2).It contains 1600000 training examples and 498 testing examples.
+It is a large dataset for the sentiment analysis task. Every tweet falls into one of three categories: positive (4), negative (0) or neutral (2). It contains 1,600,000 training examples and 498 testing examples.
@@ -18,7 +18,7 @@
 Example:
 
 #Using "test_pos" keyword for getting positive polarity sentiment examples
 
-```
+```julia
 julia> dataset_test_pos = load(Twitter("test_pos"))
 Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)
 
 julia> tweets = collect(take(dataset_test_pos, 2))
@@ -67,7 +67,7 @@
 #Using "train_pos" category to get positive polarity sentiment examples
 
-```
+```julia
 julia> dataset_train_pos = load(Twitter()) #no need to specify category because "train_pos" is default
 Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)
diff --git a/docs/src/WikiCorpus.md b/docs/src/WikiCorpus.md
index 6f0f8f3..e46d2af 100644
--- a/docs/src/WikiCorpus.md
+++ b/docs/src/WikiCorpus.md
@@ -1,5 +1,4 @@
-
-### WikiCorpus
+# WikiCorpus
 
-Very commonly used corpus in general.
+A very commonly used general-purpose corpus.
 The loader (and default datadep) is for [Samuel Reese's 2006 based corpus](http://www.lsi.upc.edu/~nlp/wikicorpus/).
@@ -17,7 +16,7 @@
 so should use `flatten_levels` (from MultiResolutionIterators.jl) to get rid of them.
 
 Example:
 
-```
+```julia
 julia> using CorpusLoaders;
 julia> using MultiResolutionIterators;
 julia> using Base.Iterators;
diff --git a/docs/src/index.md b/docs/src/index.md
index 51b6efe..7d6acd1 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,4 +1,44 @@
-```@autodocs
-Modules = [CorpusLoaders]
-Order = [:function, :type]
-```
+# CorpusLoaders.jl
+
+A collection of loaders for various corpora used in NLP.
+
+## Installation
+
+    pkg> add https://github.com/JuliaText/CorpusLoaders.jl
+
+## Common Structure
+
+For some corpus, which we will say has type `Corpus`,
+it will have a constructor `Corpus(path)`,
+where the `path` argument is a path to the files describing it.
+That path will default to a predefined data dependency, if not provided.
+The data dependency will be downloaded the first time you call `Corpus()`.
+When the datadep resolves, it will give full bibliographic details on the corpus.
+For more on that, such as configuration details, see [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl).
+
+Each corpus has a function `load(::Corpus)`.
+This will return some iterator of data.
+It is often lazy, e.g. using a `Channel`,
+as many corpora are too large to fit in memory comfortably.
+It will often be an iterator of iterators of iterators ...,
+designed to be manipulated using [MultiResolutionIterators.jl](https://github.com/oxinabox/MultiResolutionIterators.jl).
+The corpus type is an indexer for using named levels with MultiResolutionIterators.jl,
+so `lvls(Corpus, :para)` works.
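+
+As an end-to-end sketch of this pattern (a hypothetical session combining the WikiCorpus
+loader with the level flattening described above; exact output shapes vary by corpus):
+
+```julia
+using CorpusLoaders
+using MultiResolutionIterators
+using Base.Iterators
+
+# `load` returns a lazy iterator of documents (often a `Channel`).
+dataset = load(WikiCorpus())
+
+# Realise only the first two documents; the rest of the corpus stays unread.
+docs = collect(take(dataset, 2))
+
+# Flatten away every level except words, leaving one flat list of tokens.
+words = flatten_levels(docs, (!lvls)(WikiCorpus, :words)) |> full_consolidate
+```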