
Fixing documentation #26

Merged · 8 commits · Aug 6, 2019

Changes from 7 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
*.jl.mem
*.swp
**.ipynb_checkpoints

docs/build
docs/site
13 changes: 10 additions & 3 deletions .travis.yml
@@ -20,10 +20,17 @@ email: false
# - julia -e 'Pkg.clone(pwd()); Pkg.build("CorpusLoaders"); Pkg.test("CorpusLoaders"; coverage=true)'

after_success:
# Push Documentation
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("CorpusLoaders")); include(joinpath("docs", "make.jl"))'
# push coverage results to Coveralls
- julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
# push coverage results to Codecov
- julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'

jobs:
  include:
    - stage: "Documentation"
      julia: 1.0
      os: linux
      script:
        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
        - julia --project=docs/ docs/make.jl
      after_success: skip
3 changes: 3 additions & 0 deletions docs/Project.toml
@@ -0,0 +1,3 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
20 changes: 16 additions & 4 deletions docs/make.jl
@@ -1,9 +1,21 @@
using Documenter
using CorpusLoaders

makedocs(modules=[CorpusLoaders])
makedocs(modules = [CorpusLoaders],
         sitename = "CorpusLoaders",
         pages = [
             "Home" => "index.md",
             "CoNLL" => "CoNLL.md",
             "IMDB" => "IMDB.md",
             "SemCor" => "SemCor.md",
             "Senseval3" => "Senseval3.md",
             "StanfordSentimentTreebank" => "StanfordSentimentTreebank.md",
             "Twitter" => "Twitter.md",
             "WikiCorpus" => "WikiCorpus.md",
             "API References" => "APIReference.md"
         ])


deploydocs(deps = Deps.pip("mkdocs", "python-markdown-math"),
           repo = "github.com/oxinabox/CorpusLoaders.jl.git",
           osname = "linux")
deploydocs(deps = Deps.pip("mkdocs", "python-markdown-math"),
           repo = "github.com/oxinabox/CorpusLoaders.jl.git"
)
6 changes: 6 additions & 0 deletions docs/src/APIReference.md
@@ -0,0 +1,6 @@
# API References

```@autodocs
Modules = [CorpusLoaders]
Order = [:function, :type]
```
2 changes: 1 addition & 1 deletion docs/src/CoNLL.md
@@ -1,4 +1,4 @@
## CoNLL 2003
# CoNLL 2003
The CoNLL-2003 shared task data files
are made from the Reuters Corpus,
a collection of news wire articles.
6 changes: 3 additions & 3 deletions docs/src/IMDB.md
@@ -1,4 +1,4 @@
### IMDB
# IMDB

The IMDB movie reviews dataset is a standard collection for the binary sentiment analysis task and is used for benchmarking sentiment analysis algorithms. It provides a set of 25,000 highly polar movie reviews for training and 25,000 for testing, along with additional unlabeled data. Both raw text and an already-processed bag-of-words format are provided.

@@ -19,7 +19,7 @@ Example:

#Using "test_neg" keyword for negative test set examples

```
```julia
julia> dataset_test_neg = load(IMDB("test_neg"))
Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)

@@ -32,7 +32,7 @@ julia> docs = collect(take(dataset_test_neg, 2))

#Using "train_pos" keyword for positive train set examples

```
```julia
julia> dataset_train_pos = load(IMDB()) #no need to specify category because "train_pos" is default
Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)

3 changes: 1 addition & 2 deletions docs/src/SemCor.md
@@ -1,5 +1,4 @@

## SemCor
# SemCor

The classical Sense Annotated corpus.
See also [WordNet.jl](https://github.com/JuliaText/WordNet.jl)
3 changes: 2 additions & 1 deletion docs/src/Senseval3.md
@@ -1,4 +1,5 @@
## Senseval-3
# Senseval-3

Senseval-3 is a sense-annotated corpus.
It has a structure of documents, sentences, words.
The words are either tagged with part of speech, or tagged with full lemma, part of speech and sensekey.
21 changes: 11 additions & 10 deletions docs/src/StanfordSentimentTreebank.md
@@ -1,4 +1,5 @@
### StanfordSentimentTreebank
# StanfordSentimentTreebank

This contains the sentiment part of the well-known Stanford Sentiment Treebank V1.0 dataset, released with the [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) paper by Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts.
The dataset gives phrases with sentiment labels between 0 and 1. It can be used for binary or fine-grained sentiment classification problems.

@@ -7,11 +8,11 @@ documents/tweets, sentences, words, characters

To get desired levels, `flatten_levels` function from [MultiResolutionIterators.jl](https://github.com/oxinabox/MultiResolutionIterators.jl) can be used.

## Usage:
## Usage

The output dataset is a 2-dimensional `Array` whose first column holds `Vector`s of tokenized sentences and whose second column holds their respective sentiment scores.

```
```julia
julia> dataset = load(StanfordSentimentTreebank())
239232×2 Array{Any,2}:
Array{String,1}[["!"]]
@@ -50,9 +51,9 @@ julia> dataset = load(StanfordSentimentTreebank())

```

# To get phrases from `data`:
### To get phrases from `dataset`:

```
```julia
julia> phrases = dataset[1:5, 1] #Here `dataset` is a 2-D Array
5-element Array{Any,1}:
Array{String,1}[["!"]]
@@ -62,9 +63,9 @@ julia> phrases = dataset[1:5, 1] #Here `dataset` is a 2-D Array
Array{String,1}[["!"], ["Brilliant"]]
```

# To get sentiments values:
### To get sentiment values:

```
```julia
julia> values = dataset[1:5, 2] #Here `dataset` is a 2-D Array
5-element Array{Any,1}:
0.5
@@ -74,11 +75,11 @@ julia> values = dataset[1:5, 2] #Here `dataset` is a 2-D Array
0.86111
```

# Using `flatten_levels`
### Using `flatten_levels`

To get an `Array` of all sentences from all the `phrases` (since each phrase can contain more than one sentence):

```
```julia
julia> sentences = flatten_levels(phrases, (lvls)(StanfordSentimentTreebank, :documents))|>full_consolidate
9-element Array{Array{String,1},1}:
["!"]
@@ -94,7 +95,7 @@ julia> sentences = flatten_levels(phrases, (lvls)(StanfordSentimentTreebank, :documents))|>full_consolidate

To get an `Array` of all the words from `phrases`:

```
```julia
julia> words = flatten_levels(phrases, (!lvls)(StanfordSentimentTreebank, :words))|>full_consolidate
10-element Array{String,1}:
"!"
6 changes: 3 additions & 3 deletions docs/src/Twitter.md
@@ -1,4 +1,4 @@
## Twitter
# Twitter

Twitter sentiment dataset by Nick Sanders. Downloaded from [Sentiment140 site](http://help.sentiment140.com/for-students).
It is a large dataset for the sentiment analysis task. Every tweet falls into one of three categories: positive (4), negative (0), or neutral (2). It contains 1,600,000 training examples and 498 testing examples.
@@ -18,7 +18,7 @@ Example:

#Using "test_pos" keyword for getting positive polarity sentiment examples

```
```julia
julia> dataset_test_pos = load(Twitter("test_pos"))
Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)

@@ -67,7 +67,7 @@ julia> tweets = collect(take(dataset_test_pos, 2))

#Using "train_pos" category to get positive polarity sentiment examples

```
```julia
julia> dataset_train_pos = load(Twitter()) #no need to specify category because "train_pos" is default
Channel{Array{Array{String,1},1}}(sz_max:4,sz_curr:4)

Expand Down
5 changes: 2 additions & 3 deletions docs/src/WikiCorpus.md
@@ -1,5 +1,4 @@

### WikiCorpus
# WikiCorpus

A very commonly used general-purpose corpus.
The loader (and default datadep) is for [Samuel Reese's 2006 based corpus](http://www.lsi.upc.edu/~nlp/wikicorpus/).
@@ -17,7 +16,7 @@ so should use `flatten_levels` (from MultiResolutionIterators.jl) to get rid of

Example:

```
```julia
julia> using CorpusLoaders;
julia> using MultiResolutionIterators;
julia> using Base.Iterators;
30 changes: 26 additions & 4 deletions docs/src/index.md
@@ -1,4 +1,26 @@
```@autodocs
Modules = [CorpusLoaders]
Order = [:function, :type]
```
# CorpusLoaders.jl

A collection of loaders for the various corpora used in NLP.

## Installation

pkg> add https://github.com/JuliaText/CorpusLoaders.jl

## Common Structure

For a corpus, which we will say has type `Corpus`,
there will be a constructor `Corpus(path)`,
where the `path` argument is a path to the files describing it.
If not provided, that path defaults to a predefined data dependency.
The data dependency will be downloaded the first time you call `Corpus()`.
When the datadep resolves, it will give full bibliographic details on the corpus etc.
For more on that, including configuration details, see [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl).
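As an illustrative sketch of such configuration (the `DATADEPS_ALWAYS_ACCEPT` environment variable belongs to DataDeps.jl, not to this package; see the DataDeps.jl README for the authoritative list of settings):

```julia
# Sketch: pre-accept the datadep download prompt, e.g. for non-interactive CI use.
# DATADEPS_ALWAYS_ACCEPT is a DataDeps.jl setting, not specific to CorpusLoaders.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

using CorpusLoaders
corpus = SemCor()   # first call resolves the datadep, triggering the download
```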

Each corpus has a function `load(::Corpus)`.
This will return some iterator of data.
It is often lazy, e.g. using a `Channel`,
as many corpora are too large to fit in memory comfortably.
It will often be an iterator of iterators of iterators...,
designed to be manipulated using [MultiResolutionIterators.jl](https://github.com/oxinabox/MultiResolutionIterators.jl).
The corpus type is an indexer for using named levels with MultiResolutionIterators.jl,
so `lvls(Corpus, :para)` works.
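
A minimal sketch of how these pieces combine, using `SemCor` as a stand-in for any of the corpus types documented above (the same pattern applies to each loader):

```julia
using CorpusLoaders
using MultiResolutionIterators
using Base.Iterators

dataset = load(SemCor())          # a lazy iterator of data, often a Channel
docs = collect(take(dataset, 2))  # realize just the first two documents

# Use the corpus type as an indexer for named levels,
# flattening away everything except the word level:
words = flatten_levels(docs, (!lvls)(SemCor, :words)) |> full_consolidate
```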