Commit
Merge pull request #7 from JuliaText/ox/datadeps
Ox/datadeps
Showing 9 changed files with 757 additions and 17 deletions.
@@ -1,3 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
*.swp
**.ipynb_checkpoints
@@ -3,6 +3,8 @@ language: julia
os:
- linux
# - osx
env:
- DEPS_ALWAY_ACCEPT=true
julia:
- 0.6
- nightly
@@ -1,2 +1,5 @@
julia 0.6
LightXML
DataDeps 0.2.2
Glob
StringInterning
@@ -0,0 +1,198 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#=\n",
"# This is an implementation of\n",
"# GloVe: Global Vectors for Word Representation\n",
"# J. Pennington, R. Socher, C. Manning\n",
"# Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)\n",
"# https://nlp.stanford.edu/pubs/glove.pdf\n",
"# (Made from the paper without reference to the source code)\n",
"#\n",
"# Pennington et al.'s implementation is ~1K lines of C\n",
"=#\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"using CorpusLoaders\n",
"using MLDataUtils\n",
"using StringInterning\n",
"using DataStructures\n",
"using Optim\n",
"#using CatViews"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"function coocurs(data, hw=5)\n",
"    coocurs = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)\n",
"    distance_weights = [1f0/abs(d-hw) for d in 0:2hw if d!=hw]\n",
"    for (word_, window) in slidingwindow(i->[i-hw:i-1; i+1:i+hw], data, 1, stride=1)\n",
"        word = first(word_)\n",
"        for (weight, coword) in zip(distance_weights, window)\n",
"            coocurs[(word,coword)]+=weight\n",
"        end\n",
"    end\n",
"\n",
"    encoding = labelenc(last.(collect(keys(coocurs))))\n",
"    coocurs_mat = spzeros(Float32, nlabel(encoding), nlabel(encoding))\n",
"    for (coocurance, score) in coocurs\n",
"        inds = convertlabel.(LabelEnc.Indices(nlabel(encoding)), coocurance, encoding)\n",
"        coocurs_mat[inds...] = score\n",
"    end\n",
"    coocurs_mat, encoding\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"function glove(data, ndim=300, halfwindow=5)\n",
"    xco, encoding = coocurs(data, halfwindow)\n",
"    # sum f.(xco)\n",
"\n",
"    nwords = nlabel(encoding)\n",
"\n",
"    params = Float32[]\n",
"    # each word ii owns a contiguous slice of params: [w (ndim); v (ndim); b; c], starting at mm(ii)\n",
"    mm(ii) = (ii-1)*(2*ndim+2)+1\n",
"    getw(params, i) = begin @inbounds x=@view params[mm(i) : mm(i)+ndim-1]; x end\n",
"    getv(params, i) = begin @inbounds x=@view params[mm(i)+ndim : mm(i)+2ndim-1]; x end\n",
"    getb(params, i) = begin @inbounds x=params[mm(i)+2ndim]; x end\n",
"    getc(params, i) = begin @inbounds x=params[mm(i)+2ndim+1]; x end\n",
"\n",
"    for ii in 1:nwords\n",
"        vals = randn(Float32, 2ndim+2)\n",
"        append!(params, vals)\n",
"        #Base.Test.@test vals == [getw(params,ii); getv(params,ii); getb(params,ii); getc(params,ii)]\n",
"    end\n",
"\n",
"    xco_ijx = collect(zip(findnz(xco)...))\n",
"\n",
"    function loss(params)\n",
"        loss = 0f0\n",
"        @inbounds for (i, j, x) in xco_ijx\n",
"            wi = getw(params, i)\n",
"            vj = getv(params, j)\n",
"            bi = getb(params, i)\n",
"            cj = getc(params, j)\n",
"            loss += f(x)*(wi⋅vj + bi + cj - log(x))^2\n",
"        end\n",
"        loss\n",
"    end\n",
"\n",
"    result = optimize(loss, params, Optim.Options(show_every=1, show_trace=true))\n",
"    params = Optim.minimizer(result)\n",
"    [getw(params, i) for i in 1:nwords], encoding\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes, enc = glove(wikidata, 30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.1",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
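For reference, the quantity the notebook's `loss` closure computes is the weighted least-squares objective from the GloVe paper cited in the notebook's first cell, written here with the code's own names (`w`/`v` for the two embedding sets, `b`/`c` for the biases, `X` for the sparse co-occurrence matrix); this is a restatement of the published formula, not something introduced by this PR:

J = \sum_{(i,j)\,:\,X_{ij}\neq 0} f(X_{ij})\,\bigl(w_i \cdot v_j + b_i + c_j - \log X_{ij}\bigr)^2,
\qquad
f(x) = \begin{cases} (x/x_{\max})^{\alpha} & x \le x_{\max} \\ 1 & x > x_{\max} \end{cases}

with x_max = 100 and α = 3/4 matching the defaults of `f` above; `getw`, `getv`, `getb`, `getc` pick w_i, v_j, b_i, c_j out of the flat `params` vector.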
@@ -1,15 +1,27 @@
module CorpusLoaders

using LightXML
using DataDeps
using Glob
using BinDeps
using InternedStrings
using StringEncodings

const AbstractStringVector = AbstractVector{<:AbstractString}

function __init__()
    print(DataDeps)
    include("./datadeps.jl")
end


include("util.jl")

include("tokenizers.jl")
include("tags.jl")
include("semcor.jl")
include("semeval2007t7.jl")
include("./similarity.jl")
include("similarity.jl")
include("wikicorpus.jl")

end # module
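The `__init__` above includes a `datadeps.jl` file whose contents are not shown in this diff. As a rough, hypothetical sketch only of what registering the "Wikicorpus-en raw" dependency used by `load_wikicorpus` below might look like (written against the current `register(DataDep(...))` API of DataDeps.jl, which may differ from the 0.2.2 release pinned in REQUIRE; the message, URL, and archive name are placeholders, not the real corpus location):

# Hypothetical sketch -- not the datadeps.jl from this PR.
using DataDeps

register(DataDep(
    "Wikicorpus-en raw",
    "Wikicorpus (English, raw-text portion). Placeholder description; see the corpus licence.",
    "https://example.org/path/to/raw.en.tgz";   # placeholder URL
    post_fetch_method = unpack,                 # extract the archive after download
))

Registration is typically done from `__init__`, which matches where the include is placed above.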
@@ -0,0 +1,29 @@
export load_wikicorpus

"""
    load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)

Lazily loads text from Wikicorpus.
- If there are fewer than `sentence_min_length` tokens on a line, the line is skipped.
- This gets rid of a lot of cruft such as lists, titles, data-blocks, etc.
"""
function load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)
    Channel(ctype=InternedString, csize=2048) do ch
        for file in readdir(glob"*Text*", path)
            for sent in eachline(file, enc"latin1")
                # Note: wikicorpus is not correct XML,
                # so for now we just drop the document-identifying markup
                if any(startswith.(sent, ["<doc id", "</doc>", "ENDOFARTICLE."]))
                    continue
                end
                tokens = punctuation_space_tokenize(sent)
                if length(tokens) < sentence_min_length
                    continue
                else
                    put!.(ch, tokens)
                end
            end
        end
    end
end
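As a quick usage sketch, mirroring the new test added below (the token count is arbitrary, and it assumes the "Wikicorpus-en raw" data dependency is already available locally), the returned `Channel` can be consumed lazily:

using CorpusLoaders

# Pull the first 10_000 interned tokens from the lazy channel.
wk_gen = load_wikicorpus()
words = collect(Base.Iterators.take(wk_gen, 10_000))

words[1:5]  # peek at the first few tokens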
@@ -1,24 +1,21 @@
using CorpusLoaders
using Base.Test

@testset "Semcor" begin
    include("semcor.jl")
testsets = ["semcor.jl",
            "tokenizers.jl",
            "similarity.jl",
            "tags.jl",
            "util.jl",
            "semeval2007t7.jl",
            "wikicorpus.jl"
           ]

for fn in testsets
    @testset "$fn" begin
        include(fn)
    end
end


@testset "Tokenizers" begin
    include("./tokenizers.jl")
end

include("./similarity.jl")


@testset "tag conversion" begin
    include("tags.jl")
end

include("util.jl")

@testset "semeval2007t7" begin
    include("./semeval2007t7.jl")
end
@@ -0,0 +1,15 @@
using CorpusLoaders
using Base.Test
using DataDeps


@testset "basic use" begin
    wk_gen = load_wikicorpus()
    words = collect(Base.Iterators.take(wk_gen, 10_000));

    @test all(isa.(words, AbstractString))
    @test "a" ∈ words
    @test "the" ∈ words

end