Skip to content

Commit

Permalink
Merge pull request #7 from JuliaText/ox/datadeps
Browse files Browse the repository at this point in the history
Ox/datadeps
  • Loading branch information
oxinabox authored Apr 12, 2018
2 parents 22f6692 + f3b00c5 commit 65f349b
Show file tree
Hide file tree
Showing 9 changed files with 757 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
*.swp
**.ipynb_checkpoints
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ language: julia
os:
- linux
# - osx
env:
- DATADEPS_ALWAYS_ACCEPT=true
julia:
- 0.6
- nightly
Expand Down
3 changes: 3 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
julia 0.6
LightXML
DataDeps 0.2.2
Glob
StringInterning
482 changes: 482 additions & 0 deletions examples/GLoVE-Flux.ipynb

Large diffs are not rendered by default.

198 changes: 198 additions & 0 deletions examples/GLoVE.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "LoadError",
"evalue": "\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... =#\u001b[39m",
"output_type": "error",
"traceback": [
"\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... =#\u001b[39m",
"",
"Stacktrace:",
" [1] \u001b[1minclude_string\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::String\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:522\u001b[22m\u001b[22m"
]
}
],
"source": [
"#=\n",
"# This is an implementation of\n",
"# Glove: Global vectors for word representation\n",
"# J Pennington, R Socher, C Manning\n",
"# Proceedings of the 2014 conference on empirical methods in natural language\n",
"# https://nlp.stanford.edu/pubs/glove.pdf\n",
"# (Made from the paper without reference to the source code)\n",
"#\n",
"# Pennington et. al's implementation is ~1K lines of C\n",
"=#\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"using CorpusLoaders\n",
"using MLDataUtils\n",
"using StringInterning\n",
"using DataStructures\n",
"using Optim\n",
"#using CatViews"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"function coocurs(data, hw=5)\n",
" coocurs = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)\n",
" distance_weights = [1f0/abs(d-hw) for d in 0:2hw if d!=hw]\n",
" for (word_, window) in slidingwindow(i->[i-hw:i-1; i+1:i+hw], data, 1, stride=1)\n",
" word = first(word_)\n",
" for (weight, coword) in zip(distance_weights, window)\n",
" coocurs[(word,coword)]+=weight\n",
" end\n",
" end\n",
"\n",
" encoding = labelenc(last.(collect(keys(coocurs))))\n",
" coocurs_mat = spzeros(Float32, nlabel(encoding), nlabel(encoding))\n",
" for (coocurance, score) in coocurs\n",
" inds = convertlabel.(LabelEnc.Indices(nlabel(encoding)), coocurance, encoding)\n",
" coocurs_mat[inds...] = score\n",
" end\n",
" coocurs_mat, encoding\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"function glove(data, ndim=300, halfwindow=5)\n",
" xco, encoding = coocurs(data, halfwindow)\n",
" # sum f.(xco)\n",
"\n",
" nwords = nlabel(encoding)\n",
"\n",
" params = Float32[]\n",
"    mm(ii) = (ii-1)*(2*ndim+2)+1\n",
" getw(params, i) = begin @inbounds x=@view params[mm(i) : mm(i)+ndim-1]; x end\n",
" getv(params, i) = begin @inbounds x=@view params[mm(i)+ndim : mm(i)+2ndim-1]; x end\n",
" getb(params, i) = begin @inbounds x=params[mm(i)+2ndim]; x end\n",
" getc(params, i) = begin @inbounds x=params[mm(i)+2ndim+1]; x end\n",
"\n",
" for ii in 1:nwords\n",
" vals = randn(Float32, 2ndim+2)# 2ndim+2)\n",
" append!(params, vals)\n",
" #Base.Test.@test vals == [getw(params,ii); getv(params,ii); getb(params,ii); getc(params,ii)]\n",
" end\n",
"\n",
" xco_ijx = collect(zip(findnz(xco)...))\n",
" \n",
" function loss(params)\n",
" loss = 0f0\n",
" @inbounds for (i, j, x) in xco_ijx\n",
" wi = getw(params, i)\n",
" vj = getv(params, j)\n",
" bi = getb(params, i)\n",
" cj = getc(params, j)\n",
" loss += f(x)*(wi⋅vj + bi + cj - log(x))^2\n",
" end\n",
" loss\n",
" end\n",
"\n",
" \n",
" optimize(loss, params; show_every=1, show_trace=true)\n",
"    [getw(params, i) for i in 1:nwords], encoding\n"
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes, enc = glove(wikidata, 30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.1",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
14 changes: 13 additions & 1 deletion src/CorpusLoaders.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
module CorpusLoaders

using LightXML
using DataDeps
using Glob
using BinDeps
using InternedStrings
using StringEncodings

const AbstractStringVector = AbstractVector{<:AbstractString}

function __init__()
    # DataDeps registrations must run at module load time (runtime), not at
    # precompile time, so they live in datadeps.jl and are included here.
    # NOTE(review): removed a leftover debug statement `print(DataDeps)` that
    # printed the module object on every package load.
    include("./datadeps.jl")
end


include("util.jl")

include("tokenizers.jl")
include("tags.jl")
include("semcor.jl")
include("semeval2007t7.jl")
include("./similarity.jl")
include("similarity.jl")
include("wikicorpus.jl")

end # module
29 changes: 29 additions & 0 deletions src/wikicorpus.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
export load_wikicorpus

"""
    load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)

Lazily loads text from wikicorpus.
- If there are less than `sentence_min_length` tokens on a line it is skipped.
  - this gets rid of a lot of cruft like lists, titles, data-blocks etc
"""
function load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)
    # Lines beginning with any of these carry document markup, not corpus text.
    # (Wikicorpus is not well-formed XML, so the document-identifying lines are
    # simply dropped rather than parsed.)
    doc_markers = ("<doc id", "</doc>", "ENDOFARTICLE.")
    Channel(ctype=InternedString, csize=2048) do ch
        for file in readdir(glob"*Text*", path)
            for sent in eachline(file, enc"latin1")
                any(marker -> startswith(sent, marker), doc_markers) && continue
                tokens = punctuation_space_tokenize(sent)
                # Very short lines are mostly titles/lists/data blocks; skip them.
                length(tokens) < sentence_min_length && continue
                foreach(tok -> put!(ch, tok), tokens)
            end
        end
    end
end
29 changes: 13 additions & 16 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
using CorpusLoaders
using Base.Test

@testset "Semcor" begin
include("semcor.jl")
testsets = ["semcor.jl",
"tokenizers.jl",
"similarity.jl",
"tags.jl",
"util.jl",
"semeval2007t7.jl",
"wikicorpus.jl"
]

for fn in testsets
@testset "$fn" begin
include(fn)
end
end


@testset "Tokenizers" begin
include("./tokenizers.jl")
end

include("./similarity.jl")


@testset "tag conversion" begin
include("tags.jl")
end

include("util.jl")

@testset "semeval2007t7" begin
include("./semeval2007t7.jl")
end
15 changes: 15 additions & 0 deletions test/wikicorpus.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
using CorpusLoaders
using Base.Test
using DataDeps


@testset "basic use" begin
wk_gen = load_wikicorpus()
words = collect(Base.Iterators.take(wk_gen, 10_000));

@test all(isa.(words, AbstractString))
@test "a" ∈ words
@test "the" ∈ words

end

0 comments on commit 65f349b

Please sign in to comment.