Commit
Merge pull request #7 from JuliaText/ox/datadeps
Ox/datadeps
Showing 9 changed files with 757 additions and 17 deletions.
@@ -1,3 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
*.swp
**.ipynb_checkpoints
@@ -3,6 +3,8 @@ language: julia
os:
- linux
# - osx
env:
- DEPS_ALWAY_ACCEPT=true
julia:
- 0.6
- nightly
@@ -1,2 +1,5 @@
julia 0.6
LightXML
DataDeps 0.2.2
Glob
StringInterning
@@ -0,0 +1,198 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#=\n",
"# This is an implementation of\n",
"# GloVe: Global Vectors for Word Representation\n",
"# J. Pennington, R. Socher, C. Manning\n",
"# Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)\n",
"# https://nlp.stanford.edu/pubs/glove.pdf\n",
"# (Made from the paper without reference to the source code)\n",
"#\n",
"# Pennington et al.'s implementation is ~1K lines of C\n",
"=#\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"using CorpusLoaders\n",
"using MLDataUtils\n",
"using StringInterning\n",
"using DataStructures\n",
"using Optim\n",
"#using CatViews"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"function coocurs(data, hw=5)\n",
"    coocurs = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)\n",
"    distance_weights = [1f0/abs(d-hw) for d in 0:2hw if d!=hw]\n",
"    for (word_, window) in slidingwindow(i->[i-hw:i-1; i+1:i+hw], data, 1, stride=1)\n",
"        word = first(word_)\n",
"        for (weight, coword) in zip(distance_weights, window)\n",
"            coocurs[(word,coword)]+=weight\n",
"        end\n",
"    end\n",
"\n",
"    encoding = labelenc(last.(collect(keys(coocurs))))\n",
"    coocurs_mat = spzeros(Float32, nlabel(encoding), nlabel(encoding))\n",
"    for (coocurance, score) in coocurs\n",
"        inds = convertlabel.(LabelEnc.Indices(nlabel(encoding)), coocurance, encoding)\n",
"        coocurs_mat[inds...] = score\n",
"    end\n",
"    coocurs_mat, encoding\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"function glove(data, ndim=300, halfwindow=5)\n",
"    xco, encoding = coocurs(data, halfwindow)\n",
"    # sum f.(xco)\n",
"\n",
"    nwords = nlabel(encoding)\n",
"\n",
"    params = Float32[]\n",
"    # each word ii owns a contiguous slice of params: [w (ndim); v (ndim); b; c], starting at mm(ii)\n",
"    mm(ii) = (ii-1)*(2*ndim+2)+1\n",
"    getw(params, i) = begin @inbounds x=@view params[mm(i) : mm(i)+ndim-1]; x end\n",
"    getv(params, i) = begin @inbounds x=@view params[mm(i)+ndim : mm(i)+2ndim-1]; x end\n",
"    getb(params, i) = begin @inbounds x=params[mm(i)+2ndim]; x end\n",
"    getc(params, i) = begin @inbounds x=params[mm(i)+2ndim+1]; x end\n",
"\n",
"    for ii in 1:nwords\n",
"        vals = randn(Float32, 2ndim+2)\n",
"        append!(params, vals)\n",
"        #Base.Test.@test vals == [getw(params,ii); getv(params,ii); getb(params,ii); getc(params,ii)]\n",
"    end\n",
"\n",
"    xco_ijx = collect(zip(findnz(xco)...))\n",
"\n",
"    function loss(params)\n",
"        loss = 0f0\n",
"        @inbounds for (i, j, x) in xco_ijx\n",
"            wi = getw(params, i)\n",
"            vj = getv(params, j)\n",
"            bi = getb(params, i)\n",
"            cj = getc(params, j)\n",
"            loss += f(x)*(wi⋅vj + bi + cj - log(x))^2\n",
"        end\n",
"        loss\n",
"    end\n",
"\n",
"    result = optimize(loss, params, Optim.Options(show_every=1, show_trace=true))\n",
"    params = Optim.minimizer(result)\n",
"    [getw(params, i) for i in 1:nwords], encoding\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes, enc = glove(wikidata, 30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"wes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.1",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
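For reference, the quantity the notebook's `loss` closure computes is the weighted least-squares objective from the GloVe paper cited in the notebook's first cell, written here with the code's own names (`w`/`v` for the two embedding sets, `b`/`c` for the biases, `X` for the sparse co-occurrence matrix); this is a restatement of the published formula, not something introduced by this PR:

J = \sum_{(i,j)\,:\,X_{ij}\neq 0} f(X_{ij})\,\bigl(w_i \cdot v_j + b_i + c_j - \log X_{ij}\bigr)^2,
\qquad
f(x) = \begin{cases} (x/x_{\max})^{\alpha} & x \le x_{\max} \\ 1 & x > x_{\max} \end{cases}

with x_max = 100 and α = 3/4 matching the defaults of `f` above; `getw`, `getv`, `getb`, `getc` pick w_i, v_j, b_i, c_j out of the flat `params` vector.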
@@ -1,15 +1,27 @@
module CorpusLoaders

using LightXML
using DataDeps
using Glob
using BinDeps
using InternedStrings
using StringEncodings

const AbstractStringVector = AbstractVector{<:AbstractString}

function __init__()
    print(DataDeps)
    include("./datadeps.jl")
end


include("util.jl")

include("tokenizers.jl")
include("tags.jl")
include("semcor.jl")
include("semeval2007t7.jl")
include("./similarity.jl")
include("similarity.jl")
include("wikicorpus.jl")

end # module
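The `__init__` above includes a `datadeps.jl` file whose contents are not shown in this diff. As a rough, hypothetical sketch only of what registering the "Wikicorpus-en raw" dependency used by `load_wikicorpus` below might look like (written against the current `register(DataDep(...))` API of DataDeps.jl, which may differ from the 0.2.2 release pinned in REQUIRE; the message, URL, and archive name are placeholders, not the real corpus location):

# Hypothetical sketch -- not the datadeps.jl from this PR.
using DataDeps

register(DataDep(
    "Wikicorpus-en raw",
    "Wikicorpus (English, raw-text portion). Placeholder description; see the corpus licence.",
    "https://example.org/path/to/raw.en.tgz";   # placeholder URL
    post_fetch_method = unpack,                 # extract the archive after download
))

Registration is typically done from `__init__`, which matches where the include is placed above.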
@@ -0,0 +1,29 @@
export load_wikicorpus

"""
    load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)

Lazily loads text from Wikicorpus.
- If there are fewer than `sentence_min_length` tokens on a line, the line is skipped.
- This gets rid of a lot of cruft such as lists, titles, data-blocks, etc.
"""
function load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)
    Channel(ctype=InternedString, csize=2048) do ch
        for file in readdir(glob"*Text*", path)
            for sent in eachline(file, enc"latin1")
                # Note: wikicorpus is not correct XML,
                # so for now we just drop the document-identifying markup
                if any(startswith.(sent, ["<doc id", "</doc>", "ENDOFARTICLE."]))
                    continue
                end
                tokens = punctuation_space_tokenize(sent)
                if length(tokens) < sentence_min_length
                    continue
                else
                    put!.(ch, tokens)
                end
            end
        end
    end
end
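As a quick usage sketch, mirroring the new test added below (the token count is arbitrary, and it assumes the "Wikicorpus-en raw" data dependency is already available locally), the returned `Channel` can be consumed lazily:

using CorpusLoaders

# Pull the first 10_000 interned tokens from the lazy channel.
wk_gen = load_wikicorpus()
words = collect(Base.Iterators.take(wk_gen, 10_000))

words[1:5]  # peek at the first few tokens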
@@ -1,24 +1,21 @@
using CorpusLoaders
using Base.Test

@testset "Semcor" begin
    include("semcor.jl")
testsets = ["semcor.jl",
            "tokenizers.jl",
            "similarity.jl",
            "tags.jl",
            "util.jl",
            "semeval2007t7.jl",
            "wikicorpus.jl"
           ]

for fn in testsets
    @testset "$fn" begin
        include(fn)
    end
end


@testset "Tokenizers" begin
    include("./tokenizers.jl")
end

include("./similarity.jl")


@testset "tag conversion" begin
    include("tags.jl")
end

include("util.jl")

@testset "semeval2007t7" begin
    include("./semeval2007t7.jl")
end
@@ -0,0 +1,15 @@
using CorpusLoaders
using Base.Test
using DataDeps


@testset "basic use" begin
    wk_gen = load_wikicorpus()
    words = collect(Base.Iterators.take(wk_gen, 10_000));

    @test all(isa.(words, AbstractString))
    @test "a" ∈ words
    @test "the" ∈ words

end