diff --git a/.gitignore b/.gitignore index 8c960ec..5307567 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.jl.cov *.jl.*.cov *.jl.mem +*.swp +**.ipynb_checkpoints diff --git a/.travis.yml b/.travis.yml index 4643aa0..5b81402 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,8 @@ language: julia os: - linux # - osx +env: + - DEPS_ALWAY_ACCEPT=true julia: - 0.6 - nightly diff --git a/REQUIRE b/REQUIRE index c6df563..1d787a9 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,2 +1,5 @@ julia 0.6 LightXML +DataDeps 0.2.2 +Glob +StringInterning diff --git a/examples/GLoVE-Flux.ipynb b/examples/GLoVE-Flux.ipynb new file mode 100644 index 0000000..5f733ae --- /dev/null +++ b/examples/GLoVE-Flux.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "LoadError", + "evalue": "\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... =#\u001b[39m", + "output_type": "error", + "traceback": [ + "\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... =#\u001b[39m", + "", + "Stacktrace:", + " [1] \u001b[1minclude_string\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::String\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:522\u001b[22m\u001b[22m" + ] + } + ], + "source": [ + "#=\n", + "# This is an implementation of\n", + "# Glove: Global vectors for word representation\n", + "# J Pennington, R Socher, C Manning\n", + "# Proceedings of the 2014 conference on empirical methods in natural language\n", + "# https://nlp.stanford.edu/pubs/glove.pdf\n", + "# (Made from the paper without reference to the source code)\n", + "#\n", + "# Pennington et. 
using CorpusLoaders
using MLDataUtils
using StringInterning
using DataStructures
using Flux

# Define `dot` for a pair of Flux tracked arrays as the sum of their
# element-wise product.
# NOTE(review): neither `LinAlg.dot` nor `TrackedArray` is owned by this
# notebook, so this method is type piracy; remove it if Flux provides one.
LinAlg.dot(x::Flux.TrackedArray, y::Flux.TrackedArray) = sum(x.*y)

# The first ten million tokens produced by the corpus loader.
wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))

"""
    coocurs(data, hw=5)

Accumulate distance-weighted co-occurrence scores over `data`.

For every `(centre, window)` pair produced by `slidingwindow`, each of the
`2hw` neighbours of the centre item contributes `1/distance` to the entry
keyed by `(centre, neighbour)`.

Returns a `DefaultDict{Tuple{InternedString,InternedString},Float32}` whose
missing entries default to `0f0`.
"""
function coocurs(data, hw=5)
    scores = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)
    # Offsets of the neighbours relative to the centre: -hw, …, -1, 1, …, hw.
    # The weights are built in the same order so they can be zipped with the window.
    offsets = [-hw:-1; 1:hw]
    distance_weights = [1f0/abs(d) for d in offsets]
    for (word_, window) in slidingwindow(i -> i .+ offsets, data, 1, stride=1)
        # The centre comes back as a length-1 window; unwrap it.
        word = first(word_)
        for (weight, coword) in zip(distance_weights, window)
            scores[(word, coword)] += weight
        end
    end
    return scores
end

"""
    f(x, xmax=100f0, α=3/4)

Weighting applied to a co-occurrence score `x`: `1` when `x > xmax`,
otherwise `(x/xmax)^α`. The result is converted to `Float32`.
"""
f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α

"""
    glove(data, ndim=300, halfwindow=5)

Fit `ndim`-dimensional word vectors to the co-occurrence scores of `data`
(computed by `coocurs(data, halfwindow)`) by minimising, per entry,
`f(x) * (w_i⋅v_j + b_i + c_j - log(x))^2` with ADAM (step size `0.01`).

Returns a `Dict` mapping each word to the raw data of its `w` vector; the
`v` vectors and both bias terms are discarded.
"""
function glove(data, ndim=300, halfwindow=5)
    xco = coocurs(data, halfwindow)

    # Take the vocabulary from BOTH positions of every key so that `ws[i]`
    # and `vs[j]` below can never miss a word.
    seen = collect(keys(xco))
    words = unique(vcat(first.(seen), last.(seen)))

    ws = Dict(w=>param(randn(Float32,ndim)) for w in words)
    vs = Dict(w=>param(randn(Float32,ndim)) for w in words)
    bs = Dict(w=>param(randn(Float32)) for w in words)
    cs = Dict(w=>param(randn(Float32)) for w in words)
    all_params = vcat(collect.(values.((ws,vs,bs,cs)))...)

    # Weighted squared error of a single co-occurrence entry.
    # NOTE(review): presumably `Flux.train!` splats each `key => value` pair of
    # the data into this two-argument form — confirm for the Flux version used.
    function loss(ij, x)
        i, j = ij
        return f(x)*(ws[i]⋅vs[j] + bs[i] + cs[j] - log(x)).^2
    end

    Flux.train!(loss, collect(xco), ADAM(all_params, 0.01))
    return Dict(w=>v.data for (w,v) in ws)
end
#207::##207#215{Int64}\n", + " #208::##208#216{Int64}\n", + " #209 \n", + " #210 \n", + " #211::##211#219\n", + " #214 \n", + " xco::DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}\n", + " words::Array{StringInterning.InternedString,1}\n", + " ws::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}\n", + " vs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}\n", + " bs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}\n", + " cs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}\n", + " all_params::Array{Flux.Tracker.TrackedArray{Float32,N,A} where A where N,1}\n", + " loss\u001b[1m\u001b[91m::Core.Box\u001b[39m\u001b[22m\n", + " T \n", + " shape \n", + " iter \n", + " C::Array{StringInterning.InternedString,1}\n", + " keeps@_23::Tuple{Tuple{Bool}}\n", + " Idefaults@_24::Tuple{Tuple{Int64}}\n", + " #temp#@_25 \n", + " keeps@_26 \n", + " Idefaults@_27 \n", + " #temp#@_28 \n", + " keep@_29::Tuple{Bool}\n", + " Idefault@_30::Tuple{Int64}\n", + " #temp#@_31 \n", + " ind1 \n", + " keep@_33 \n", + " Idefault@_34 \n", + " #temp#@_35 \n", + " I_1 \n", + " val_1::Tuple{StringInterning.InternedString,StringInterning.InternedString}\n", + " result::StringInterning.InternedString\n", + " I@_39 \n", + " i#688::Int64\n", + " I@_41 \n", + " n#687::Int64\n", + " i#686 \n", + " #temp#@_44::Bool\n", + " r#685 \n", + " A_1 \n", + " keep_1::Tuple{Bool}\n", + " Idefault_1::Tuple{Int64}\n", + " #temp#@_49 \n", + " #temp#@_50::Base.ValueIterator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}}\n", + " #90::Flux.Optimise.##90#93{Float64,Float64,Float64,Float64}\n", + " #91::Flux.Optimise.##91#94{Int64}\n", + " #92::Flux.Optimise.##92#95\n", + " #113::Flux.Optimise.##113#115\n", + "\n", + "Body:\n", + " begin \n", + " loss\u001b[1m\u001b[91m::Core.Box\u001b[39m\u001b[22m = $(Expr(:new, :(Core.Box)))\n", + " 
xco::DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32} = $(Expr(:invoke, MethodInstance for coocurs(::Array{StringInterning.InternedString,1}, ::Int64), :(Main.coocurs), :(data), :(halfwindow))) # line 5:\n", + " SSAValue(14) = $(Expr(:new, Base.KeyIterator{DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}}, :(xco)))\n", + " $(Expr(:inbounds, false))\n", + " # meta: location array.jl collect 431\n", + " # meta: location array.jl _collect 437\n", + " # meta: location array.jl _similar_for 407\n", + " SSAValue(17) = (Core.getfield)((Core.getfield)((Core.getfield)((Core.getfield)(SSAValue(14), :dict)::DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}, :d)::DataStructures.DefaultDictBase{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32,Dict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32}}, :d)::Dict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32}, :count)::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " SSAValue(18) = $(Expr(:invoke, MethodInstance for copy!(::Array{Tuple{StringInterning.InternedString,StringInterning.InternedString},1}, ::Base.KeyIterator{DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}}), :(Base.copy!), :($(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Tuple{StringInterning.InternedString,StringInterning.InternedString},1}, svec(Any, Int64), Array{Tuple{StringInterning.InternedString,StringInterning.InternedString},1}, 0, SSAValue(17), 0))), SSAValue(14)))\n", + " $(Expr(:inbounds, false))\n", + " # meta: location broadcast.jl broadcast 434\n", + " # meta: location broadcast.jl broadcast_c 311\n", + " # meta: location 
broadcast.jl broadcast_indices 48\n", + " # meta: location broadcast.jl broadcast_indices 52\n", + " # meta: location abstractarray.jl indices 64\n", + " SSAValue(21) = (Base.arraysize)(SSAValue(18), 1)::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " SSAValue(67) = (Base.select_value)((Base.slt_int)(SSAValue(21), 0)::Bool, 0, SSAValue(21))::Int64 # line 314:\n", + " # meta: location broadcast.jl broadcast_t 266\n", + " C::Array{StringInterning.InternedString,1} = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{StringInterning.InternedString,1}, svec(Any, Int64), Array{StringInterning.InternedString,1}, 0, SSAValue(67), 0)) # line 267:\n", + " # meta: location broadcast.jl map_newindexer 125 # line 126:\n", + " # meta: location broadcast.jl newindexer 108\n", + " # meta: location broadcast.jl broadcast_indices 48\n", + " # meta: location broadcast.jl broadcast_indices 52\n", + " # meta: location abstractarray.jl indices 64\n", + " SSAValue(34) = (Base.arraysize)(SSAValue(18), 1)::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: location broadcast.jl shapeindexer 111\n", + " SSAValue(46) = (Base.select_value)((Base.slt_int)(SSAValue(34), 0)::Bool, 0, SSAValue(34))::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " SSAValue(47) = (Core.tuple)((Base.and_int)((Base.and_int)((1 === 1)::Bool, (1 === 1)::Bool)::Bool, (SSAValue(67) === SSAValue(46))::Bool)::Bool)::Tuple{Bool}\n", + " SSAValue(48) = (Core.tuple)(1)::Tuple{Int64}\n", + " keep@_29::Tuple{Bool} = SSAValue(47)\n", + " Idefault@_30::Tuple{Int64} = SSAValue(48)\n", + " # meta: pop location\n", + " SSAValue(62) = (Core.tuple)(keep@_29::Tuple{Bool})::Tuple{Tuple{Bool}}\n", + " SSAValue(63) = (Core.tuple)(Idefault@_30::Tuple{Int64})::Tuple{Tuple{Int64}}\n", + " keeps@_23::Tuple{Tuple{Bool}} = SSAValue(62)\n", + " Idefaults@_24::Tuple{Tuple{Int64}} = SSAValue(63) # line 268:\n", + " 
# meta: location broadcast.jl _broadcast! 139\n", + " # meta: location broadcast.jl # line 145:\n", + " keep_1::Tuple{Bool} = (Base.getfield)(keeps@_23::Tuple{Tuple{Bool}}, 1)::Tuple{Bool} # line 146:\n", + " Idefault_1::Tuple{Int64} = (Base.getfield)(Idefaults@_24::Tuple{Tuple{Int64}}, 1)::Tuple{Int64} # line 147:\n", + " # meta: location simdloop.jl # line 66:\n", + " #temp#@_44::Bool = false\n", + " 64: \n", + " unless (Base.not_int)(#temp#@_44::Bool)::Bool goto 107\n", + " #temp#@_44::Bool = true # line 67:\n", + " n#687::Int64 = (Base.add_int)((Base.sub_int)(SSAValue(67), 1)::Int64, 1)::Int64 # line 68:\n", + " unless (Base.slt_int)(0, n#687::Int64)::Bool goto 105 # line 70:\n", + " i#688::Int64 = 0 # line 71:\n", + " NewvarNode(:(val_1::Tuple{StringInterning.InternedString,StringInterning.InternedString}))\n", + " NewvarNode(:(result::StringInterning.InternedString))\n", + " 76: \n", + " unless (Base.slt_int)(i#688::Int64, n#687::Int64)::Bool goto 103 # line 72:\n", + " SSAValue(59) = (Base.add_int)(i#688::Int64, 1)::Int64 # line 73:\n", + " # meta: location broadcast.jl # line 149:\n", + " SSAValue(61) = (Base.select_value)((Base.getfield)(keep_1::Tuple{Bool}, 1)::Bool, SSAValue(59), (Base.getfield)(Idefault_1::Tuple{Int64}, 1)::Int64)::Int64 # line 151:\n", + " $(Expr(:inbounds, true))\n", + " val_1::Tuple{StringInterning.InternedString,StringInterning.InternedString} = (Base.arrayref)(SSAValue(18), SSAValue(61))::Tuple{StringInterning.InternedString,StringInterning.InternedString}\n", + " $(Expr(:inbounds, :pop)) # line 153:\n", + " result::StringInterning.InternedString = (Base.getfield)(val_1::Tuple{StringInterning.InternedString,StringInterning.InternedString}, 2)::StringInterning.InternedString # line 154:\n", + " $(Expr(:inbounds, true))\n", + " # meta: location multidimensional.jl setindex! 
300\n", + " (Base.arrayset)(C::Array{StringInterning.InternedString,1}, $(Expr(:invoke, MethodInstance for StringInterning.InternedString(::StringInterning.InternedString), :(StringInterning.InternedString), :(result))), SSAValue(59))::Array{StringInterning.InternedString,1}\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " # meta: pop location # line 74:\n", + " i#688::Int64 = (Base.add_int)(i#688::Int64, 1)::Int64 # line 75:\n", + " $(Expr(:simdloop))\n", + " 101: \n", + " goto 76\n", + " 103: # line 79:\n", + " 105: \n", + " goto 64\n", + " 107: \n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " goto 114 # line 319:\n", + " 114: \n", + " # meta: pop location\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " words::Array{StringInterning.InternedString,1} = $(Expr(:invoke, MethodInstance for unique(::Array{StringInterning.InternedString,1}), :(Main.unique), :(C))) # line 6:\n", + " #207::##207#215{Int64} = $(Expr(:new, ##207#215{Int64}, :(ndim)))\n", + " SSAValue(1) = $(Expr(:new, Base.Generator{Array{StringInterning.InternedString,1},##207#215{Int64}}, :(#207), :(words)))\n", + " ws::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}} = $(Expr(:invoke, MethodInstance for Dict(::Base.Generator{Array{StringInterning.InternedString,1},##207#215{Int64}}), :(Main.Dict), SSAValue(1))) # line 7:\n", + " #208::##208#216{Int64} = $(Expr(:new, ##208#216{Int64}, :(ndim)))\n", + " SSAValue(3) = $(Expr(:new, Base.Generator{Array{StringInterning.InternedString,1},##208#216{Int64}}, :(#208), :(words)))\n", + " vs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}} = $(Expr(:invoke, MethodInstance for Dict(::Base.Generator{Array{StringInterning.InternedString,1},##208#216{Int64}}), :(Main.Dict), SSAValue(3))) # line 8:\n", + " SSAValue(5) = $(Expr(:new, Base.Generator{Array{StringInterning.InternedString,1},##209#217}, 
:($(QuoteNode(#209))), :(words)))\n", + " bs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}} = $(Expr(:invoke, MethodInstance for Dict(::Base.Generator{Array{StringInterning.InternedString,1},##209#217}), :(Main.Dict), SSAValue(5))) # line 9:\n", + " SSAValue(7) = $(Expr(:new, Base.Generator{Array{StringInterning.InternedString,1},##210#218}, :($(QuoteNode(#210))), :(words)))\n", + " cs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}} = $(Expr(:invoke, MethodInstance for Dict(::Base.Generator{Array{StringInterning.InternedString,1},##210#218}), :(Main.Dict), SSAValue(7))) # line 10:\n", + " #211::##211#219 = $(Expr(:new, :(Main.##211#219)))\n", + " SSAValue(92) = ws::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}\n", + " SSAValue(93) = vs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}\n", + " SSAValue(94) = bs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}\n", + " SSAValue(95) = cs::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}\n", + " $(Expr(:inbounds, false))\n", + " # meta: location broadcast.jl broadcast 17\n", + " # meta: location tuple.jl map 161\n", + " # meta: location #211 0\n", + " #temp#@_50::Base.ValueIterator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}} = $(Expr(:new, Base.ValueIterator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}}, SSAValue(92)))\n", + " # meta: location array.jl collect 431\n", + " # meta: location array.jl _collect 437\n", + " # meta: location array.jl _similar_for 407\n", + " SSAValue(74) = (Core.getfield)((Core.getfield)(#temp#@_50::Base.ValueIterator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}}, :dict)::Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}, :count)::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " 
SSAValue(75) = $(Expr(:invoke, MethodInstance for copy!(::Array{TrackedArray{…,Array{Float32,1}},1}, ::Base.ValueIterator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}}}), :(Base.copy!), :($(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{TrackedArray{…,Array{Float32,1}},1}, svec(Any, Int64), Array{TrackedArray{…,Array{Float32,1}},1}, 0, SSAValue(74), 0))), :(#temp#@_50)))\n", + " SSAValue(76) = $(Expr(:invoke, MethodInstance for map(::##211#219, ::Tuple{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}}), :(Base.map), :(#211), :((Core.tuple)(SSAValue(93), SSAValue(94), SSAValue(95))::Tuple{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}})))\n", + " # meta: pop location\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " SSAValue(96) = (Core.getfield)(SSAValue(76), 1)::Array{TrackedArray{…,Array{Float32,1}},1}\n", + " SSAValue(97) = (Core.getfield)(SSAValue(76), 2)::Array{TrackedArray{…,Array{Float32,0}},1}\n", + " SSAValue(98) = (Core.getfield)(SSAValue(76), 3)::Array{TrackedArray{…,Array{Float32,0}},1}\n", + " all_params::Array{Flux.Tracker.TrackedArray{Float32,N,A} where A where N,1} = $(Expr(:invoke, MethodInstance for typed_vcat(::Type{Flux.Tracker.TrackedArray{Float32,N,A} where A where N}, ::Array{TrackedArray{…,Array{Float32,1}},1}, ::Array{TrackedArray{…,Array{Float32,1}},1}, ::Array{TrackedArray{…,Array{Float32,0}},1}, ::Vararg{Array{TrackedArray{…,Array{Float32,0}},1},N} where N), :(Base.typed_vcat), Flux.Tracker.TrackedArray{Float32,N,A} where A where N, SSAValue(75), SSAValue(96), SSAValue(97), SSAValue(98))) # line 11:\n", + " SSAValue(11) = $(Expr(:new, 
#loss#220{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}}, :(ws), :(vs), :(bs), :(cs), :(loss)))\n", + " (Core.setfield!)(loss\u001b[1m\u001b[91m::Core.Box\u001b[39m\u001b[22m, :contents, SSAValue(11))::#loss#220{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}},Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,0}}}} # line 19:\n", + " SSAValue(86) = (Core.getfield)(loss\u001b[1m\u001b[91m::Core.Box\u001b[39m\u001b[22m, :contents)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n", + " $(Expr(:inbounds, false))\n", + " # meta: location array.jl collect 431\n", + " # meta: location array.jl _collect 437\n", + " # meta: location array.jl _similar_for 407\n", + " SSAValue(84) = (Core.getfield)((Core.getfield)((Core.getfield)(xco::DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}, :d)::DataStructures.DefaultDictBase{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32,Dict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32}}, :d)::Dict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32}, :count)::Int64\n", + " # meta: pop location\n", + " # meta: pop location\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " SSAValue(85) = $(Expr(:invoke, MethodInstance for copy!(::Array{Pair{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32},1}, ::DataStructures.DefaultDict{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32,Float32}), :(Base.copy!), :($(Expr(:foreigncall, 
:(:jl_alloc_array_1d), Array{Pair{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32},1}, svec(Any, Int64), Array{Pair{Tuple{StringInterning.InternedString,StringInterning.InternedString},Float32},1}, 0, SSAValue(84), 0))), :(xco)))\n", + " $(Expr(:inbounds, false))\n", + " # meta: location /home/wheel/oxinabox/.julia/v0.6/Flux/src/optimise/interface.jl ADAM 54\n", + " # meta: location /home/wheel/oxinabox/.julia/v0.6/Flux/src/optimise/interface.jl #ADAM#89 54\n", + " #90::Flux.Optimise.##90#93{Float64,Float64,Float64,Float64} = $(Expr(:new, Flux.Optimise.##90#93{Float64,Float64,Float64,Float64}, 0.9, 0.999, 1.0e-8, 0.01))\n", + " #91::Flux.Optimise.##91#94{Int64} = $(Expr(:new, Flux.Optimise.##91#94{Int64}, 0))\n", + " #92::Flux.Optimise.##92#95 = $(Expr(:new, :(Flux.Optimise.##92#95)))\n", + " # meta: pop location\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " SSAValue(87) = $(Expr(:invoke, MethodInstance for optimiser(::Array{Flux.Tracker.TrackedArray{Float32,N,A} where A where N,1}, ::Function, ::Vararg{Function,N} where N), :(Flux.Optimise.optimiser), :(all_params), :(#90), :(#91), :(#92)))\n", + " $(Expr(:inbounds, false))\n", + " # meta: location /home/wheel/oxinabox/.julia/v0.6/Flux/src/optimise/train.jl train! 
17\n", + " #113::Flux.Optimise.##113#115 = $(Expr(:new, :(Flux.Optimise.##113#115)))\n", + " # meta: pop location\n", + " $(Expr(:inbounds, :pop))\n", + " (Flux.Optimise.#train!#112)(#113::Flux.Optimise.##113#115, Flux.train!, SSAValue(86), SSAValue(85), SSAValue(87))::Void # line 20:\n", + " SSAValue(13) = $(Expr(:new, Base.Generator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},##214#223}, :($(QuoteNode(#214))), :(ws)))\n", + " return $(Expr(:invoke, MethodInstance for Dict(::Base.Generator{Dict{StringInterning.InternedString,TrackedArray{…,Array{Float32,1}}},##214#223}), :(Main.Dict), SSAValue(13)))\n", + " end::Dict{StringInterning.InternedString,Array{Float32,1}}\n" + ] + } + ], + "source": [ + "@code_warntype glove(wikidata, 30,5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "wes, enc = glove(wikidata, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 0.6.1", + "language": "julia", + "name": "julia-0.6" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "0.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/GLoVE.ipynb b/examples/GLoVE.ipynb new file mode 100644 index 0000000..6a83192 --- /dev/null +++ b/examples/GLoVE.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "LoadError", + "evalue": "\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... =#\u001b[39m", + "output_type": "error", + "traceback": [ + "\u001b[91msyntax: incomplete: unterminated multi-line comment #= ... 
#=
This is an implementation of
  Glove: Global vectors for word representation
  J Pennington, R Socher, C Manning
  Proceedings of the 2014 conference on empirical methods in natural language
  https://nlp.stanford.edu/pubs/glove.pdf
(Made from the paper without reference to the source code)

Pennington et. al's implementation is ~1K lines of C
=#

using CorpusLoaders
using MLDataUtils
using StringInterning
using DataStructures
using Optim
#using CatViews

# The first ten million tokens produced by the corpus loader.
wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))

"""
    coocurs(data, hw=5)

Build a sparse matrix of distance-weighted co-occurrence scores for `data`.

For every `(centre, window)` pair produced by `slidingwindow`, each of the
`2hw` neighbours of the centre item contributes `1/distance` to the score of
`(centre, neighbour)`.

Returns `(matrix, encoding)`: a `Float32` sparse matrix indexed by label
index in both dimensions, and the label encoding used to obtain the indices.
"""
function coocurs(data, hw=5)
    scores = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)
    # Offsets of the neighbours relative to the centre: -hw, …, -1, 1, …, hw.
    # The weights are built in the same order so they can be zipped with the window.
    offsets = [-hw:-1; 1:hw]
    distance_weights = [1f0/abs(d) for d in offsets]
    for (word_, window) in slidingwindow(i -> i .+ offsets, data, 1, stride=1)
        # The centre comes back as a length-1 window; unwrap it.
        word = first(word_)
        for (weight, coword) in zip(distance_weights, window)
            scores[(word, coword)] += weight
        end
    end

    # Encode words from BOTH positions of every key so that each centre word
    # is guaranteed to have an index.
    seen = collect(keys(scores))
    encoding = labelenc(vcat(first.(seen), last.(seen)))
    nwords = nlabel(encoding)
    index_enc = LabelEnc.Indices(nwords)

    # Collect coordinates first and assemble the sparse matrix in one call,
    # rather than inserting into it one element at a time.
    rows = Int[]
    cols = Int[]
    vals = Float32[]
    for (coocurance, score) in scores
        i, j = convertlabel.(index_enc, coocurance, encoding)
        push!(rows, i)
        push!(cols, j)
        push!(vals, score)
    end
    return sparse(rows, cols, vals, nwords, nwords), encoding
end

"""
    f(x, xmax=100f0, α=3/4)

Weighting applied to a co-occurrence score `x`: `1` when `x > xmax`,
otherwise `(x/xmax)^α`. The result is converted to `Float32`.
"""
f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α

"""
    glove(data, ndim=300, halfwindow=5)

Fit `ndim`-dimensional word vectors to the co-occurrence matrix of `data`
(computed by `coocurs(data, halfwindow)`) by passing the summed objective
`f(x) * (w_i⋅v_j + b_i + c_j - log(x))^2` to `Optim.optimize`.

Returns `(vectors, encoding)`: `vectors[i]` is the fitted `w` vector of the
word with label index `i`, and `encoding` is the label encoding returned by
`coocurs`.
"""
function glove(data, ndim=300, halfwindow=5)
    xco, encoding = coocurs(data, halfwindow)
    nwords = nlabel(encoding)

    # All parameters live in one flat vector; word `i` owns a contiguous
    # chunk laid out as [w (ndim); v (ndim); b; c].
    chunk = 2ndim + 2
    offset(i) = (i - 1)*chunk + 1
    getw(params, i) = @view params[offset(i) : offset(i)+ndim-1]
    getv(params, i) = @view params[offset(i)+ndim : offset(i)+2ndim-1]
    getb(params, i) = params[offset(i)+2ndim]
    getc(params, i) = params[offset(i)+2ndim+1]

    params = randn(Float32, nwords*chunk)

    # (row, column, value) triples of the stored entries.
    xco_ijx = collect(zip(findnz(xco)...))

    function loss(params)
        total = 0f0
        @inbounds for (i, j, x) in xco_ijx
            wi = getw(params, i)
            vj = getv(params, j)
            bi = getb(params, i)
            cj = getc(params, j)
            total += f(x)*(wi⋅vj + bi + cj - log(x))^2
        end
        return total
    end

    result = optimize(loss, params; show_every=1, show_trace=true)
    # Read the fitted values from the optimiser's result instead of the
    # starting point that was handed to it.
    fitted = Optim.minimizer(result)
    return [copy(getw(fitted, i)) for i in 1:nwords], encoding
end

wes, enc = glove(wikidata, 30)
# ==== src/CorpusLoaders.jl ====
module CorpusLoaders
using LightXML
using DataDeps
using Glob
using BinDeps
using InternedStrings
using StringEncodings

const AbstractStringVector = AbstractVector{<:AbstractString}

# Evaluate datadeps.jl each time the module is initialised. The path is
# anchored to this source file's directory so it does not depend on the
# caller's working directory.
# NOTE(review): datadeps.jl is not visible here; presumably it registers the
# data dependencies (e.g. "Wikicorpus-en raw") — confirm.
function __init__()
    include(joinpath(@__DIR__, "datadeps.jl"))
end


include("util.jl")
include("tokenizers.jl")
include("tags.jl")
include("semcor.jl")
include("semeval2007t7.jl")
include("similarity.jl")
include("wikicorpus.jl")

end # module

# ==== src/wikicorpus.jl ====
export load_wikicorpus

"""
    load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)

Lazily load the tokens of the wikicorpus files under `path`.

Every file in `path` matching `*Text*` is read line by line as latin1 and
each kept line is tokenized with `punctuation_space_tokenize`. The tokens are
delivered one at a time through a `Channel` of `InternedString`s with a
buffer of 2048 items.

 - Lines with fewer than `sentence_min_length` tokens are skipped.
   - this gets rid of a lot of cruft like lists, titles, data-blocks etc
 - Document-delimiting lines are skipped as well.
"""
function load_wikicorpus(path = datadep"Wikicorpus-en raw"; sentence_min_length = 5)
    # Note: wikicorpus is not correct XML
    # For now we will just drop the document identifying stuff
    # NOTE(review): an empty prefix would match every line and leave the
    # channel empty; `<doc` / `</doc` are presumed to be the intended
    # markers — confirm against the corpus files.
    markup_prefixes = ("<doc", "</doc", "ENDOFARTICLE.")
    Channel(ctype=InternedString, csize=2048) do ch
        for file in readdir(glob"*Text*", path)
            for sent in eachline(file, enc"latin1")
                if any(prefix -> startswith(sent, prefix), markup_prefixes)
                    continue
                end
                tokens = punctuation_space_tokenize(sent)
                if length(tokens) < sentence_min_length
                    continue
                end
                for token in tokens
                    put!(ch, token)
                end
            end
        end
    end
end

# ==== test/runtests.jl ====
using CorpusLoaders
using Base.Test

testsets = ["semcor.jl",
            "tokenizers.jl",
            "similarity.jl",
            "tags.jl",
            "util.jl",
            "semeval2007t7.jl",
            "wikicorpus.jl"
            ]

# Run every test file inside a test set named after the file.
for fn in testsets
    @testset "$fn" begin
        include(fn)
    end
end

# ==== test/wikicorpus.jl ====
using CorpusLoaders
using Base.Test
using DataDeps


@testset "basic use" begin
    wk_gen = load_wikicorpus()
    words = collect(Base.Iterators.take(wk_gen, 10_000));

    @test all(isa.(words, AbstractString))
    @test "a" ∈ words
    @test "the" ∈ words

end