diff --git a/REQUIRE b/REQUIRE
index 137767a..352c3ea 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1 +1,2 @@
-julia 0.6
+julia 0.7
+Test
diff --git a/src/Word2Vec.jl b/src/Word2Vec.jl
index 078032f..46491d9 100644
--- a/src/Word2Vec.jl
+++ b/src/Word2Vec.jl
@@ -1,6 +1,8 @@
 module Word2Vec

 import Base: show, size
+import Statistics: mean
+import LinearAlgebra: norm

 export
     # types
diff --git a/src/interface.jl b/src/interface.jl
index df95d68..e4b398c 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -1,5 +1,7 @@
-"""
-    word2vec(train, output; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=12, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+""" word2vec(train, output; size=100, window=5, sample=1e-3, hs=0,
+    negative=5, threads=12, iter=5, min_count=5, alpha=0.025,
+    debug=2, binary=0, cbow=1, save_vocab=nothing,
+    read_vocab=nothing, verbose=false)

 Parameters for training:
         train
@@ -48,7 +50,7 @@ function word2vec(train::AbstractString, output::AbstractString;
                   hs::Int=0, negative::Int=5, threads::Int=12, iter::Int=5,
                   min_count::Int=5, alpha::AbstractFloat=0.025,
                   debug::Int=2, binary::Int=0, cbow::Int=1,
-                  save_vocab=Void(), read_vocab=Void(),
+                  save_vocab=nothing, read_vocab=nothing,
                   verbose::Bool=false)
     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c",
                        "./word2vec")
@@ -63,11 +65,11 @@ function word2vec(train::AbstractString, output::AbstractString;
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
@@ -75,8 +77,10 @@
 end


-"""
-    word2cluster(train, output, classes; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+""" word2clusters(train, output, classes; size=100, window=5,
+    sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5,
+    alpha=0.025, debug=2, binary=0, cbow=1, save_vocab=nothing,
+    read_vocab=nothing, verbose=false)

 Parameters for training:
         train
@@ -129,7 +133,7 @@ function word2clusters(train::AbstractString, output::AbstractString,
                        negative::Int=5, threads::Int=1, iter::Int=5,
                        min_count::Int=5, alpha::AbstractFloat=0.025,
                        debug::Int=2, binary::Int=0, cbow::Int=1,
-                       save_vocab=Void(), read_vocab=Void(),
+                       save_vocab=nothing, read_vocab=nothing,
                        verbose::Bool=false)
     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c", "./word2vec")
     parameters = AbstractString[]
@@ -142,11 +146,11 @@ function word2clusters(train::AbstractString, output::AbstractString,
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
diff --git a/src/wordclusters.jl b/src/wordclusters.jl
index 253cfaf..dabb98f 100644
--- a/src/wordclusters.jl
+++ b/src/wordclusters.jl
@@ -58,7 +58,7 @@ For the WordCluster `wc`, return all the words from a given cluster
 number `cluster`.
 """
 function get_words(wc::WordClusters, cluster::Int)
-    inds = findin(wc.clusters, cluster)
+    inds = findall(isequal(cluster), wc.clusters)
     return wc.vocab[inds]
 end

diff --git a/src/wordvectors.jl b/src/wordvectors.jl
index 9289a1a..51f7842 100644
--- a/src/wordvectors.jl
+++ b/src/wordvectors.jl
@@ -5,7 +5,7 @@ mutable struct WordVectors{S<:AbstractString, T<:Real, H<:Integer}
 end

 function WordVectors(vocab::AbstractArray{S,1},
-                    vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
+                     vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
     length(vocab) == size(vectors, 2) ||
         throw(DimensionMismatch("Dimension of vocab and vectors are inconsistent."))
     vocab_hash = Dict{S, Int}()
@@ -59,7 +59,7 @@ index(wv::WordVectors, word) = wv.vocab_hash[word]
 Return the vector representation of `word` from the WordVectors `wv`.
 """
 get_vector(wv::WordVectors, word) =
-      (idx = wv.vocab_hash[word]; wv.vectors[:,idx])
+    (idx = wv.vocab_hash[word]; wv.vectors[:,idx])

 """
     cosine(wv, word, n=10)
@@ -110,7 +110,7 @@ function analogy(wv::WordVectors, pos::AbstractArray, neg::AbstractArray, n= 5)
     m, n_vocab = size(wv)
     n_pos = length(pos)
     n_neg = length(neg)
-    anal_vecs = Array{AbstractFloat}(m, n_pos + n_neg)
+    anal_vecs = Array{AbstractFloat}(undef, m, n_pos + n_neg)

     for (i, word) in enumerate(pos)
         anal_vecs[:,i] = get_vector(wv, word)
@@ -118,13 +118,13 @@ function analogy(wv::WordVectors, pos::AbstractArray, neg::AbstractArray, n= 5)
     for (i, word) in enumerate(neg)
         anal_vecs[:,i+n_pos] = -get_vector(wv, word)
     end
-    mean_vec = mean(anal_vecs, 2)
+    mean_vec = mean(anal_vecs, dims=2)
     metrics = wv.vectors'*mean_vec
     top_positions = sortperm(metrics[:], rev = true)[1:n+n_pos+n_neg]
     for word in [pos;neg]
         idx = index(wv, word)
-        loc = findfirst(top_positions, idx)
-        if loc != 0
+        loc = findfirst(isequal(idx), top_positions)
+        if loc != nothing
             splice!(top_positions, loc)
         end
     end
@@ -152,56 +152,94 @@ Generate a WordVectors type object from the file `fname`, where
 `type` is the element of the vectors.
 The file format can be either text (kind=`:text`) or binary
 (kind=`:binary`).
+
+If `normalize=false` the embedding vectors will not be normalized.
+The default is true.
 """
-function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text) where T <: Real
+function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text,
+                     normalize::Bool=true) where T <: Real
     if kind == :binary
-        return _from_binary(fname) # only for Float32
+        try
+            return _from_binary(fname, normalize=normalize) # only for Float32
+        catch y
+            if isa(y, UnicodeError)
+                @info("UnicodeError detected. This could mean you are trying to load a " *
+                      "pre-trained file from Google. Trying to load as a Google binary. " *
+                      "You can force this with kind=:google.")
+                return _from_google_binary(fname, normalize=normalize)
+            else # Otherwise pass the exception along
+                throw(y)
+            end
+        end
+    elseif kind == :google
+        return _from_google_binary(fname, normalize=normalize)
    elseif kind == :text
-        return _from_text(T, fname)
+        return _from_text(T, fname, normalize=normalize)
     else
         throw(ArgumentError("Unknown kind $(kind)"))
     end
 end

-wordvectors(frame::AbstractString; kind::Symbol=:text) =
-    wordvectors(frame, Float64,kind=kind)
+wordvectors(frame::AbstractString; kind::Symbol=:text, normalize::Bool=true) =
+    wordvectors(frame, Float64, kind=kind, normalize=normalize)

 # generate a WordVectors object from binary file
-function _from_binary(filename::AbstractString)
+function _from_binary(filename::AbstractString; normalize::Bool=true)
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{Float32}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
         binary_length = sizeof(Float32) * vector_size
         for i in 1:vocab_size
             vocab[i] = strip(readuntil(f, ' '))
-            vector = read(f, Float32, vector_size)
-            vec_norm = norm(vector)
-            vectors[:, i] = vector./vec_norm # unit vector
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             read(f, UInt8) # new line
         end
         return WordVectors(vocab, vectors)
     end
 end

+# generate a WordVectors object from a binary file in the format used by
+# the original pre-trained files from Google
+function _from_google_binary(filename::AbstractString; normalize::Bool=true)
+    open(filename) do f
+        header = strip(readline(f))
+        vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
+        binary_length = sizeof(Float32) * vector_size
+        for i in 1:vocab_size
+            vocab[i] = strip(readuntil(f, ' '))
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
+        end
+        return WordVectors(vocab, vectors)
+    end
+end
+
 # generate a WordVectors object from text file
-function _from_text(::Type{T}, filename::AbstractString) where T
+function _from_text(::Type{T}, filename::AbstractString; normalize::Bool=true) where T
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{T}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{T}(undef, vector_size, vocab_size)
         @inbounds for (i, line) in enumerate(readlines(f))
             #println(line)
             line = strip(line)
             parts = split(line, ' ')
             word = parts[1]
             vector = map(x-> parse(T, x), parts[2:end])
-            vec_norm = norm(vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             vocab[i] = word
-            vectors[:, i] = vector./vec_norm #unit vector
         end
-       return WordVectors(vocab, vectors)
+        return WordVectors(vocab, vectors)
     end
 end
diff --git a/test/model.jl b/test/model.jl
index 5a4781f..8e30796 100644
--- a/test/model.jl
+++ b/test/model.jl
@@ -30,7 +30,7 @@ n = rand(1:100)
 indxs, mes = cosine(model, word1, n)
 @test words[indxs] == cosine_similar_words(model, word1, n)
 w4_indx = indxs[rand(1:end)]
-loc = findin(indxs, w4_indx)
+loc = findall((in)(w4_indx), indxs)
 word4 = words[w4_indx]
 @test index(model, word4) == w4_indx

diff --git a/test/runtests.jl b/test/runtests.jl
index fd739ba..5bec7a0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,5 @@
 using Word2Vec
-using Base.Test
+using Test

 include("train.jl")
 include("model.jl")
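
Usage sketch for the updated API (not part of the patch). The file names, the query word, and the Google pre-trained file are placeholders for illustration only:

    using Word2Vec

    # Train a model on a plain-text corpus and write the vectors to disk.
    word2vec("corpus.txt", "vectors.txt", size=100, verbose=true)

    # Load the trained vectors; normalize=false keeps the raw magnitudes.
    model = wordvectors("vectors.txt", normalize=false)

    # Google's pre-trained binary files can now be loaded explicitly:
    # model = wordvectors("GoogleNews-vectors-negative300.bin", kind=:google)

    vec = get_vector(model, "king")                        # embedding of one word
    neighbours = cosine_similar_words(model, "king", 10)   # ten nearest words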