Fixes an error while loading pre-trained files from Google #6

Closed
wants to merge 3 commits
3 changes: 2 additions & 1 deletion REQUIRE
@@ -1 +1,2 @@
-julia 0.6
+julia 0.7
+Test
1 change: 1 addition & 0 deletions src/Word2Vec.jl
@@ -1,6 +1,7 @@
 module Word2Vec

 import Base: show, size
+import Statistics: norm, mean

 export
     # types
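A note on the new import: norm and mean moved out of Base in Julia 0.7. On Julia 1.x, mean is provided by the Statistics stdlib while norm is provided by LinearAlgebra, so a version of this import that runs on current Julia would look like the sketch below (the stdlib locations are the only assumption here):

    using LinearAlgebra: norm   # vector norms live in LinearAlgebra on Julia 1.x
    using Statistics: mean      # mean lives in Statistics

    v = Float32[3.0, 4.0]
    norm(v)   # 5.0f0
    mean(v)   # 3.5f0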
24 changes: 14 additions & 10 deletions src/interface.jl
@@ -1,5 +1,7 @@
 """
-    word2vec(train, output; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=12, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+    word2vec(train, output; size=100, window=5, sample=1e-3, hs=0,
+             negative=5, threads=12, iter=5, min_count=5, alpha=0.025,
+             debug=2, binary=1, cbow=1, save_vocal=nothing,
+             read_vocab=nothing, verbose=false,)

 Parameters for training:
     train <file>
@@ -48,7 +50,7 @@ function word2vec(train::AbstractString, output::AbstractString;
                   hs::Int=0, negative::Int=5, threads::Int=12, iter::Int=5,
                   min_count::Int=5, alpha::AbstractFloat=0.025,
                   debug::Int=2, binary::Int=0, cbow::Int=1,
-                  save_vocab=Void(), read_vocab=Void(),
+                  save_vocab=nothing, read_vocab=nothing,
                   verbose::Bool=false)

     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c", "./word2vec")
@@ -63,20 +65,22 @@ function word2vec(train::AbstractString, output::AbstractString;
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
     run(`$(command) $(parameters)`)
 end


 """
-    word2cluster(train, output, classes; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+    word2cluster(train, output, classes; size=100, window=5,
+                 sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5,
+                 alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=nothing,
+                 read_vocab=nothing, verbose=false,)

 Parameters for training:
     train <file>
@@ -129,7 +133,7 @@ function word2clusters(train::AbstractString, output::AbstractString,
                        negative::Int=5, threads::Int=1, iter::Int=5,
                        min_count::Int=5, alpha::AbstractFloat=0.025,
                        debug::Int=2, binary::Int=0, cbow::Int=1,
-                       save_vocab=Void(), read_vocab=Void(),
+                       save_vocab=nothing, read_vocab=nothing,
                        verbose::Bool=false)
     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c", "./word2vec")
     parameters = AbstractString[]
@@ -142,11 +146,11 @@
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
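The Void() to nothing switch above follows the Julia 0.7 rename of the unit type: Void became Nothing, whose only instance is nothing. A hypothetical call showing how the optional vocabulary flags behave (the file names are made up for illustration):

    # With save_vocab set, "-save-vocab vocab.txt" is appended to the word2vec
    # command line; leaving it as `nothing` (the default) omits the flag.
    word2vec("corpus.txt", "vectors.bin"; size=100, binary=1,
             save_vocab="vocab.txt", verbose=true)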
2 changes: 1 addition & 1 deletion src/wordclusters.jl
@@ -58,7 +58,7 @@ For the WordCluster `wc`, return all the words from a given cluster
 number `cluster`.
 """
 function get_words(wc::WordClusters, cluster::Int)
-    inds = findin(wc.clusters, cluster)
+    inds = findall(isequal(cluster), wc.clusters)
     return wc.vocab[inds]
 end

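findin was removed in Julia 0.7; findall with a predicate is its replacement here. A minimal sketch of the equivalence:

    clusters = [1, 2, 1, 3]
    findall(isequal(1), clusters)   # [1, 3], same as findin(clusters, 1) on Julia 0.6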
84 changes: 61 additions & 23 deletions src/wordvectors.jl
@@ -5,7 +5,7 @@ mutable struct WordVectors{S<:AbstractString, T<:Real, H<:Integer}
 end

 function WordVectors(vocab::AbstractArray{S,1},
-        vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
+                     vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
     length(vocab) == size(vectors, 2) ||
         throw(DimensionMismatch("Dimension of vocab and vectors are inconsistent."))
     vocab_hash = Dict{S, Int}()
@@ -59,7 +59,7 @@ index(wv::WordVectors, word) = wv.vocab_hash[word]
 Return the vector representation of `word` from the WordVectors `wv`.
 """
 get_vector(wv::WordVectors, word) =
-      (idx = wv.vocab_hash[word]; wv.vectors[:,idx])
+    (idx = wv.vocab_hash[word]; wv.vectors[:,idx])

 """
     cosine(wv, word, n=10)
@@ -110,21 +110,21 @@ function analogy(wv::WordVectors, pos::AbstractArray, neg::AbstractArray, n= 5)
     m, n_vocab = size(wv)
     n_pos = length(pos)
     n_neg = length(neg)
-    anal_vecs = Array{AbstractFloat}(m, n_pos + n_neg)
+    anal_vecs = Array{AbstractFloat}(undef, m, n_pos + n_neg)

     for (i, word) in enumerate(pos)
         anal_vecs[:,i] = get_vector(wv, word)
     end
     for (i, word) in enumerate(neg)
         anal_vecs[:,i+n_pos] = -get_vector(wv, word)
     end
-    mean_vec = mean(anal_vecs, 2)
+    mean_vec = mean(anal_vecs, dims=2)
     metrics = wv.vectors'*mean_vec
     top_positions = sortperm(metrics[:], rev = true)[1:n+n_pos+n_neg]
     for word in [pos;neg]
         idx = index(wv, word)
-        loc = findfirst(top_positions, idx)
-        if loc != 0
+        loc = findfirst(isequal(idx), top_positions)
+        if loc != nothing
             splice!(top_positions, loc)
         end
     end
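The findfirst change above tracks the Julia 0.7 API: it now takes the predicate first and returns nothing rather than 0 when there is no match, hence the guard changing from loc != 0 to loc != nothing. A minimal sketch:

    xs = [10, 20, 30]
    findfirst(isequal(20), xs)   # 2
    findfirst(isequal(99), xs)   # nothing (was 0 on Julia 0.6)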
@@ -152,56 +152,94 @@ Generate a WordVectors type object from the file `fname`, where
 `type` is the element of the vectors.
 The file format can be either text (kind=`:text`) or
 binary (kind=`:binary`).
+
+If `normalize=false` the embedding vectors will not be normalized.
+The default is true.
 """
-function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text) where T <: Real
+function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text,
+                     normalize::Bool=true) where T <: Real
     if kind == :binary
-        return _from_binary(fname) # only for Float32
+        try
+            return _from_binary(fname, normalize=normalize) # only for Float32
+        catch y
+            if isa(y, UnicodeError)
+                info("UnicodeError detected. This could mean you try to load a pre-trained " *
+                     "file from Google. Trying to load as a Google binary. You can force " *
+                     "this with kind=:google")
+                return _from_google_binary(fname, normalize=normalize)
+            else # Otherwise pass the exception along
+                throw(y)
+            end
+        end
+    elseif kind == :google
+        return _from_google_binary(fname, normalize=normalize)
     elseif kind == :text
-        return _from_text(T, fname)
+        return _from_text(T, fname, normalize=normalize)
    else
         throw(ArgumentError("Unknown kind $(kind)"))
     end
 end

-wordvectors(frame::AbstractString; kind::Symbol=:text) =
-    wordvectors(frame, Float64,kind=kind)
+wordvectors(frame::AbstractString; kind::Symbol=:text, normalize::Bool=true) =
+    wordvectors(frame, Float64,kind=kind, normalize=normalize)

 # generate a WordVectors object from binary file
-function _from_binary(filename::AbstractString)
+function _from_binary(filename::AbstractString; normalize::Bool=true)
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{Float32}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
         binary_length = sizeof(Float32) * vector_size
         for i in 1:vocab_size
             vocab[i] = strip(readuntil(f, ' '))
-            vector = read(f, Float32, vector_size)
-            vec_norm = norm(vector)
-            vectors[:, i] = vector./vec_norm # unit vector
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             read(f, UInt8) # new line
         end
         return WordVectors(vocab, vectors)
     end
 end

+# generate a WordVectors object from binary file in the format used by
+# the original pre-trained files by google
+function _from_google_binary(filename::AbstractString; normalize::Bool=true)
+    open(filename) do f
+        header = strip(readline(f))
+        vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
+        binary_length = sizeof(Float32) * vector_size
+        for i in 1:vocab_size
+            vocab[i] = strip(readuntil(f, ' '))
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
+        end
+        return WordVectors(vocab, vectors)
+    end
+end
+
 # generate a WordVectors object from text file
-function _from_text(::Type{T}, filename::AbstractString) where T
+function _from_text(::Type{T}, filename::AbstractString; normalize::Bool=true) where T
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{T}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{T}(undef, vector_size, vocab_size)
         @inbounds for (i, line) in enumerate(readlines(f))
             #println(line)
             line = strip(line)
             parts = split(line, ' ')
             word = parts[1]
             vector = map(x-> parse(T, x), parts[2:end])
-            vec_norm = norm(vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             vocab[i] = word
-            vectors[:, i] = vector./vec_norm #unit vector
         end
-        return WordVectors(vocab, vectors)
+        return WordVectors(vocab, vectors)
     end
 end
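This hunk is the heart of the fix. The Google reader is identical to _from_binary except that it does not consume a trailing newline byte after each vector, which the pre-trained Google files evidently lack; reading such a file with _from_binary goes out of alignment and surfaces as a UnicodeError, which the new try/catch uses as the signal to retry with _from_google_binary. A hypothetical usage sketch (the file name is an assumption, not part of the PR):

    using Word2Vec

    # Select the Google reader explicitly, or rely on the automatic fallback:
    model = wordvectors("GoogleNews-vectors-negative300.bin", kind=:google)

    # Keep raw magnitudes instead of unit-normalizing each embedding:
    raw = wordvectors("GoogleNews-vectors-negative300.bin", kind=:google,
                      normalize=false)

    get_vector(model, "king")   # a 300-element unit vector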
2 changes: 1 addition & 1 deletion test/model.jl
@@ -30,7 +30,7 @@ n = rand(1:100)
 indxs, mes = cosine(model, word1, n)
 @test words[indxs] == cosine_similar_words(model, word1, n)
 w4_indx = indxs[rand(1:end)]
-loc = findin(indxs, w4_indx)
+loc = findall((in)(w4_indx), indxs)
 word4 = words[w4_indx]
 @test index(model, word4) == w4_indx

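Here findin(indxs, w4_indx) becomes findall((in)(w4_indx), indxs): the curried in(x) is a predicate testing membership, so findall returns the positions in indxs whose entries match. A minimal sketch:

    indxs = [4, 8, 15]
    findall(in(8), indxs)   # [2]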
2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -1,5 +1,5 @@
 using Word2Vec
-using Base.Test
+using Test

 include("train.jl")
 include("model.jl")
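Base.Test became the top-level Test stdlib in Julia 0.7, which also explains the Test entry added to REQUIRE above. A minimal sketch of the new form:

    using Test   # stdlib on Julia >= 0.7; replaces `using Base.Test`

    @testset "sanity" begin
        @test 1 + 1 == 2
    end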