Fixes an error while loading pre-trained files from Google #6

Closed
wants to merge 3 commits
3 changes: 2 additions & 1 deletion REQUIRE
@@ -1 +1,2 @@
-julia 0.6
+julia 0.7
+Test
1 change: 1 addition & 0 deletions src/Word2Vec.jl
@@ -1,6 +1,7 @@
 module Word2Vec

 import Base: show, size
+import Statistics: norm, mean

 export
     # types
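A note on the new import: norm and mean moved out of Base in Julia 0.7. On Julia 1.x, mean is provided by the Statistics stdlib while norm is provided by LinearAlgebra, so a version of this import that runs on current Julia would look like the sketch below (the stdlib locations are the only assumption here):

    using LinearAlgebra: norm   # vector norms live in LinearAlgebra on Julia 1.x
    using Statistics: mean      # mean lives in Statistics

    v = Float32[3.0, 4.0]
    norm(v)   # 5.0f0
    mean(v)   # 3.5f0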
24 changes: 14 additions & 10 deletions src/interface.jl
@@ -1,5 +1,7 @@
 """
-    word2vec(train, output; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=12, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+    word2vec(train, output; size=100, window=5, sample=1e-3, hs=0,
+             negative=5, threads=12, iter=5, min_count=5, alpha=0.025,
+             debug=2, binary=1, cbow=1, save_vocal=nothing,
+             read_vocab=nothing, verbose=false,)

 Parameters for training:
     train <file>
@@ -48,7 +50,7 @@ function word2vec(train::AbstractString, output::AbstractString;
                   hs::Int=0, negative::Int=5, threads::Int=12, iter::Int=5,
                   min_count::Int=5, alpha::AbstractFloat=0.025,
                   debug::Int=2, binary::Int=0, cbow::Int=1,
-                  save_vocab=Void(), read_vocab=Void(),
+                  save_vocab=nothing, read_vocab=nothing,
                   verbose::Bool=false)

     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c", "./word2vec")
@@ -63,20 +65,22 @@ function word2vec(train::AbstractString, output::AbstractString;
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
     run(`$(command) $(parameters)`)
 end


 """
-    word2cluster(train, output, classes; size=100, window=5, sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=Void(), read_vocab=Void(), verbose=false,)
+    word2cluster(train, output, classes; size=100, window=5,
+                 sample=1e-3, hs=0, negative=5, threads=1, iter=5, min_count=5,
+                 alpha=0.025, debug=2, binary=1, cbow=1, save_vocal=nothing,
+                 read_vocab=nothing, verbose=false,)

 Parameters for training:
     train <file>
@@ -129,7 +133,7 @@ function word2clusters(train::AbstractString, output::AbstractString,
                        negative::Int=5, threads::Int=1, iter::Int=5,
                        min_count::Int=5, alpha::AbstractFloat=0.025,
                        debug::Int=2, binary::Int=0, cbow::Int=1,
-                       save_vocab=Void(), read_vocab=Void(),
+                       save_vocab=nothing, read_vocab=nothing,
                        verbose::Bool=false)
     command = joinpath(dirname(@__FILE__), "..", "deps", "src", "word2vec-c", "./word2vec")
     parameters = AbstractString[]
@@ -142,11 +146,11 @@
         push!(parameters, arg)
         push!(parameters, string(value))
     end
-    if save_vocab != Void()
+    if save_vocab != nothing
         push!(parameters, "-save-vocab")
         push!(parameters, string(save_vocab))
     end
-    if read_vocab != Void()
+    if read_vocab != nothing
         push!(parameters, "-read-vocab")
         push!(parameters, string(read_vocab))
     end
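The Void() to nothing switch above follows the Julia 0.7 rename of the unit type: Void became Nothing, whose only instance is nothing. A hypothetical call showing how the optional vocabulary flags behave (the file names are made up for illustration):

    # With save_vocab set, "-save-vocab vocab.txt" is appended to the word2vec
    # command line; leaving it as `nothing` (the default) omits the flag.
    word2vec("corpus.txt", "vectors.bin"; size=100, binary=1,
             save_vocab="vocab.txt", verbose=true)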
2 changes: 1 addition & 1 deletion src/wordclusters.jl
@@ -58,7 +58,7 @@ For the WordCluster `wc`, return all the words from a given cluster
 number `cluster`.
 """
 function get_words(wc::WordClusters, cluster::Int)
-    inds = findin(wc.clusters, cluster)
+    inds = findall(isequal(cluster), wc.clusters)
     return wc.vocab[inds]
 end

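findin was removed in Julia 0.7; findall with a predicate is its replacement here. A minimal sketch of the equivalence:

    clusters = [1, 2, 1, 3]
    findall(isequal(1), clusters)   # [1, 3], same as findin(clusters, 1) on Julia 0.6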
84 changes: 61 additions & 23 deletions src/wordvectors.jl
@@ -5,7 +5,7 @@ mutable struct WordVectors{S<:AbstractString, T<:Real, H<:Integer}
 end

 function WordVectors(vocab::AbstractArray{S,1},
-        vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
+                     vectors::AbstractArray{T,2}) where {S <: AbstractString, T <: Real}
     length(vocab) == size(vectors, 2) ||
         throw(DimensionMismatch("Dimension of vocab and vectors are inconsistent."))
     vocab_hash = Dict{S, Int}()
@@ -59,7 +59,7 @@ index(wv::WordVectors, word) = wv.vocab_hash[word]
 Return the vector representation of `word` from the WordVectors `wv`.
 """
 get_vector(wv::WordVectors, word) =
-      (idx = wv.vocab_hash[word]; wv.vectors[:,idx])
+    (idx = wv.vocab_hash[word]; wv.vectors[:,idx])

 """
     cosine(wv, word, n=10)
@@ -110,21 +110,21 @@ function analogy(wv::WordVectors, pos::AbstractArray, neg::AbstractArray, n= 5)
     m, n_vocab = size(wv)
     n_pos = length(pos)
     n_neg = length(neg)
-    anal_vecs = Array{AbstractFloat}(m, n_pos + n_neg)
+    anal_vecs = Array{AbstractFloat}(undef, m, n_pos + n_neg)

     for (i, word) in enumerate(pos)
         anal_vecs[:,i] = get_vector(wv, word)
     end
     for (i, word) in enumerate(neg)
         anal_vecs[:,i+n_pos] = -get_vector(wv, word)
     end
-    mean_vec = mean(anal_vecs, 2)
+    mean_vec = mean(anal_vecs, dims=2)
     metrics = wv.vectors'*mean_vec
     top_positions = sortperm(metrics[:], rev = true)[1:n+n_pos+n_neg]
     for word in [pos;neg]
         idx = index(wv, word)
-        loc = findfirst(top_positions, idx)
-        if loc != 0
+        loc = findfirst(isequal(idx), top_positions)
+        if loc != nothing
             splice!(top_positions, loc)
         end
     end
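The findfirst change above tracks the Julia 0.7 API: it now takes the predicate first and returns nothing rather than 0 when there is no match, hence the guard changing from loc != 0 to loc != nothing. A minimal sketch:

    xs = [10, 20, 30]
    findfirst(isequal(20), xs)   # 2
    findfirst(isequal(99), xs)   # nothing (was 0 on Julia 0.6)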
@@ -152,56 +152,94 @@ Generate a WordVectors type object from the file `fname`, where
 `type` is the element of the vectors.
 The file format can be either text (kind=`:text`) or
 binary (kind=`:binary`).
+
+If `normalize=false` the embedding vectors will not be normalized.
+The default is true.
 """
-function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text) where T <: Real
+function wordvectors(fname::AbstractString, ::Type{T}; kind::Symbol=:text,
+                     normalize::Bool=true) where T <: Real
     if kind == :binary
-        return _from_binary(fname) # only for Float32
+        try
+            return _from_binary(fname, normalize=normalize) # only for Float32
+        catch y
+            if isa(y, UnicodeError)
+                info("UnicodeError detected. This could mean you try to load a pre-trained " *
+                     "file from Google. Trying to load as a Google binary. You can force " *
+                     "this with kind=:google")
+                return _from_google_binary(fname, normalize=normalize)
+            else # Otherwise pass the exception along
+                throw(y)
+            end
+        end
+    elseif kind == :google
+        return _from_google_binary(fname, normalize=normalize)
     elseif kind == :text
-        return _from_text(T, fname)
+        return _from_text(T, fname, normalize=normalize)
    else
         throw(ArgumentError("Unknown kind $(kind)"))
     end
 end

-wordvectors(frame::AbstractString; kind::Symbol=:text) =
-    wordvectors(frame, Float64,kind=kind)
+wordvectors(frame::AbstractString; kind::Symbol=:text, normalize::Bool=true) =
+    wordvectors(frame, Float64,kind=kind, normalize=normalize)

 # generate a WordVectors object from binary file
-function _from_binary(filename::AbstractString)
+function _from_binary(filename::AbstractString; normalize::Bool=true)
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{Float32}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
         binary_length = sizeof(Float32) * vector_size
         for i in 1:vocab_size
             vocab[i] = strip(readuntil(f, ' '))
-            vector = read(f, Float32, vector_size)
-            vec_norm = norm(vector)
-            vectors[:, i] = vector./vec_norm # unit vector
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             read(f, UInt8) # new line
         end
         return WordVectors(vocab, vectors)
     end
 end

+# generate a WordVectors object from binary file in the format used by
+# the original pre-trained files by google
+function _from_google_binary(filename::AbstractString; normalize::Bool=true)
+    open(filename) do f
+        header = strip(readline(f))
+        vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{Float32}(undef, vector_size, vocab_size)
+        binary_length = sizeof(Float32) * vector_size
+        for i in 1:vocab_size
+            vocab[i] = strip(readuntil(f, ' '))
+            vector = Vector{Float32}(undef, vector_size)
+            read!(f, vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
+        end
+        return WordVectors(vocab, vectors)
+    end
+end
+
 # generate a WordVectors object from text file
-function _from_text(::Type{T}, filename::AbstractString) where T
+function _from_text(::Type{T}, filename::AbstractString; normalize::Bool=true) where T
     open(filename) do f
         header = strip(readline(f))
         vocab_size,vector_size = map(x -> parse(Int, x), split(header, ' '))
-        vocab = Vector{AbstractString}(vocab_size)
-        vectors = Array{T}(vector_size, vocab_size)
+        vocab = Vector{AbstractString}(undef, vocab_size)
+        vectors = Array{T}(undef, vector_size, vocab_size)
         @inbounds for (i, line) in enumerate(readlines(f))
             #println(line)
             line = strip(line)
             parts = split(line, ' ')
             word = parts[1]
             vector = map(x-> parse(T, x), parts[2:end])
-            vec_norm = norm(vector)
+            normalize && (vector = vector ./ norm(vector)) # Normalize if needed
+            vectors[:, i] = vector
             vocab[i] = word
-            vectors[:, i] = vector./vec_norm #unit vector
         end
-        return WordVectors(vocab, vectors)
+        return WordVectors(vocab, vectors)
     end
 end
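This hunk is the heart of the fix. The Google reader is identical to _from_binary except that it does not consume a trailing newline byte after each vector, which the pre-trained Google files evidently lack; reading such a file with _from_binary goes out of alignment and surfaces as a UnicodeError, which the new try/catch uses as the signal to retry with _from_google_binary. A hypothetical usage sketch (the file name is an assumption, not part of the PR):

    using Word2Vec

    # Select the Google reader explicitly, or rely on the automatic fallback:
    model = wordvectors("GoogleNews-vectors-negative300.bin", kind=:google)

    # Keep raw magnitudes instead of unit-normalizing each embedding:
    raw = wordvectors("GoogleNews-vectors-negative300.bin", kind=:google,
                      normalize=false)

    get_vector(model, "king")   # a 300-element unit vector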
2 changes: 1 addition & 1 deletion test/model.jl
@@ -30,7 +30,7 @@ n = rand(1:100)
 indxs, mes = cosine(model, word1, n)
 @test words[indxs] == cosine_similar_words(model, word1, n)
 w4_indx = indxs[rand(1:end)]
-loc = findin(indxs, w4_indx)
+loc = findall((in)(w4_indx), indxs)
 word4 = words[w4_indx]
 @test index(model, word4) == w4_indx

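Here findin(indxs, w4_indx) becomes findall((in)(w4_indx), indxs): the curried in(x) is a predicate testing membership, so findall returns the positions in indxs whose entries match. A minimal sketch:

    indxs = [4, 8, 15]
    findall(in(8), indxs)   # [2]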
2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -1,5 +1,5 @@
 using Word2Vec
-using Base.Test
+using Test

 include("train.jl")
 include("model.jl")
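Base.Test became the top-level Test stdlib in Julia 0.7, which also explains the Test entry added to REQUIRE above. A minimal sketch of the new form:

    using Test   # stdlib on Julia >= 0.7; replaces `using Base.Test`

    @testset "sanity" begin
        @test 1 + 1 == 2
    end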