Adding Louvain clustering (weighted) on Mac and Windows

KrainskiL · Aug 9, 2020 · ee8678c · ee8678c
1 parent 25c1519
commit ee8678c
Show file tree

Hide file tree

Showing 14 changed files with 139 additions and 190 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,6 @@ language: julia
 
 os:
   - osx
-  - linux
 
 julia:
   - 1.4

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "CGE"
 uuid = "f7ff1d1e-e254-4b26-babe-fc3421add060"
 authors = ["KrainskiL <[email protected]>"]
-version = "1.0.1"
+version = "1.1.0"
 
 [deps]
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"

diff --git a/README.md b/README.md
@@ -56,13 +56,13 @@ When comparing embeddings, lower divergence is better.
 Format:
 
 ```
-julia CGE_CLI.jl -g edgelist_file -c clusters_file -e embedding_file [-a -v] [-l landmarks -f forced -m method]
+julia CGE_CLI.jl -g edgelist_file -e embedding_file [-c clusters_file] [-a -v] [-l landmarks -f forced -m method]
 
 ## required flags:
 -g: the edgelist (1 per line, whitespace separated, optionally with weights)
--c: the communities (in vertices order, 1 per line)
 -e: the embedding (two formats accepted, see details below)
 ## optional flags:
+-c: the communities (in vertices order, 1 per line); if not given, they are calculated using the Louvain algorithm
 -a: 'asis' flag, use if embedding is provided unordered with vertices in first column
 -v: verbose, printing additional information
 -l: number of landmarks to create
@@ -131,6 +131,7 @@ Additional weights may be provided in third column
 
 Clusters can be 0-based or 1-based
 Clusters: one value per line in the numerical order of the nodes
+If not provided, clusters are calculated automatically with the Louvain algorithm
 
 ```
 1

diff --git a/bin/unix/convert b/bin/unix/convert
diff --git a/bin/unix/hierarchy b/bin/unix/hierarchy
diff --git a/bin/unix/louvain b/bin/unix/louvain
diff --git a/bin/win/convert.exe b/bin/win/convert.exe
diff --git a/bin/win/hierarchy.exe b/bin/win/hierarchy.exe
diff --git a/bin/win/louvain.exe b/bin/win/louvain.exe
diff --git a/example/CGE_CLI.jl b/example/CGE_CLI.jl
diff --git a/src/CGE.jl b/src/CGE.jl
@@ -13,8 +13,12 @@ export landmarks
 #divergence
 export wGCL
 
+#clustering
+export louvain
+
 # Include package code
 include("auxilary.jl")
 include("landmarks.jl")
 include("divergence.jl")
+include("clustering.jl")
 end
diff --git a/src/auxilary.jl b/src/auxilary.jl
@@ -70,8 +70,8 @@ function parseargs()
         # Check if calculations should be verbose
         verbose = !isnothing(findfirst(==("-v"),ARGS)) ? true : false
 
-        # Check for required arguments: -g graph_edgelist -c communities -e embedding -o outfile
-        @assert length(ARGS) >= 6
+        # Check for required arguments: -g graph_edgelist -e embedding
+        @assert length(ARGS) >= 4 "Graph edgelist and embedding files are required"
 
         # Load obligatory files
         ################
@@ -110,8 +110,8 @@ function parseargs()
                 vweight[edges[i,2]]+=1.0
             end
         else
-            edges = convert.(Int,edges[:,1:2])
             eweights = edges[:,3]
+            edges = convert.(Int,edges[:,1:2])
             vweight = zeros(no_vertices)
             for i in 1:rows
                 vweight[edges[i,1]]+=eweights[i]
@@ -123,174 +123,18 @@ function parseargs()
         ################
         ## Communities #
         ################
-
         idx = findfirst(==("-c"),ARGS)
-        @assert !isnothing(idx) "Communities file is required"
-        fn_comm = ARGS[idx+1]
-
-        # Read communities
-        comm = readdlm(fn_comm,Int)
-        comm_rows, no_cols = size(comm)
-
-        # Validate file structure
-        @assert no_cols==1 "Expected 1 column"
-        v_min = minimum(comm[:,1])
-        @assert v_min==0 || v_min==1 "Communities should be either 0-based or 1-based"
-        c_min=minimum(comm)
-
-        # make communities 1-based
-        if c_min == 0
-            comm[:,1] .+=1
-        end
-        verbose && println("Done preparing communities")
-
-        ##############
-        ## Embedding #
-        ##############
-
-        idx = findfirst(==("-e"),ARGS)
-        @assert !isnothing(idx) "Embedding file is required"
-        fn_embed = ARGS[idx+1]
-
-        # Read embedding
-        embedding = readdlm(fn_embed,Float64)
-
-        # Validate file
-        @assert comm_rows == size(embedding, 1) "No. rows in embedding and communities files differ"
-
-        # if embedding contains index in first column, sort by it and remove column
-        if asis == 0
-            embedding = embedding[sortperm(embedding[:,1]),2:end]
-        end
-        verbose && println("done preparing embedding")
-
-        #####################
-        ## Output file name #
-        #####################
-        # idx = findfirst(==("-o"),ARGS)
-        # @assert !isnothing(idx)
-        # outfile = ARGS[idx+1]
-
-        #############
-        ##Landmarks #
-        #############
-
-        # Transform communities
-        clusters = Dict{Any, Vector{Int}}()
-        for (i, c) in enumerate(comm[:,1])
-            if haskey(clusters, c)
-                push!(clusters[c], i)
-            else
-                clusters[c] = [i]
-            end
-        end
-        clusters = collect(values(clusters))
-
-        idx = findfirst(==("-l"),ARGS)
-        landmarks = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1
-        idx = findfirst(==("-f"),ARGS)
-        forced = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1
-        idx = findfirst(==("-m"),ARGS)
-        method_str = !isnothing(idx) ? lowercase(strip(ARGS[idx+1])) : "rss"
-        method = methods[method_str]
-        return edges, eweights, vweight, comm, clusters, embedding, asis, verbose, landmarks, forced, method
-    catch e
-        showerror(stderr, e)
-        println("\n\nUsage:")
-        println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]")
-        println("\nParameters:")
-        println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights")
-        println("communities: rows should contain cluster identifiers of consecutive vertices")
-        println("embedding: rows should contain whitespace separated locations of vertices in embedding")
-        println("-a: flag for sorting embedding")
-        println("-v: flag for debugging messages")
-        println("landmarks: required number of landmarks")
-        println("forced: required maximum number of forced splits of a cluster")
-        println("method: one of:")
-        println("\t* rss:      minimize maximum residual sum of squares when doing a cluster split")
-        println("\t* rss2:     minimize maximum residual sum of squares when doing a cluster split (slower)")
-        println("\t* size:     make clusters have approximately the same size after a cluster split")
-        println("\t* diameter: make clusters have approximately the same diameter along first " *
-                "principal component after a cluster split")
-        println("(note that always a cluster with largest residual sum of squares is selected for splitting)")
-        exit(1)
-    end
-end
-
-function parseargs(ARGS::Vector{String})
-    methods = Dict("rss" => split_cluster_rss,
-                   "rss2" => split_cluster_rss2,
-                   "size" => split_cluster_size,
-                   "diameter" => split_cluster_diameter)
-    try
-        # Optional arguments
-        ##Flags
-        asis = !isnothing(findfirst(==("-a"),ARGS)) ? true : false
-        # Check if calculations should be verbose
-        verbose = !isnothing(findfirst(==("-v"),ARGS)) ? true : false
-
-        # Check for required arguments: -g graph_edgelist -c communities -e embedding
-        @assert length(ARGS) >= 6
-
-        # Load obligatory files
-        ################
-        ## Graph edges #
-        ################
-
-        idx = findfirst(==("-g"),ARGS)
-        @assert !isnothing(idx)
-        fn_edges = ARGS[idx+1]
-
-        # read edges
-        edges = readdlm(fn_edges, Float64)
-        rows, no_cols = size(edges)
-        verbose && println("$no_cols columns in graph edgelist file")
-
-        # Validate file structure
-        @assert no_cols==2 || no_cols==3 "Expected 2 or 3 columns"
-        v_min = minimum(edges[:,1:2])
-        @assert v_min==0 || v_min==1 "Vertices should be either 0-based or 1-based"
-
-        # make vertices 1-based
-        if v_min == 0.0
-            edges[:,1:2] .+= 1.0
-        end
-        no_vertices = Int(maximum(edges[:,1:2]))
-        verbose && println("Vertices from $v_min to $no_vertices")
-
-        # if unweighted, add unit weights
-        # compute vertex weights
-        if no_cols == 2
-            edges = convert.(Int,edges)
-            eweights = ones(rows)
-            vweight = zeros(no_vertices)
-            for i in 1:rows
-                vweight[edges[i,1]]+=1.0
-                vweight[edges[i,2]]+=1.0
-            end
+        if !isnothing(idx)
+            fn_comm = ARGS[idx+1]
+            comm = readdlm(fn_comm,Int)
         else
-            edges = convert.(Int,edges[:,1:2])
-            eweights = edges[:,3]
-            vweight = zeros(no_vertices)
-            for i in 1:rows
-                vweight[edges[i,1]]+=eweights[i]
-                vweight[edges[i,2]]+=eweights[i]
-            end
+            no_cols == 2 ? louvain(fn_edges,) : louvain(fn_edges, edges, eweights)
+            fn_comm = fn_edges*".ecg"
+            comm = readdlm(fn_comm,Int)
+            comm = comm[2:end,2]
+            comm = reshape(comm,size(comm)[1],1)
         end
-        verbose && println("Done preparing edgelist and vertex weight")
-
-        ################
-        ## Communities #
-        ################
-
-        idx = findfirst(==("-c"),ARGS)
-        @assert !isnothing(idx)
-        fn_comm = ARGS[idx+1]
-
-        # Read communities
-        comm = readdlm(fn_comm,Int)
         comm_rows, no_cols = size(comm)
-
         # Validate file structure
         @assert no_cols==1 "Expected 1 column"
         v_min = minimum(comm[:,1])
@@ -308,7 +152,7 @@ function parseargs(ARGS::Vector{String})
         ##############
 
         idx = findfirst(==("-e"),ARGS)
-        @assert !isnothing(idx)
+        @assert !isnothing(idx) "Embedding file is required"
         fn_embed = ARGS[idx+1]
 
         # Read embedding
@@ -323,13 +167,6 @@ function parseargs(ARGS::Vector{String})
         end
         verbose && println("done preparing embedding")
 
-        #####################
-        ## Output file name #
-        #####################
-        # idx = findfirst(==("-o"),ARGS)
-        # @assert !isnothing(idx)
-        # outfile = ARGS[idx+1]
-
         #############
         ##Landmarks #
         #############
@@ -356,10 +193,11 @@ function parseargs(ARGS::Vector{String})
     catch e
         showerror(stderr, e)
         println("\n\nUsage:")
-        println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]")
+        println("\tjulia CGE.jl -g graph_edgelist -e embedding [-c communities] [-a -v] [-l landmarks -f forced -m method]")
         println("\nParameters:")
         println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights")
         println("communities: rows should contain cluster identifiers of consecutive vertices")
+        println("if no file is given communities are calculated with Louvain algorithm")
         println("embedding: rows should contain whitespace separated locations of vertices in embedding")
         println("-a: flag for sorting embedding")
         println("-v: flag for debugging messages")
@@ -372,5 +210,6 @@ function parseargs(ARGS::Vector{String})
         println("\t* diameter: make clusters have approximately the same diameter along first " *
                 "principal component after a cluster split")
         println("(note that always a cluster with largest residual sum of squares is selected for splitting)")
+        exit(1)
     end
 end
diff --git a/src/clustering.jl b/src/clustering.jl
@@ -0,0 +1,60 @@
+#######################
+# Community detection #
+#######################
+
+"""
+    louvain(edges::String)
+
+Calculate communities in a graph using the [Louvain algorithm](https://sites.google.com/site/findcommunities/)
+
+**Arguments**
+* `edges::String` name of file with edges definition
+"""
+function louvain(edges::String)
+    # Runs the bundled convert -> louvain -> hierarchy executables on the edgelist
+    # file `edges` and writes the resulting assignment to "<edges>.ecg".
+    # Returns whatever the final `run` call returns, as before.
+    BASEPATH = dirname(dirname(pathof(CGE)))
+    windows = Sys.iswindows()
+    bindir = joinpath(BASEPATH, "bin", windows ? "win" : "unix")
+    ext = windows ? ".exe" : ""
+    convert_exe = joinpath(bindir, "convert" * ext)
+    louvain_exe = joinpath(bindir, "louvain" * ext)
+    hierarchy_exe = joinpath(bindir, "hierarchy" * ext)
+    # Set mode 0o744 on the bundled unix binaries so they can be executed.
+    windows || chmod(joinpath(BASEPATH,"bin/unix/"),0o744,recursive=true)
+    # Intermediate files live in a private scratch directory that is removed when
+    # the block exits. Fixed names in a shared temp directory would let concurrent
+    # runs overwrite each other's files, and a hard-coded "/tmp" or ENV["TEMP"]
+    # is not guaranteed to exist.
+    return mktempdir() do scratch
+        graph_bin = joinpath(scratch, "louvain.bin")
+        tree_txt = joinpath(scratch, "louvain.txt")
+        run(`$convert_exe -i $edges -o $graph_bin`)
+        run(pipeline(`$louvain_exe $graph_bin -l -1 -q id_qual`, stdout=tree_txt))
+        run(pipeline(`$hierarchy_exe $tree_txt -l 1`, stdout="$edges.ecg"))
+    end
+end
+
+"""
+    louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1})
+
+Calculate communities in a weighted graph using the [Louvain algorithm](https://sites.google.com/site/findcommunities/)
+
+**Arguments**
+* `filename::String` name of file with edges definition
+* `edges::Array{Int,2}` list of edges
+* `weights::Array{Float64,1}` array of edges' weights
+"""
+function louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1})
+    # Writes `edges` and `weights` to scratch files, runs the bundled
+    # convert -> louvain -> hierarchy executables on them and writes the resulting
+    # assignment to "<filename>.ecg".
+    # Returns whatever the final `run` call returns, as before.
+    BASEPATH = dirname(dirname(pathof(CGE)))
+    windows = Sys.iswindows()
+    bindir = joinpath(BASEPATH, "bin", windows ? "win" : "unix")
+    ext = windows ? ".exe" : ""
+    convert_exe = joinpath(bindir, "convert" * ext)
+    louvain_exe = joinpath(bindir, "louvain" * ext)
+    hierarchy_exe = joinpath(bindir, "hierarchy" * ext)
+    # Set mode 0o744 on the bundled unix binaries so they can be executed.
+    windows || chmod(joinpath(BASEPATH,"bin/unix/"),0o744,recursive=true)
+    # Private scratch directory, removed when the block exits.
+    return mktempdir() do scratch
+        # Only the base name is used: joinpath drops the directory when `filename`
+        # is absolute, which would overwrite the caller's edgelist, and a relative
+        # path with directories would point at a subdirectory that does not exist.
+        tmp_edges = joinpath(scratch, basename(filename))
+        open(tmp_edges, "w") do f
+            # One edge per line. Iterating the matrix directly walks it element by
+            # element in column-major order, which loses the pairing of endpoints.
+            for r in axes(edges, 1)
+                println(f, join(view(edges, r, :), " "))
+            end
+        end
+        # NOTE(review): weights stay in a separate file passed via `-w`, as in the
+        # original; confirm this matches what the bundled convert/louvain expect.
+        tmp_weights = tmp_edges * ".weights"
+        open(tmp_weights, "w") do f
+            for w in weights println(f, w) end
+        end
+        graph_bin = joinpath(scratch, "louvain.bin")
+        tree_txt = joinpath(scratch, "louvain.txt")
+        run(`$convert_exe -i $tmp_edges -o $graph_bin -w $tmp_weights`)
+        run(pipeline(`$louvain_exe $graph_bin -w $tmp_weights -l -1 -q id_qual`, stdout=tree_txt))
+        run(pipeline(`$hierarchy_exe $tree_txt -l 1`, stdout="$filename.ecg"))
+    end
+end
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,6 @@ language: julia @@
     os:
       - osx
-      - linux
     julia:
       - 1.4
@@ Expand Down @@