diff --git a/.travis.yml b/.travis.yml index 7dda688..711c4ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ language: julia os: - osx - - linux julia: - 1.4 diff --git a/Project.toml b/Project.toml index 7fe805d..1a76f69 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "CGE" uuid = "f7ff1d1e-e254-4b26-babe-fc3421add060" authors = ["KrainskiL "] -version = "1.0.1" +version = "1.1.0" [deps] DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" diff --git a/README.md b/README.md index bf7b8f7..a677f3d 100644 --- a/README.md +++ b/README.md @@ -56,13 +56,13 @@ When comparing embeddings, lower divergence is better. Format: ``` -julia CGE_CLI.jl -g edgelist_file -c clusters_file -e embedding_file [-a -v] [-l landmarks -f forced -m method] +julia CGE_CLI.jl -g edgelist_file -e embedding_file [-c clusters_file] [-a -v] [-l landmarks -f forced -m method] ## required flags: -g: the edgelist (1 per line, whitespace separated, optionally with weights) --c: the communities (in vertices order, 1 per line) -e: the embedding (two formats accepted, see details below) ## optional flags: +-c: the communities (in vertices order, 1 per line); if not given, they are calculated using the Louvain algorithm -a: 'asis' flag, use if embedding is provided unordered with vertices in first column -v: verbose, printing additional information -l: number of landmarks to create @@ -131,6 +131,7 @@ Additional weights may be provided in third column Clusters can be 0-based or 1-based Clusters: one value per line in the numerical order of the nodes +If not provided, clusters will be automatically calculated with the Louvain algorithm ``` 1 diff --git a/bin/unix/convert b/bin/unix/convert new file mode 100755 index 0000000..e4dc69a Binary files /dev/null and b/bin/unix/convert differ diff --git a/bin/unix/hierarchy b/bin/unix/hierarchy new file mode 100755 index 0000000..803c63c Binary files /dev/null and b/bin/unix/hierarchy differ diff --git a/bin/unix/louvain b/bin/unix/louvain new 
file mode 100755 index 0000000..257331d Binary files /dev/null and b/bin/unix/louvain differ diff --git a/bin/win/convert.exe b/bin/win/convert.exe new file mode 100644 index 0000000..0a129c6 Binary files /dev/null and b/bin/win/convert.exe differ diff --git a/bin/win/hierarchy.exe b/bin/win/hierarchy.exe new file mode 100644 index 0000000..60e88c9 Binary files /dev/null and b/bin/win/hierarchy.exe differ diff --git a/bin/win/louvain.exe b/bin/win/louvain.exe new file mode 100644 index 0000000..1a92699 Binary files /dev/null and b/bin/win/louvain.exe differ diff --git a/example/CGE_CLI.jl b/example/CGE_CLI.jl old mode 100644 new mode 100755 diff --git a/src/CGE.jl b/src/CGE.jl index cc47474..1d11aec 100644 --- a/src/CGE.jl +++ b/src/CGE.jl @@ -13,8 +13,12 @@ export landmarks #divergence export wGCL +#clustering +export louvain + # Include package code include("auxilary.jl") include("landmarks.jl") include("divergence.jl") +include("clustering.jl") end diff --git a/src/auxilary.jl b/src/auxilary.jl index 326cbac..f4fb783 100644 --- a/src/auxilary.jl +++ b/src/auxilary.jl @@ -70,8 +70,8 @@ function parseargs() # Check if calculations should be verbose verbose = !isnothing(findfirst(==("-v"),ARGS)) ? 
true : false - # Check for required arguments: -g graph_edgelist -c communities -e embedding -o outfile - @assert length(ARGS) >= 6 + # Check for required arguments: -g graph_edgelist -e embedding + @assert length(ARGS) >= 4 "Graph edgelist and embedding files are required" # Load obligatory files ################ @@ -110,8 +110,8 @@ function parseargs() vweight[edges[i,2]]+=1.0 end else - edges = convert.(Int,edges[:,1:2]) eweights = edges[:,3] + edges = convert.(Int,edges[:,1:2]) vweight = zeros(no_vertices) for i in 1:rows vweight[edges[i,1]]+=eweights[i] @@ -123,174 +123,18 @@ function parseargs() ################ ## Communities # ################ - idx = findfirst(==("-c"),ARGS) - @assert !isnothing(idx) "Communities file is required" - fn_comm = ARGS[idx+1] - - # Read communities - comm = readdlm(fn_comm,Int) - comm_rows, no_cols = size(comm) - - # Validate file structure - @assert no_cols==1 "Expected 1 column" - v_min = minimum(comm[:,1]) - @assert v_min==0 || v_min==1 "Communities should be either 0-based or 1-based" - c_min=minimum(comm) - - # make communities 1-based - if c_min == 0 - comm[:,1] .+=1 - end - verbose && println("Done preparing communities") - - ############## - ## Embedding # - ############## - - idx = findfirst(==("-e"),ARGS) - @assert !isnothing(idx) "Embedding file is required" - fn_embed = ARGS[idx+1] - - # Read embedding - embedding = readdlm(fn_embed,Float64) - - # Validate file - @assert comm_rows == size(embedding, 1) "No. 
rows in embedding and communities files differ" - - # if embedding contains index in first column, sort by it and remove column - if asis == 0 - embedding = embedding[sortperm(embedding[:,1]),2:end] - end - verbose && println("done preparing embedding") - - ##################### - ## Output file name # - ##################### - # idx = findfirst(==("-o"),ARGS) - # @assert !isnothing(idx) - # outfile = ARGS[idx+1] - - ############# - ##Landmarks # - ############# - - # Transform communities - clusters = Dict{Any, Vector{Int}}() - for (i, c) in enumerate(comm[:,1]) - if haskey(clusters, c) - push!(clusters[c], i) - else - clusters[c] = [i] - end - end - clusters = collect(values(clusters)) - - idx = findfirst(==("-l"),ARGS) - landmarks = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1 - idx = findfirst(==("-f"),ARGS) - forced = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1 - idx = findfirst(==("-m"),ARGS) - method_str = !isnothing(idx) ? lowercase(strip(ARGS[idx+1])) : "rss" - method = methods[method_str] - return edges, eweights, vweight, comm, clusters, embedding, asis, verbose, landmarks, forced, method - catch e - showerror(stderr, e) - println("\n\nUsage:") - println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]") - println("\nParameters:") - println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights") - println("communities: rows should contain cluster identifiers of consecutive vertices") - println("embedding: rows should contain whitespace separated locations of vertices in embedding") - println("-a: flag for sorting embedding") - println("-v: flag for debugging messages") - println("landmarks: required number of landmarks") - println("forced: required maximum number of forced splits of a cluster") - println("method: one of:") - println("\t* rss: minimize maximum residual sum of squares when doing a cluster split") - println("\t* rss2: minimize maximum residual sum of 
squares when doing a cluster split (slower)") - println("\t* size: make clusters have approximately the same size after a cluster split") - println("\t* diameter: make clusters have approximately the same diameter along first " * - "principal component after a cluster split") - println("(note that always a cluster with largest residual sum of squares is selected for splitting)") - exit(1) - end -end - -function parseargs(ARGS::Vector{String}) - methods = Dict("rss" => split_cluster_rss, - "rss2" => split_cluster_rss2, - "size" => split_cluster_size, - "diameter" => split_cluster_diameter) - try - # Optional arguments - ##Flags - asis = !isnothing(findfirst(==("-a"),ARGS)) ? true : false - # Check if calculations should be verbose - verbose = !isnothing(findfirst(==("-v"),ARGS)) ? true : false - - # Check for required arguments: -g graph_edgelist -c communities -e embedding - @assert length(ARGS) >= 6 - - # Load obligatory files - ################ - ## Graph edges # - ################ - - idx = findfirst(==("-g"),ARGS) - @assert !isnothing(idx) - fn_edges = ARGS[idx+1] - - # read edges - edges = readdlm(fn_edges, Float64) - rows, no_cols = size(edges) - verbose && println("$no_cols columns in graph edgelist file") - - # Validate file structure - @assert no_cols==2 || no_cols==3 "Expected 2 or 3 columns" - v_min = minimum(edges[:,1:2]) - @assert v_min==0 || v_min==1 "Vertices should be either 0-based or 1-based" - - # make vertices 1-based - if v_min == 0.0 - edges[:,1:2] .+= 1.0 - end - no_vertices = Int(maximum(edges[:,1:2])) - verbose && println("Vertices from $v_min to $no_vertices") - - # if unweighted, add unit weights - # compute vertex weights - if no_cols == 2 - edges = convert.(Int,edges) - eweights = ones(rows) - vweight = zeros(no_vertices) - for i in 1:rows - vweight[edges[i,1]]+=1.0 - vweight[edges[i,2]]+=1.0 - end + if !isnothing(idx) + fn_comm = ARGS[idx+1] + comm = readdlm(fn_comm,Int) else - edges = convert.(Int,edges[:,1:2]) - eweights = edges[:,3] 
- vweight = zeros(no_vertices) - for i in 1:rows - vweight[edges[i,1]]+=eweights[i] - vweight[edges[i,2]]+=eweights[i] - end + no_cols == 2 ? louvain(fn_edges,) : louvain(fn_edges, edges, eweights) + fn_comm = fn_edges*".ecg" + comm = readdlm(fn_comm,Int) + comm = comm[2:end,2] + comm = reshape(comm,size(comm)[1],1) end - verbose && println("Done preparing edgelist and vertex weight") - - ################ - ## Communities # - ################ - - idx = findfirst(==("-c"),ARGS) - @assert !isnothing(idx) - fn_comm = ARGS[idx+1] - - # Read communities - comm = readdlm(fn_comm,Int) comm_rows, no_cols = size(comm) - # Validate file structure @assert no_cols==1 "Expected 1 column" v_min = minimum(comm[:,1]) @@ -308,7 +152,7 @@ function parseargs(ARGS::Vector{String}) ############## idx = findfirst(==("-e"),ARGS) - @assert !isnothing(idx) + @assert !isnothing(idx) "Embedding file is required" fn_embed = ARGS[idx+1] # Read embedding @@ -323,13 +167,6 @@ function parseargs(ARGS::Vector{String}) end verbose && println("done preparing embedding") - ##################### - ## Output file name # - ##################### - # idx = findfirst(==("-o"),ARGS) - # @assert !isnothing(idx) - # outfile = ARGS[idx+1] - ############# ##Landmarks # ############# @@ -356,10 +193,11 @@ function parseargs(ARGS::Vector{String}) catch e showerror(stderr, e) println("\n\nUsage:") - println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]") + println("\tjulia CGE.jl -g graph_edgelist -e embedding [-c communities] [-a -v] [-l landmarks -f forced -m method]") println("\nParameters:") println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights") println("communities: rows should contain cluster identifiers of consecutive vertices") + println("if no file is given communities are calculated with Louvain algorithm") println("embedding: rows should contain whitespace separated locations of vertices in embedding") 
println("-a: flag for sorting embedding") println("-v: flag for debugging messages") @@ -372,5 +210,6 @@ function parseargs(ARGS::Vector{String}) println("\t* diameter: make clusters have approximately the same diameter along first " * "principal component after a cluster split") println("(note that always a cluster with largest residual sum of squares is selected for splitting)") + exit(1) end end \ No newline at end of file diff --git a/src/clustering.jl b/src/clustering.jl new file mode 100644 index 0000000..bb534e3 --- /dev/null +++ b/src/clustering.jl @@ -0,0 +1,60 @@ +####################### +# Community detection # +####################### + +""" + louvain(edges::String) + +Calculate communities in graph using [Louvain algorithm](https://sites.google.com/site/findcommunities/) + +**Arguments** +* `edges::String` name of file with edges definition +""" +function louvain(edges::String) + BASEPATH = dirname(dirname(pathof(CGE))) + TMP = Sys.iswindows() ? ENV["TEMP"] : "/tmp" + if Sys.iswindows() + TMP = ENV["TEMP"] + run(`$BASEPATH/bin/win/convert.exe -i $edges -o $TMP/louvain.bin`) + run(pipeline(`$BASEPATH/bin/win/louvain.exe $TMP/louvain.bin -l -1 -q id_qual`, stdout="$TMP/louvain.txt")) + run(pipeline(`$BASEPATH/bin/win/hierarchy.exe $TMP/louvain.txt -l 1`,"$edges.ecg")) + else + chmod(joinpath(BASEPATH,"bin/unix/"),0o744,recursive=true) + run(`$BASEPATH/bin/unix/convert -i $edges -o $TMP/louvain.bin`) + run(pipeline(`$BASEPATH/bin/unix/louvain $TMP/louvain.bin -l -1 -q id_qual`, stdout="$TMP/louvain.txt")) + run(pipeline(`$BASEPATH/bin/unix/hierarchy $TMP/louvain.txt -l 1`,"$edges.ecg")) + end +end + +""" + louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1}) + +Calculate communities in weighted graph using [Louvain algorithm](https://sites.google.com/site/findcommunities/) + +**Arguments** +* `filename::String` name of file with edges definition +* `edges::Array{Int,2}` list of edges +* `weights::Array{Float64,1}` array of edges' 
weights +""" +function louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1}) + BASEPATH = dirname(dirname(pathof(CGE))) + TMP = Sys.iswindows() ? ENV["TEMP"] : "/tmp" + tmp_edges = joinpath(TMP,filename) + open(tmp_edges, "w") do f + for i in edges println(f, i) end + end + tmp_weights = joinpath(TMP,filename*".weights") + open(tmp_weights, "w") do f + for i in weights println(f, i) end + end + if Sys.iswindows() + run(`$BASEPATH/bin/win/convert.exe -i $tmp_edges -o $TMP/louvain.bin -w $tmp_weights`) + run(pipeline(`$BASEPATH/bin/win/louvain.exe $TMP/louvain.bin -w $tmp_weights -l -1 -q id_qual`, stdout="$TMP/louvain.txt")) + run(pipeline(`$BASEPATH/bin/win/hierarchy.exe $TMP/louvain.txt -l 1`,"$filename.ecg")) + else + chmod(joinpath(BASEPATH,"bin/unix/"),0o744,recursive=true) + run(`$BASEPATH/bin/unix/convert -i $tmp_edges -o $TMP/louvain.bin -w $tmp_weights`) + run(pipeline(`$BASEPATH/bin/unix/louvain $TMP/louvain.bin -w $tmp_weights -l -1 -q id_qual`, stdout="$TMP/louvain.txt")) + run(pipeline(`$BASEPATH/bin/unix/hierarchy $TMP/louvain.txt -l 1`,"$filename.ecg")) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 3c15c26..4acbca0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,10 @@ using Test using CGE -argslist=["-g","test.edgelist","-c","test.ecg","-e","test.embedding","-l","20","-f","1","-m","rss"]; -edges, weights, vweights, comm, clusters, embed, asis, verbose, land, forced, method = parseargs(argslist) +for e in ["-g","test.edgelist","-c","test.ecg","-e","test.embedding","-l","20","-f","1","-m","rss"] + push!(Base.ARGS,e) +end +edges, weights, vweights, comm, clusters, embed, asis, verbose, land, forced, method = parseargs() @testset "parsing" begin @@ -27,23 +29,59 @@ edges, weights, vweights, comm, clusters, embed, asis, verbose, land, forced, me end -distances, embed, comm, edges, weights = landmarks(edges, weights, vweights, clusters, comm, +distances, lembed, lcomm, 
ledges, lweights = landmarks(edges, weights, vweights, clusters, comm, embed, verbose, land, forced, method); -@testset "landmarks" begin +@testset "landmarks-rss" begin -@test typeof(edges) == Array{Int,2} -@test minimum(edges) == 1 + @test typeof(ledges) == Array{Int,2} + @test minimum(ledges) == 1 + @test typeof(lweights) == Array{Float64,1} + @test typeof(lcomm) == Array{Int,2} + @test size(lcomm,2) == 1 + @test typeof(distances) == Array{Float64,1} +end -@test typeof(weights) == Array{Float64,1} +distances, lembed, lcomm, ledges, lweights = landmarks(edges, weights, vweights, clusters, comm, +embed, verbose, land, forced, CGE.split_cluster_rss2); -@test typeof(comm) == Array{Int,2} -@test size(comm,2) == 1 +@testset "landmarks-rss2" begin + +@test typeof(ledges) == Array{Int,2} +@test minimum(ledges) == 1 +@test typeof(lweights) == Array{Float64,1} +@test typeof(lcomm) == Array{Int,2} +@test size(lcomm,2) == 1 +@test typeof(distances) == Array{Float64,1} +end +distances, lembed, lcomm, ledges, lweights = landmarks(edges, weights, vweights, clusters, comm, +embed, verbose, land, forced, CGE.split_cluster_size); + +@testset "landmarks-size" begin + +@test typeof(ledges) == Array{Int,2} +@test minimum(ledges) == 1 +@test typeof(lweights) == Array{Float64,1} +@test typeof(lcomm) == Array{Int,2} +@test size(lcomm,2) == 1 +@test typeof(distances) == Array{Float64,1} +end + +distances, lembed, lcomm, ledges, lweights = landmarks(edges, weights, vweights, clusters, comm, +embed, verbose, land, forced, CGE.split_cluster_diameter); + +@testset "landmarks-diameter" begin + +@test typeof(ledges) == Array{Int,2} +@test minimum(ledges) == 1 +@test typeof(lweights) == Array{Float64,1} +@test typeof(lcomm) == Array{Int,2} +@test size(lcomm,2) == 1 @test typeof(distances) == Array{Float64,1} end -results = wGCL(edges, weights, comm, embed, distances, verbose); +results = wGCL(ledges, lweights, lcomm, lembed, distances, verbose); @testset "wgcl" begin @@ -51,3 +89,11 @@ results 
= wGCL(edges, weights, comm, embed, distances, verbose); @test results[1] <=10.0 end + +louvain("test.edgelist") + +@testset "clustering" begin + +@test isfile("test.edgelist.ecg") +isfile("test.edgelist.ecg") && rm("test.edgelist.ecg") +end \ No newline at end of file