Skip to content

Commit

Permalink
Adding Louvain clustering (weighted) on Mac and Windows
Browse files Browse the repository at this point in the history
  • Loading branch information
KrainskiL authored Aug 9, 2020
1 parent 25c1519 commit ee8678c
Show file tree
Hide file tree
Showing 14 changed files with 139 additions and 190 deletions.
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ language: julia

os:
- osx
- linux

julia:
- 1.4
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "CGE"
uuid = "f7ff1d1e-e254-4b26-babe-fc3421add060"
authors = ["KrainskiL <[email protected]>"]
version = "1.0.1"
version = "1.1.0"

[deps]
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ When comparing embeddings, lower divergence is better.
Format:

```
julia CGE_CLI.jl -g edgelist_file -c clusters_file -e embedding_file [-a -v] [-l landmarks -f forced -m method]
julia CGE_CLI.jl -g edgelist_file -e embedding_file [-c clusters_file] [-a -v] [-l landmarks -f forced -m method]
## required flags:
-g: the edgelist (1 per line, whitespace separated, optionally with weights)
-c: the communities (in vertices order, 1 per line)
-e: the embedding (two formats accepted, see details below)
## optional flags:
-c: the communities (in vertices order, 1 per line); if not given, they are calculated using the Louvain algorithm
-a: 'asis' flag, use if embedding is provided unordered with vertices in first column
-v: verbose, printing additional information
-l: number of landmarks to create
Expand Down Expand Up @@ -131,6 +131,7 @@ Additional weights may be provided in third column

Clusters can be 0-based or 1-based
Clusters: one value per line in the numerical order of the nodes
If not provided, clusters are calculated automatically with the Louvain algorithm

```
1
Expand Down
Binary file added bin/unix/convert
Binary file not shown.
Binary file added bin/unix/hierarchy
Binary file not shown.
Binary file added bin/unix/louvain
Binary file not shown.
Binary file added bin/win/convert.exe
Binary file not shown.
Binary file added bin/win/hierarchy.exe
Binary file not shown.
Binary file added bin/win/louvain.exe
Binary file not shown.
Empty file modified example/CGE_CLI.jl
100644 → 100755
Empty file.
4 changes: 4 additions & 0 deletions src/CGE.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ export landmarks
#divergence
export wGCL

#clustering
export louvain

# Include package code
include("auxilary.jl")
include("landmarks.jl")
include("divergence.jl")
include("clustering.jl")
end
191 changes: 15 additions & 176 deletions src/auxilary.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ function parseargs()
# Check if calculations should be verbose
verbose = !isnothing(findfirst(==("-v"),ARGS)) ? true : false

# Check for required arguments: -g graph_edgelist -c communities -e embedding -o outfile
@assert length(ARGS) >= 6
# Check for required arguments: -g graph_edgelist -e embedding
@assert length(ARGS) >= 4 "Graph edgelist and embedding files are required"

# Load obligatory files
################
Expand Down Expand Up @@ -110,8 +110,8 @@ function parseargs()
vweight[edges[i,2]]+=1.0
end
else
edges = convert.(Int,edges[:,1:2])
eweights = edges[:,3]
edges = convert.(Int,edges[:,1:2])
vweight = zeros(no_vertices)
for i in 1:rows
vweight[edges[i,1]]+=eweights[i]
Expand All @@ -123,174 +123,18 @@ function parseargs()
################
## Communities #
################

idx = findfirst(==("-c"),ARGS)
@assert !isnothing(idx) "Communities file is required"
fn_comm = ARGS[idx+1]

# Read communities
comm = readdlm(fn_comm,Int)
comm_rows, no_cols = size(comm)

# Validate file structure
@assert no_cols==1 "Expected 1 column"
v_min = minimum(comm[:,1])
@assert v_min==0 || v_min==1 "Communities should be either 0-based or 1-based"
c_min=minimum(comm)

# make communities 1-based
if c_min == 0
comm[:,1] .+=1
end
verbose && println("Done preparing communities")

##############
## Embedding #
##############

idx = findfirst(==("-e"),ARGS)
@assert !isnothing(idx) "Embedding file is required"
fn_embed = ARGS[idx+1]

# Read embedding
embedding = readdlm(fn_embed,Float64)

# Validate file
@assert comm_rows == size(embedding, 1) "No. rows in embedding and communities files differ"

# if embedding contains index in first column, sort by it and remove column
if asis == 0
embedding = embedding[sortperm(embedding[:,1]),2:end]
end
verbose && println("done preparing embedding")

#####################
## Output file name #
#####################
# idx = findfirst(==("-o"),ARGS)
# @assert !isnothing(idx)
# outfile = ARGS[idx+1]

#############
##Landmarks #
#############

# Transform communities
clusters = Dict{Any, Vector{Int}}()
for (i, c) in enumerate(comm[:,1])
if haskey(clusters, c)
push!(clusters[c], i)
else
clusters[c] = [i]
end
end
clusters = collect(values(clusters))

idx = findfirst(==("-l"),ARGS)
landmarks = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1
idx = findfirst(==("-f"),ARGS)
forced = !isnothing(idx) ? parse(Int, ARGS[idx+1]) : -1
idx = findfirst(==("-m"),ARGS)
method_str = !isnothing(idx) ? lowercase(strip(ARGS[idx+1])) : "rss"
method = methods[method_str]
return edges, eweights, vweight, comm, clusters, embedding, asis, verbose, landmarks, forced, method
catch e
showerror(stderr, e)
println("\n\nUsage:")
println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]")
println("\nParameters:")
println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights")
println("communities: rows should contain cluster identifiers of consecutive vertices")
println("embedding: rows should contain whitespace separated locations of vertices in embedding")
println("-a: flag for sorting embedding")
println("-v: flag for debugging messages")
println("landmarks: required number of landmarks")
println("forced: required maximum number of forced splits of a cluster")
println("method: one of:")
println("\t* rss: minimize maximum residual sum of squares when doing a cluster split")
println("\t* rss2: minimize maximum residual sum of squares when doing a cluster split (slower)")
println("\t* size: make clusters have approximately the same size after a cluster split")
println("\t* diameter: make clusters have approximately the same diameter along first " *
"principal component after a cluster split")
println("(note that always a cluster with largest residual sum of squares is selected for splitting)")
exit(1)
end
end

function parseargs(ARGS::Vector{String})
methods = Dict("rss" => split_cluster_rss,
"rss2" => split_cluster_rss2,
"size" => split_cluster_size,
"diameter" => split_cluster_diameter)
try
# Optional arguments
##Flags
asis = !isnothing(findfirst(==("-a"),ARGS)) ? true : false
# Check if calculations should be verbose
verbose = !isnothing(findfirst(==("-v"),ARGS)) ? true : false

# Check for required arguments: -g graph_edgelist -c communities -e embedding
@assert length(ARGS) >= 6

# Load obligatory files
################
## Graph edges #
################

idx = findfirst(==("-g"),ARGS)
@assert !isnothing(idx)
fn_edges = ARGS[idx+1]

# read edges
edges = readdlm(fn_edges, Float64)
rows, no_cols = size(edges)
verbose && println("$no_cols columns in graph edgelist file")

# Validate file structure
@assert no_cols==2 || no_cols==3 "Expected 2 or 3 columns"
v_min = minimum(edges[:,1:2])
@assert v_min==0 || v_min==1 "Vertices should be either 0-based or 1-based"

# make vertices 1-based
if v_min == 0.0
edges[:,1:2] .+= 1.0
end
no_vertices = Int(maximum(edges[:,1:2]))
verbose && println("Vertices from $v_min to $no_vertices")

# if unweighted, add unit weights
# compute vertex weights
if no_cols == 2
edges = convert.(Int,edges)
eweights = ones(rows)
vweight = zeros(no_vertices)
for i in 1:rows
vweight[edges[i,1]]+=1.0
vweight[edges[i,2]]+=1.0
end
if !isnothing(idx)
fn_comm = ARGS[idx+1]
comm = readdlm(fn_comm,Int)
else
edges = convert.(Int,edges[:,1:2])
eweights = edges[:,3]
vweight = zeros(no_vertices)
for i in 1:rows
vweight[edges[i,1]]+=eweights[i]
vweight[edges[i,2]]+=eweights[i]
end
no_cols == 2 ? louvain(fn_edges,) : louvain(fn_edges, edges, eweights)
fn_comm = fn_edges*".ecg"
comm = readdlm(fn_comm,Int)
comm = comm[2:end,2]
comm = reshape(comm,size(comm)[1],1)
end
verbose && println("Done preparing edgelist and vertex weight")

################
## Communities #
################

idx = findfirst(==("-c"),ARGS)
@assert !isnothing(idx)
fn_comm = ARGS[idx+1]

# Read communities
comm = readdlm(fn_comm,Int)
comm_rows, no_cols = size(comm)

# Validate file structure
@assert no_cols==1 "Expected 1 column"
v_min = minimum(comm[:,1])
Expand All @@ -308,7 +152,7 @@ function parseargs(ARGS::Vector{String})
##############

idx = findfirst(==("-e"),ARGS)
@assert !isnothing(idx)
@assert !isnothing(idx) "Embedding file is required"
fn_embed = ARGS[idx+1]

# Read embedding
Expand All @@ -323,13 +167,6 @@ function parseargs(ARGS::Vector{String})
end
verbose && println("done preparing embedding")

#####################
## Output file name #
#####################
# idx = findfirst(==("-o"),ARGS)
# @assert !isnothing(idx)
# outfile = ARGS[idx+1]

#############
##Landmarks #
#############
Expand All @@ -356,10 +193,11 @@ function parseargs(ARGS::Vector{String})
catch e
showerror(stderr, e)
println("\n\nUsage:")
println("\tjulia CGE.jl -g graph_edgelist -c communities -e embedding [-a -v] [-l landmarks -f forced -m method]")
println("\tjulia CGE.jl -g graph_edgelist -e embedding [-c communities] [-a -v] [-l landmarks -f forced -m method]")
println("\nParameters:")
println("graph_edgelist: rows should contain two vertices ids (edge) and optional weights")
println("communities: rows should contain cluster identifiers of consecutive vertices")
println("if no file is given communities are calculated with Louvain algorithm")
println("embedding: rows should contain whitespace separated locations of vertices in embedding")
println("-a: flag for sorting embedding")
println("-v: flag for debugging messages")
Expand All @@ -372,5 +210,6 @@ function parseargs(ARGS::Vector{String})
println("\t* diameter: make clusters have approximately the same diameter along first " *
"principal component after a cluster split")
println("(note that always a cluster with largest residual sum of squares is selected for splitting)")
exit(1)
end
end
60 changes: 60 additions & 0 deletions src/clustering.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#######################
# Community detection #
#######################

"""
    louvain(edges::String)

Calculate communities in graph using
[Louvain algorithm](https://sites.google.com/site/findcommunities/).

Runs the bundled `convert`, `louvain` and `hierarchy` executables (taken from `bin/win`
on Windows and from `bin/unix` otherwise) one after another and redirects the standard
output of `hierarchy` to the file `edges * ".ecg"`. Intermediate files are created in a
fresh temporary directory which is removed when the function finishes, so concurrent
calls do not overwrite each other's scratch files.

Returns the value of the final `run` call. An error is thrown if any of the external
programs fails.

**Arguments**
* `edges::String` name of file with edges definition
"""
function louvain(edges::String)
    BASEPATH = dirname(dirname(pathof(CGE)))
    if Sys.iswindows()
        bindir, ext = joinpath(BASEPATH, "bin", "win"), ".exe"
    else
        bindir, ext = joinpath(BASEPATH, "bin", "unix"), ""
        # NOTE(review): presumably needed because the installed package files are not
        # executable by default -- confirm before removing.
        chmod(bindir, 0o744, recursive=true)
    end
    convert_bin = joinpath(bindir, "convert" * ext)
    louvain_bin = joinpath(bindir, "louvain" * ext)
    hierarchy_bin = joinpath(bindir, "hierarchy" * ext)

    # A private scratch directory replaces the fixed `<tmp>/louvain.bin` and
    # `<tmp>/louvain.txt` names and is cleaned up even if one of the tools fails.
    return mktempdir() do tmp
        graph_bin = joinpath(tmp, "louvain.bin")
        tree_txt = joinpath(tmp, "louvain.txt")
        run(`$convert_bin -i $edges -o $graph_bin`)
        run(pipeline(`$louvain_bin $graph_bin -l -1 -q id_qual`, stdout=tree_txt))
        run(pipeline(`$hierarchy_bin $tree_txt -l 1`, stdout="$edges.ecg"))
    end
end

"""
    louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1})

Calculate communities in weighted graph using
[Louvain algorithm](https://sites.google.com/site/findcommunities/).

Writes `edges` (one edge per line, endpoints separated by a space) and `weights` (one
value per line) to files in a fresh temporary directory, runs the bundled `convert`,
`louvain` and `hierarchy` executables (taken from `bin/win` on Windows and from
`bin/unix` otherwise) on them and redirects the standard output of `hierarchy` to the
file `filename * ".ecg"`. The file `filename` itself is never opened or modified. The
temporary directory is removed when the function finishes.

Returns the value of the final `run` call. An error is thrown if any of the external
programs fails.

**Arguments**
* `filename::String` name of file with edges definition
* `edges::Array{Int,2}` list of edges
* `weights::Array{Float64,1}` array of edges' weights
"""
function louvain(filename::String, edges::Array{Int,2}, weights::Array{Float64,1})
    BASEPATH = dirname(dirname(pathof(CGE)))
    if Sys.iswindows()
        bindir, ext = joinpath(BASEPATH, "bin", "win"), ".exe"
    else
        bindir, ext = joinpath(BASEPATH, "bin", "unix"), ""
        # NOTE(review): presumably needed because the installed package files are not
        # executable by default -- confirm before removing.
        chmod(bindir, 0o744, recursive=true)
    end
    convert_bin = joinpath(bindir, "convert" * ext)
    louvain_bin = joinpath(bindir, "louvain" * ext)
    hierarchy_bin = joinpath(bindir, "hierarchy" * ext)

    return mktempdir() do tmp
        # Fixed names inside the private directory: building the scratch path from
        # `filename` would resolve to `filename` itself whenever it is an absolute path
        # (joinpath discards everything before an absolute component) and the "w" open
        # below would then truncate the caller's input file.
        tmp_edges = joinpath(tmp, "louvain.edges")
        tmp_weights = joinpath(tmp, "louvain.weights")
        graph_bin = joinpath(tmp, "louvain.bin")
        tree_txt = joinpath(tmp, "louvain.txt")

        # Iterating a matrix directly yields single elements in column-major order, so
        # every row is written explicitly to keep the endpoints of an edge together.
        open(tmp_edges, "w") do f
            for r in axes(edges, 1)
                println(f, join(view(edges, r, :), ' '))
            end
        end
        open(tmp_weights, "w") do f
            for w in weights
                println(f, w)
            end
        end
        # NOTE(review): the weights are passed as a separate file via `-w`, exactly as
        # before; confirm against the bundled `convert` binary that this is the input
        # layout it expects for weighted graphs.
        run(`$convert_bin -i $tmp_edges -o $graph_bin -w $tmp_weights`)
        run(pipeline(`$louvain_bin $graph_bin -w $tmp_weights -l -1 -q id_qual`,
                     stdout=tree_txt))
        run(pipeline(`$hierarchy_bin $tree_txt -l 1`, stdout="$filename.ecg"))
    end
end
Loading

0 comments on commit ee8678c

Please sign in to comment.