From 156b86074e6a680a15cfd48becd92f8006bf377e Mon Sep 17 00:00:00 2001 From: Heiko Date: Sat, 10 Aug 2024 00:05:33 +0200 Subject: [PATCH] [GA-165] Phenolrs: Support Edge Values in COO Loading for `NetworkXGraph` (#29) * support to write edge values when using data structure coo * u64 instead of vec * fixed insert edge in networkx graph when "@collection_name" is being set * black fmt * cargo fmt * add EdgeValuesDict and use it * black fmt * add a test for edge values * added failure test * removed comment, added todo * use latest lib vers 0.0.7, code cleanup * fmt * Update python/tests/test_all.py * put coo relevant logic in coo relevant code block, f64 instead of u64 * Update python/phenolrs/networkx/typings.py Co-authored-by: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> * applied proper error testing code review request * remove not required res * more tests * black fmt * remove `bool` as possible `EdgeValue` type (my mistake) --------- Co-authored-by: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Co-authored-by: Anthony Mahanna --- Cargo.lock | 4 +- Cargo.toml | 2 +- python/phenolrs/networkx/loader.py | 29 +++-- python/phenolrs/networkx/typings.py | 1 + python/phenolrs/phenolrs.pyi | 2 + python/tests/conftest.py | 44 ++++++++ python/tests/test_all.py | 161 +++++++++++++++++++++++++++- src/graph.rs | 46 ++++++-- src/lib.rs | 21 ++-- src/output/construct.rs | 14 +++ 10 files changed, 292 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3885de..497a8aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,9 +40,9 @@ checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" [[package]] name = "arangors-graph-exporter" -version = "0.0.6" +version = "0.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "551cbd6efc058ff4dcd79a675660c9640d60a9aa0a6d03e891e9dd9102d35b51" +checksum = "9d073d5151e27cf0d3a1c51a5f088e24d4ded0353e714469c4963c2aba72bab8" dependencies = [ "bytes", "log", diff --git a/Cargo.toml b/Cargo.toml index 139fe8a..0389495 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ lto = "fat" # TODO: add rustflags for target features/cpus [dependencies] -arangors-graph-exporter = "0.0.6" +arangors-graph-exporter = "0.0.7" numpy = "0.20.0" tokio = { version = "1", features = ["full"] } bytes = "1.5.0" diff --git a/python/phenolrs/networkx/loader.py b/python/phenolrs/networkx/loader.py index 7299858..d4490a1 100644 --- a/python/phenolrs/networkx/loader.py +++ b/python/phenolrs/networkx/loader.py @@ -12,6 +12,7 @@ MultiGraphAdjDict, NodeDict, SrcIndices, + EdgeValuesDict, ) @@ -41,6 +42,7 @@ def load_into_networkx( DstIndices, EdgeIndices, ArangoIDtoIndex, + EdgeValuesDict, ]: if "vertexCollections" not in metagraph: raise PhenolError("vertexCollections not found in metagraph") @@ -113,16 +115,22 @@ def load_into_networkx( for e_col_name, entries in metagraph["edgeCollections"].items() ] - node_dict, adj_dict, src_indices, dst_indices, edge_indices, id_to_index_map = ( - graph_to_networkx_format( - request={ - "vertex_collections": vertex_collections, - "edge_collections": edge_collections, - "database_config": db_config_options, - "load_config": load_config_options, - }, - graph_config=graph_config, # TODO Anthony: Move into request - ) + ( + node_dict, + adj_dict, + src_indices, + dst_indices, + edge_indices, + id_to_index_map, + edge_values, + ) = graph_to_networkx_format( + request={ + "vertex_collections": vertex_collections, + "edge_collections": edge_collections, + "database_config": db_config_options, + "load_config": load_config_options, + }, + graph_config=graph_config, # TODO Anthony: Move into request ) return ( @@ -132,4 +140,5 @@ def load_into_networkx( dst_indices, edge_indices, id_to_index_map, + edge_values, ) diff --git a/python/phenolrs/networkx/typings.py b/python/phenolrs/networkx/typings.py index ac45c80..066a824 100644 --- a/python/phenolrs/networkx/typings.py +++ b/python/phenolrs/networkx/typings.py @@ -9,6 +9,7 @@ DiGraphAdjDict = dict[str, GraphAdjDict] MultiGraphAdjDict = dict[str, dict[str, dict[int, Json]]] MultiDiGraphAdjDict = dict[str, MultiGraphAdjDict] +EdgeValuesDict = dict[str, list[int | float]] SrcIndices = npt.NDArray[np.int64] DstIndices = npt.NDArray[np.int64] diff --git a/python/phenolrs/phenolrs.pyi b/python/phenolrs/phenolrs.pyi index ead9ba9..ffc123a 100644 --- a/python/phenolrs/phenolrs.pyi +++ b/python/phenolrs/phenolrs.pyi @@ -13,6 +13,7 @@ from .networkx.typings import ( MultiGraphAdjDict, NodeDict, SrcIndices, + EdgeValuesDict, ) from .numpy.typings import ( ArangoCollectionToArangoKeyToIndex, @@ -36,6 +37,7 @@ def graph_to_networkx_format( DstIndices, EdgeIndices, ArangoIDtoIndex, + EdgeValuesDict, ]: ... class PhenolError(Exception): ... diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 39e3540..d57ca0f 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -60,6 +60,50 @@ def abide_db_name() -> str: return "abide" +@pytest.fixture(scope="module") +def custom_graph_db_name() -> str: + return "custom_graph" + + +@pytest.fixture(scope="module") +def load_line_graph( + custom_graph_db_name: str, connection_information: Dict[str, Any] +) -> None: + client = arango.ArangoClient(connection_information["url"]) + sys_db = client.db( + "_system", + username=connection_information["username"], + password=connection_information["password"], + ) + + if not sys_db.has_database(custom_graph_db_name): + sys_db.delete_database(custom_graph_db_name, ignore_missing=True) + sys_db.create_database(custom_graph_db_name) + custom_graph_db = client.db( + custom_graph_db_name, + username=connection_information["username"], + password=connection_information["password"], + ) + + edge_def = [ + { + "edge_collection": "line_graph_edges", + "from_vertex_collections": ["line_graph_vertices"], + "to_vertex_collections": ["line_graph_vertices"], + } + ] + + G = nx.Graph() + G.add_edge(0, 1, boolean_weight=True, int_value=1, float_value=1.1) + G.add_edge(1, 2, boolean_weight=False, int_value=2, float_value=2.2) + G.add_edge(2, 3, boolean_weight=True, int_value=3, float_value=3.3) + G.add_edge(3, 4, boolean_weight=False, int_value=4, float_value=4.4) + + ADBNX_Adapter(custom_graph_db).networkx_to_arangodb( + custom_graph_db_name, G, edge_def + ) + + @pytest.fixture(scope="module") def load_karate(karate_db_name: str, connection_information: Dict[str, Any]) -> None: client = arango.ArangoClient(connection_information["url"]) diff --git a/python/tests/test_all.py b/python/tests/test_all.py index 5cd7e4a..5cc169a 100644 --- a/python/tests/test_all.py +++ b/python/tests/test_all.py @@ -132,6 +132,8 @@ def test_karate_networkx( adj_dict: Any from_key = "person/1" to_key = "person/2" + # TODO: This value is actually never used. This var + # is going to be overwritten. # MultiDiGraph res = NetworkXLoader.load_into_networkx( @@ -154,6 +156,7 @@ def test_karate_networkx( dst_indices, edge_indices, vertex_ids_to_indices, + edge_values, ) = res assert isinstance(node_dict, dict) @@ -161,8 +164,10 @@ def test_karate_networkx( assert isinstance(src_indices, numpy.ndarray) assert isinstance(dst_indices, numpy.ndarray) assert isinstance(vertex_ids_to_indices, dict) + assert isinstance(edge_values, dict) assert len(node_dict) == len(vertex_ids_to_indices) == 34 assert len(src_indices) == len(dst_indices) == len(edge_indices) == 78 + assert len(edge_values) == 0 assert set(adj_dict.keys()) == {"succ", "pred"} succ = adj_dict["succ"] @@ -213,10 +218,13 @@ def test_karate_networkx( dst_indices, edge_indices, vertex_ids_to_indices, + edge_values, ) = res assert from_key in adj_dict["succ"][to_key] assert to_key in adj_dict["pred"][from_key] assert len(src_indices) == len(dst_indices) == len(edge_indices) == 156 + assert isinstance(edge_values, dict) + assert len(edge_values) == 0 # DiGraph res = NetworkXLoader.load_into_networkx( @@ -239,6 +247,7 @@ def test_karate_networkx( dst_indices, edge_indices, vertex_ids_to_indices, + edge_values, ) = res assert len(src_indices) == len(dst_indices) == 78 @@ -247,6 +256,8 @@ def test_karate_networkx( for to_id, edge in adj.items(): assert isinstance(edge, dict) assert edge == adj_dict["pred"][to_id][from_id] + assert isinstance(edge_values, dict) + assert len(edge_values) == 0 # MultiGraph res = NetworkXLoader.load_into_networkx( @@ -269,6 +280,7 @@ def test_karate_networkx( dst_indices, edge_indices, vertex_ids_to_indices, + edge_values, ) = res assert ( @@ -283,6 +295,8 @@ def test_karate_networkx( assert len(adj_dict[from_key][to_key]) == 1 assert type(next(iter(adj_dict[from_key][to_key].keys()))) is int assert isinstance(adj_dict[from_key][to_key][0], dict) # type: ignore + assert isinstance(edge_values, dict) + assert len(edge_values) == 0 # Graph res = NetworkXLoader.load_into_networkx( @@ -305,12 +319,15 @@ def test_karate_networkx( dst_indices, edge_indices, vertex_ids_to_indices, + edge_values, ) = res assert len(edge_indices) == 0 assert len(adj_dict[from_key][to_key]) > 1 for key in adj_dict[from_key][to_key].keys(): assert isinstance(key, str) + assert isinstance(edge_values, dict) + assert len(edge_values) == 0 # Graph (no vertex/edge attributes) res = NetworkXLoader.load_into_networkx( @@ -329,7 +346,15 @@ def test_karate_networkx( is_directed=False, is_multigraph=False, ) - node_dict, adj_dict, _, _, _, _ = res + ( + node_dict, + adj_dict, + _, + _, + _, + _, + _, + ) = res assert len(node_dict) == len(adj_dict) > 0 for v in node_dict.values(): @@ -343,7 +368,7 @@ def test_karate_networkx( # Graph (custom vertex/edge attributes) with pytest.raises(PhenolError): - res = NetworkXLoader.load_into_networkx( + NetworkXLoader.load_into_networkx( karate_db_name, { "vertexCollections": {"person": {"club"}}, @@ -356,7 +381,7 @@ def test_karate_networkx( ) with pytest.raises(PhenolError): - res = NetworkXLoader.load_into_networkx( + NetworkXLoader.load_into_networkx( karate_db_name, { "vertexCollections": {"person": {"club"}}, @@ -386,7 +411,7 @@ def test_karate_networkx( is_multigraph=False, ) - node_dict, adj_dict, _, _, _, _ = res + node_dict, adj_dict, _, _, _, _, _ = res assert len(node_dict) == len(adj_dict) > 0 for v in node_dict.values(): @@ -398,6 +423,132 @@ def test_karate_networkx( assert isinstance(v2, dict) assert list(v2.keys()) == ["weight"] + # Test that numeric values out of edges can be read + res = NetworkXLoader.load_into_networkx( + karate_db_name, + { + "vertexCollections": {"person": {"club"}}, + "edgeCollections": {"knows": {"weight"}}, + }, + [connection_information["url"]], + username=connection_information["username"], + password=connection_information["password"], + load_adj_dict=True, + load_coo=True, + load_all_vertex_attributes=False, + load_all_edge_attributes=False, + is_directed=False, + is_multigraph=False, + ) + + _, _, _, _, _, _, edge_values = res + + assert isinstance(edge_values, dict) + assert "weight" in edge_values + assert isinstance(edge_values["weight"], list) + assert len(edge_values["weight"]) == 78 + assert all(isinstance(x, (int, float)) for x in edge_values["weight"]) + + # Test that non-numeric read of edge values will fail + # -> In this case, strings are being tested. + with pytest.raises(PhenolError) as e: + NetworkXLoader.load_into_networkx( + karate_db_name, + { + "vertexCollections": {"person": {"club"}}, + # Selecting _key here as this is guaranteed to be a string + "edgeCollections": {"knows": {"_key"}}, + }, + [connection_information["url"]], + username=connection_information["username"], + password=connection_information["password"], + load_adj_dict=True, + load_coo=True, + load_all_vertex_attributes=False, + load_all_edge_attributes=False, + is_directed=False, + is_multigraph=False, + ) + assert "Could not insert edge" in str(e) + assert "Edge data must be a numeric value" in str(e) + + +def test_coo_edge_values_networkx( + load_line_graph: None, + custom_graph_db_name: str, + connection_information: dict[str, str], +) -> None: + # Non-numeric: Booleans + with pytest.raises(PhenolError) as e: + NetworkXLoader.load_into_networkx( + custom_graph_db_name, + { + "vertexCollections": {"line_graph_vertices": set()}, + "edgeCollections": {"line_graph_edges": {"boolean_weight"}}, + }, + [connection_information["url"]], + username=connection_information["username"], + password=connection_information["password"], + load_adj_dict=False, + load_coo=True, + load_all_vertex_attributes=False, + load_all_edge_attributes=False, + is_directed=False, + is_multigraph=False, + ) + assert "Could not insert edge" in str(e) + assert "Edge data must be a numeric value" in str(e) + + # Numeric: Ints + res = NetworkXLoader.load_into_networkx( + custom_graph_db_name, + { + "vertexCollections": {"line_graph_vertices": set()}, + "edgeCollections": {"line_graph_edges": {"int_value"}}, + }, + [connection_information["url"]], + username=connection_information["username"], + password=connection_information["password"], + load_adj_dict=False, + load_coo=True, + load_all_vertex_attributes=False, + load_all_edge_attributes=False, + is_directed=False, + is_multigraph=False, + ) + _, _, _, _, _, _, edge_values = res + + assert isinstance(edge_values, dict) + assert "int_value" in edge_values + assert isinstance(edge_values["int_value"], list) + assert len(edge_values["int_value"]) == 4 + assert all(isinstance(x, float) for x in edge_values["int_value"]) + + # Numeric: Floats + res = NetworkXLoader.load_into_networkx( + custom_graph_db_name, + { + "vertexCollections": {"line_graph_vertices": set()}, + "edgeCollections": {"line_graph_edges": {"float_value"}}, + }, + [connection_information["url"]], + username=connection_information["username"], + password=connection_information["password"], + load_adj_dict=False, + load_coo=True, + load_all_vertex_attributes=False, + load_all_edge_attributes=False, + is_directed=False, + is_multigraph=False, + ) + _, _, _, _, _, _, edge_values = res + + assert isinstance(edge_values, dict) + assert "float_value" in edge_values + assert isinstance(edge_values["float_value"], list) + assert len(edge_values["float_value"]) == 4 + assert all(isinstance(x, float) for x in edge_values["float_value"]) + def test_multigraph_networkx( load_multigraph: None, @@ -426,6 +577,7 @@ def test_multigraph_networkx( dst_indices, edge_indices, _, + _, # edge_values ) = res assert list(src_indices) == [0, 1, 0, 1, 1, 2, 2, 3, 2, 3] @@ -454,6 +606,7 @@ def test_multigraph_networkx( dst_indices, edge_indices, _, + _, # edge_values ) = res assert list(src_indices) == [0, 0, 1, 2, 2] diff --git a/src/graph.rs b/src/graph.rs index 63a3b87..fafb3b3 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -1,5 +1,6 @@ use serde_json::{Map, Value}; use std::collections::HashMap; +use std::hash::Hash; use std::sync::{Arc, RwLock}; use anyhow::{anyhow, Result}; @@ -81,7 +82,7 @@ pub struct NetworkXGraph { pub vertex_id_to_index: HashMap, // e.g {'user/1': 0, 'user/2': 1, ...} pub edge_indices: Vec, //only for multi(di)graph pub edge_index_map: HashMap<(String, String), usize>, //only for multi(di)graph - // pub edge_values: HashMap>, // {'weight': [4, 5, 1], ...)} + pub edge_values: HashMap>, // {'weight': [4, 5, 1], ...)} // pre-defined functions get_vertex_properties_fn: @@ -90,7 +91,7 @@ pub struct NetworkXGraph { get_edge_properties_fn: fn(&mut NetworkXGraph, String, String, Vec, &Vec) -> Map, - insert_coo_fn: fn(&mut NetworkXGraph, String, String), + insert_coo_fn: fn(&mut NetworkXGraph, String, String, HashMap), insert_adj_fn: fn(&mut NetworkXGraph, String, String, Map), } @@ -188,7 +189,7 @@ impl NetworkXGraph { vertex_id_to_index: HashMap::new(), edge_indices: vec![], edge_index_map: HashMap::new(), - // edge_values: HashMap::new(), + edge_values: HashMap::new(), get_vertex_properties_fn, get_edge_properties_fn, insert_coo_fn, @@ -301,11 +302,20 @@ impl NetworkXGraph { (from_id_index, to_id_index) } + fn store_edge_properties(&mut self, properties: HashMap) { + for (key, value) in properties { + if !self.edge_values.contains_key(&key) { + self.edge_values.insert(key.clone(), vec![]); + } + self.edge_values.get_mut(&key).unwrap().push(value); + } + } + fn insert_coo_graph( &mut self, from_id_str: String, to_id_str: String, - // _properties: Map, + properties: HashMap, ) { let (from_id_index, to_id_index) = self.get_from_and_to_id_index(from_id_str, to_id_str); @@ -314,25 +324,28 @@ impl NetworkXGraph { self.coo.0.push(to_id_index); self.coo.1.push(from_id_index); + + self.store_edge_properties(properties); } fn insert_coo_digraph( &mut self, from_id_str: String, to_id_str: String, - // _properties: Map, + properties: HashMap, ) { let (from_id_index, to_id_index) = self.get_from_and_to_id_index(from_id_str, to_id_str); self.coo.0.push(from_id_index); self.coo.1.push(to_id_index); + self.store_edge_properties(properties); } fn insert_coo_multigraph( &mut self, from_id_str: String, to_id_str: String, - // _properties: Map, + properties: HashMap, ) { let (from_id_index, to_id_index) = self.get_from_and_to_id_index(from_id_str.clone(), to_id_str.clone()); @@ -364,13 +377,15 @@ impl NetworkXGraph { self.coo.0.push(to_id_index); self.coo.1.push(from_id_index); self.edge_indices.push(edge_index); + + self.store_edge_properties(properties); } fn insert_coo_multidigraph( &mut self, from_id_str: String, to_id_str: String, - // _properties: Map, + properties: HashMap, ) { let (from_id_index, to_id_index) = self.get_from_and_to_id_index(from_id_str.clone(), to_id_str.clone()); @@ -394,6 +409,8 @@ impl NetworkXGraph { self.coo.0.push(from_id_index); self.coo.1.push(to_id_index); self.edge_indices.push(edge_index); + + self.store_edge_properties(properties); } fn insert_adj_graph( @@ -723,7 +740,20 @@ impl Graph for NetworkXGraph { let to_id_str: String = String::from_utf8(to_id.clone()).unwrap(); if self.load_coo { - (self.insert_coo_fn)(self, from_id_str.clone(), to_id_str.clone()); + let mut properties: HashMap = HashMap::new(); + for (field_position, field_name) in field_names.iter().enumerate() { + if field_name == "@collection_name" { + continue; + } + let field_vec = match columns[field_position].as_f64() { + Some(v) => v, + _ => return Err(anyhow!("Edge data must be a numeric value")), + }; + + properties.insert(field_name.clone(), field_vec); + } + + (self.insert_coo_fn)(self, from_id_str.clone(), to_id_str.clone(), properties); } if self.load_adj_dict { diff --git a/src/lib.rs b/src/lib.rs index f90ac5c..1b8cb4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,12 +72,13 @@ fn graph_to_networkx_format( request: DataLoadRequest, graph_config: NetworkXGraphConfig, ) -> PyResult<( - &PyDict, - &PyDict, - &PyArray1, - &PyArray1, - &PyArray1, - &PyDict, + &PyDict, // node_dict + &PyDict, // adj_dict + &PyArray1, // src_indices + &PyArray1, // dst_indices + &PyArray1, // edge_indices + &PyDict, // vertex_id_to_index + &PyDict, // edge_values )> { let load_all_vertex_attributes = request.load_config.load_all_vertex_attributes; let load_all_edge_attributes = request.load_config.load_all_edge_attributes; @@ -95,7 +96,11 @@ fn graph_to_networkx_format( }; println!("Retrieving NetworkX Graph..."); - let graph = load::retrieve::get_arangodb_graph(request, graph_factory).unwrap(); + let graph_res = load::retrieve::get_arangodb_graph(request, graph_factory); + if let Err(e) = graph_res { + return Err(PhenolError::new_err(e.to_string())); + } + let graph = graph_res.unwrap(); println!("Retrieved. Building python objects..."); let node_dict = construct::construct_node_dict(graph.node_map, py)?; @@ -119,6 +124,7 @@ fn graph_to_networkx_format( let dst_indices = PyArray1::from_vec(py, coo.1); let edge_indices = PyArray1::from_vec(py, graph.edge_indices); let vertex_id_to_index = construct::construct_vertex_id_to_index(graph.vertex_id_to_index, py)?; + let edge_values = construct::construct_edge_value_dict(graph.edge_values, py)?; let res = ( node_dict, @@ -127,6 +133,7 @@ fn graph_to_networkx_format( dst_indices, edge_indices, vertex_id_to_index, + edge_values, ); Ok(res) diff --git a/src/output/construct.rs b/src/output/construct.rs index ded1e23..efa4680 100644 --- a/src/output/construct.rs +++ b/src/output/construct.rs @@ -75,6 +75,20 @@ pub fn construct_vertex_id_to_index( Ok(pydict) } +pub fn construct_edge_value_dict( + input: HashMap>, + py: Python, +) -> PyResult<&PyDict> { + let pydict = PyDict::new(py); + + for (key, value) in input.iter() { + let py_value = value; + pydict.set_item(key, py_value)?; + } + + Ok(pydict) +} + #[cfg(not(test))] /// { /// "node/1": {property_key: property_value},