From 29e5ac9f45fb9c11c885f4c3ce6046bf217d9289 Mon Sep 17 00:00:00 2001 From: Christopher Schmidt Date: Tue, 26 Jul 2022 10:10:48 +0200 Subject: [PATCH 1/2] add normalization as (col(contVar) - col(contVar).mean()) / col(contVar).std() --- README.md | 1 + manm_cs/__main__.py | 4 ++++ manm_cs/graph/graph.py | 13 ++++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0bc4440..a8c1c0a 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ python3 -m twine upload dist/* | beta_lower_limit | (0, Inf) | 0.5 | Lower limit for beta values for influence of continuous parents. Betas are sampled uniform from the union of [-upper,-lower] and [lower,upper]. Upper limit see below. | | beta_upper_limit | (0, Inf) | 1 | Upper limit for beta values for influence of continuous parents. Betas are sampled uniform from the union of [-upper,-lower] and [lower,upper]. Lower limit see above. | | graph_structure_file | | None | Defines a path to a .gml file for a fixed DAG structure (ignoring node and edge characteristics) used during manm_cs graph building. Note graph_structure_file is mutually exclusive to num_nodes and edge_density. | +| normalize | 0 or 1 | 0 | Defines whether the generated samples of continuous variables are normalized, i.e. (x - mean) / std per variable, once all samples are generated. | ## License diff --git a/manm_cs/__main__.py b/manm_cs/__main__.py index 90db02f..ec5a181 100644 --- a/manm_cs/__main__.py +++ b/manm_cs/__main__.py @@ -124,6 +124,8 @@ def parse_args(): parser.add_argument('--output_samples_file', type=str, required=False, default=SAMPLES_FILE, help='Output file (path) for the generated samples csv. Relative to the directory from which the library is executed.' 
'Specify without file extension.') + parser.add_argument('--normalize', type=to_bool, required=False, default=False, + help='Normalize the continuous variables in the dataset once all samples are generated.') args = parser.parse_args() assert args.min_discrete_value_classes <= args.max_discrete_value_classes, \ @@ -171,6 +173,8 @@ def graph_from_args(args) -> Graph: graph = graph_from_args(args) dfs = graph.sample(num_observations=args.num_samples, num_processes=args.num_processes) + if args.normalize: + dfs = graph.normalize_continous_columns(dataframes=dfs) write_single_csv(dataframes=dfs, target_path=f"{args.output_samples_file}.csv") nx_graph = graph.to_networkx_graph() diff --git a/manm_cs/graph/graph.py b/manm_cs/graph/graph.py index 170e0f6..d0f76ff 100644 --- a/manm_cs/graph/graph.py +++ b/manm_cs/graph/graph.py @@ -6,7 +6,7 @@ import numpy as np from pathos.multiprocessing import ProcessingPool -from manm_cs.variables.variable import Variable +from manm_cs.variables.variable import Variable, VariableType class Graph: @@ -50,3 +50,14 @@ def to_networkx_graph(self) -> nx.DiGraph: for parent in var.parents: nx_graph.add_edge(parent.idx, var.idx) return nx_graph + + def normalize_continous_columns(self, dataframes: List[pd.DataFrame]) -> List[pd.DataFrame]: + + # merge df in dataframes + merged_df = pd.concat(dataframes, axis=1) + for variable in self.variables: + if variable.type == VariableType.CONTINUOUS: + print(variable.idx, merged_df[variable.idx].mean(), merged_df[variable.idx].std()) + merged_df[variable.idx] = (merged_df[variable.idx] - merged_df[variable.idx].mean())/ merged_df[variable.idx].std() + + return [merged_df] From c2e75ac45b0f744dce75f80b44da407eee630458 Mon Sep 17 00:00:00 2001 From: Christopher Hagedorn Date: Tue, 26 Jul 2022 10:29:47 +0200 Subject: [PATCH 2/2] bump version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index bf349db..cfdc278 100644 --- a/setup.cfg +++ b/setup.cfg @@ 
-1,6 +1,6 @@ [metadata] name = manm_cs -version = 0.1.1 +version = 0.1.2 author = Johannes Huegle, Christopher Hagedorn, Lukas Boehme, Mats Poerschke, Jonas Umland author_email = johannes.huegle@hpi.de description = Data generation for causal structure learning based on mixed additive noise model (MANM)