privategsd, installation needed before usage
sasha-tsepilova committed Nov 30, 2023
1 parent 809112d commit c84ed49
Showing 6 changed files with 421 additions and 2 deletions.
4 changes: 3 additions & 1 deletion SynRD/synthesizers/__init__.py
@@ -1,8 +1,10 @@
 from .synthesizer import Synthesizer, MSTSynthesizer, PATECTGAN, PrivBayes, PacSynth, AIMSynthesizer
+from .gsd_synth import GsdSynthesizer

 __all__ = ["Synthesizer",
            "MSTSynthesizer",
            "PATECTGAN",
            "PrivBayes",
            "PacSynth",
-           "AIMSynthesizer"]
+           "AIMSynthesizer",
+           "GsdSynthesizer"]
242 changes: 242 additions & 0 deletions SynRD/synthesizers/gsd_synth.py
@@ -0,0 +1,242 @@
from synthesizer import Synthesizer
import pandas as pd
# GSD, its statistics modules, and the Dataset/Domain helpers come from the
# private-gsd package, which must be installed before use (see commit message).
from models import GSD
from stats import ChainedStatistics, Marginals
from jax.random import KeyArray
from src.utils import Dataset, Domain
import numpy as np
from snsynth.transform.table import TableTransformer
class GsdSynthesizer(Synthesizer):
    """
    Genetic algorithm synthesizer.

    Parameters
    ----------
    epsilon : float
        Privacy budget for the synthesizer.

    Optional keyword arguments
    --------------------------
    slide_range : bool = False
        Specifies whether the slide-range transformation should be applied; this
        shifts each column so that its minimal value is 0 before fitting.
    thresh : float = 0.05
        The maximum ratio of unique values to column length for a column to be
        treated as categorical.
    delta : float = 1e-09
        Privacy parameter; should be small, on the order of 1/(n * sqrt(n)).
    num_generations : int = 20000
        Total number of generations to run the algorithm for.
    data_size : int = 2000
        The size of the resulting dataframe.
    population_size_muta : int = 50
        Mutation population size.
    population_size_cross : int = 50
        Crossover population size.
    population_size : int = None
        Total size of the population. If None, it is taken as the sum of the
        mutation and crossover populations. Otherwise, the mutation and
        crossover population sizes are each derived as half of the total.
    muta_rate : int = 1
        The number of rows altered by the mutation operation.
    mate_rate : int = 1
        The number of rows altered by the crossover operation.
    print_progress : bool = False
        Specifies whether additional information should be printed.
    stop_early : bool = True
        Specifies whether the early-stopping mechanism should be applied.
    stop_early_gen : int = None
        The number of generations between early-stopping checks. If this value
        is set to k, the early-stopping condition is checked on every k-th
        generation. Setting it to None behaves the same as setting it to the
        data_size value.
    stop_early_threshold : float = 0
        The minimal fitness score the generation's best candidate must reach
        for the algorithm to continue.
    sparse_statistics : bool = False
        Specifies whether sparse statistics should be used.
    """
    def __init__(
        self,
        epsilon: float = None,
        slide_range: bool = None,
        thresh: float = None,
        delta: float = None,
        num_generations: int = None,
        data_size: int = None,
        population_size_muta: int = None,
        population_size_cross: int = None,
        population_size: int = None,
        muta_rate: int = None,
        mate_rate: int = None,
        print_progress: bool = None,
        stop_early: bool = None,
        stop_early_gen: int = None,
        stop_early_threshold: float = None,
        sparse_statistics=False,
        **synth_kwargs,
    ):
        super().__init__(epsilon, slide_range, thresh)
        allowed_additional_params = {
            "delta", "num_generations", "data_size", "population_size_muta",
            "population_size_cross", "population_size", "muta_rate",
            "mate_rate", "print_progress", "stop_early", "stop_early_gen",
            "stop_early_threshold", "sparse_statistics",
        }
        for param in synth_kwargs.keys():
            if param not in allowed_additional_params:
                raise ValueError(
                    f"Parameter '{param}' is not available for this type of synthesizer."
                )

        # Expected type and default value for every tunable parameter.
        param_defaults = {
            "delta": (float, 1e-9),
            "num_generations": (int, 20000),
            "data_size": (int, 2000),
            "population_size_muta": (int, 50),
            "population_size_cross": (int, 50),
            "population_size": (int, None),
            "muta_rate": (int, 1),
            "mate_rate": (int, 1),
            "print_progress": (bool, False),
            "stop_early": (bool, True),
            "stop_early_gen": (int, None),
            "stop_early_threshold": (float, 0.0),
            "sparse_statistics": (bool, False),
        }

        # Validate each constructor argument against its expected type and fall
        # back to the default when the argument was left as None.
        for param, (param_type, default_value) in param_defaults.items():
            param_value = locals().get(param)
            if param_value is not None:
                if type(param_value) is int and param_type is float:
                    param_value = float(param_value)
                if isinstance(param_type, tuple):
                    correctly_typed = False
                    for single_type in param_type:
                        if type(param_value) is single_type:
                            correctly_typed = True
                    if not correctly_typed:
                        raise TypeError(
                            f"{param} must be of one of the types "
                            f"{', '.join(t.__name__ for t in param_type)}, "
                            f"got {type(param_value).__name__}."
                        )
                elif type(param_value) is not param_type:
                    raise TypeError(
                        f"{param} must be of type {param_type.__name__}, "
                        f"got {type(param_value).__name__}."
                    )
                setattr(self, param, param_value)
            else:
                setattr(self, param, default_value)

        self.synth_kwargs = synth_kwargs

    def _get_train_data(self, data, *ignore, style, transformer, categorical_columns,
                        ordinal_columns, continuous_columns, nullable, preprocessor_eps):
        if transformer is None or isinstance(transformer, dict):
            self._transformer = TableTransformer.create(
                data,
                style=style,
                categorical_columns=categorical_columns,
                continuous_columns=continuous_columns,
                ordinal_columns=ordinal_columns,
                nullable=nullable,
                constraints=transformer,
            )
        elif isinstance(transformer, TableTransformer):
            self._transformer = transformer
        else:
            raise ValueError("transformer must be a TableTransformer object, a dictionary or None.")
        if not self._transformer.fit_complete:
            if self._transformer.needs_epsilon and (preprocessor_eps is None or preprocessor_eps == 0.0):
                raise ValueError(
                    "Transformer needs some epsilon to infer bounds. If you know the "
                    "bounds, pass them in to save budget. Otherwise, set preprocessor_eps "
                    "to a value > 0.0 and less than the training epsilon. Preprocessing "
                    "budget will be subtracted from training budget."
                )
            self._transformer.fit(data, epsilon=preprocessor_eps)
            eps_spent, _ = self._transformer.odometer.spent
            if eps_spent > 0.0:
                self.epsilon -= eps_spent
                print(f"Spent {eps_spent} epsilon on preprocessor, leaving {self.epsilon} for training")
                if self.epsilon < 10E-3:
                    raise ValueError("Epsilon remaining is too small!")
        train_data = self._transformer.transform(data)
        return train_data


    def fit(self,
            key: KeyArray,
            df: pd.DataFrame,
            tolerance: float = 0.0,
            adaptive_epoch=1,
            transformer=None,
            categorical_columns=[],
            ordinal_columns=[],
            continuous_columns=[],
            preprocessor_eps=0.0,
            nullable=False):

        if type(df) is pd.DataFrame:
            self.original_column_names = df.columns

        # GSD works on discrete data only: every column must be recognised as
        # categorical/ordinal before fitting.
        categorical_check = (len(self._categorical_continuous(df)['categorical']) == len(list(df.columns)))
        if not categorical_check:
            raise ValueError('Please make sure that GSD gets categorical/ordinal '
                             'features only. If you are sure you only passed '
                             'categorical features, increase the `thresh` parameter.')
        df = self._slide_range(df)

        train_data = self._get_train_data(
            df,
            style='cube',
            transformer=transformer,
            categorical_columns=categorical_columns,
            ordinal_columns=ordinal_columns,
            continuous_columns=continuous_columns,
            nullable=nullable,
            preprocessor_eps=preprocessor_eps
        )
        if self.print_progress:  # debug output, gated behind print_progress
            print(train_data)

        if self._transformer is None:
            raise ValueError("We weren't able to fit a transformer to the data. Please check your data and try again.")

        cards = self._transformer.cardinality
        if any(c is None for c in cards):
            raise ValueError("The transformer appears to have some continuous columns. Please provide only categorical or ordinal.")

        colnames = ["col" + str(i) for i in range(self._transformer.output_width)]

        if len(cards) != len(colnames):
            raise ValueError("Cardinality and column names must be the same length.")

        domain = Domain(colnames, cards)
        if self.print_progress:
            print(colnames)
        data = pd.DataFrame(train_data, columns=colnames)
        data = Dataset(df=data, domain=domain)
        # Fit all 2-way marginal statistics; these are the queries GSD tries to preserve.
        marginal_module2 = Marginals.get_all_kway_combinations(data.domain, k=2, bins=[2, 4, 8, 16, 32])
        stat_module = ChainedStatistics([marginal_module2])
        stat_module.fit(data)
        if self.print_progress:
            print(stat_module)
        # Note: 'stop_eary_threshold' is passed verbatim; the keyword must match
        # the spelling in the GSD constructor it is handed to.
        self.synthesizer = GSD(num_generations=self.num_generations,
                               domain=domain,
                               data_size=self.data_size,
                               population_size_muta=self.population_size_muta,
                               population_size_cross=self.population_size_cross,
                               population_size=self.population_size,
                               muta_rate=self.muta_rate,
                               mate_rate=self.mate_rate,
                               print_progress=self.print_progress,
                               stop_early=self.stop_early,
                               stop_early_gen=self.stop_early_gen,
                               stop_eary_threshold=self.stop_early_threshold)
        self.res_df = self.synthesizer.fit(key, stat_module, data, tolerance, adaptive_epoch)

    def sample(self, n):
        assert n == self.data_size, "Can only sample the same amount as the data_size provided during initialization."
        assert self.res_df is not None, "Please fit the synthesizer first."
        df = self._unslide_range(self.res_df)
        return df
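The docstring recommends a delta on the order of 1/(n * sqrt(n)); the test notebook below instead uses 1/n^2, a stricter (smaller) choice. A small hypothetical helper, not part of the commit, that applies the docstring's rule of thumb:

    import math

    def suggest_delta(n_rows: int) -> float:
        # Docstring rule of thumb: delta ~ 1 / (n * sqrt(n)) = n ** -1.5
        return 1.0 / (n_rows * math.sqrt(n_rows))

    suggest_delta(10_000)  # -> 1e-06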
102 changes: 102 additions & 0 deletions SynRD/synthesizers/gsd_test.ipynb
@@ -0,0 +1,102 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.\n",
      "/home/sasha-tsepilova/miniconda3/envs/synrd1/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'PNGKey' from 'jax.random' (/home/sasha-tsepilova/miniconda3/envs/synrd1/lib/python3.9/site-packages/jax/random.py)",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "ImportError                               Traceback (most recent call last)",
      "/home/sasha-tsepilova/SynRD/SynRD/synthesizers/gsd_test.ipynb Cell 1, line 5",
      "      3 from gsd_synth import GsdSynthesizer",
      "      4 from SynRD.utils import save_synthesizer, load_synthesizer, do_binning, unbin_df",
      "----> 5 from jax.random import PNGKey",
      "ImportError: cannot import name 'PNGKey' from 'jax.random' (/home/sasha-tsepilova/miniconda3/envs/synrd1/lib/python3.9/site-packages/jax/random.py)"
     ]
    }
   ],
   "source": [
    "\n",
    "from SynRD.papers import Iverson22Football, Pierce2019Who\n",
    "from SynRD.benchmark import Benchmark\n",
    "from gsd_synth import GsdSynthesizer\n",
    "from SynRD.utils import save_synthesizer, load_synthesizer, do_binning, unbin_df\n",
    "from jax.random import PRNGKey"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "benchmark = Benchmark()\n",
    "papers = [Iverson22Football]\n",
    "papers = benchmark.initialize_papers(papers)\n",
    "transforms = {}\n",
    "for paper in papers:\n",
    "    df, transform = do_binning(paper.real_dataframe)\n",
    "    transforms[paper.__class__.__name__.lower()] = transform\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delta = 1.0 / len(df) ** 2\n",
    "gsd = GsdSynthesizer(print_progress=True,\n",
    "                     stop_early=True,\n",
    "                     delta=delta,\n",
    "                     num_generations=2000,\n",
    "                     population_size_muta=50,\n",
    "                     population_size_cross=50,\n",
    "                     data_size=2000)\n",
    "\n",
    "seed = 0\n",
    "key = PRNGKey(seed)\n",
    "sync_data = gsd.fit(key, df)\n",
    "\n",
    "synth_df = gsd.sample(2000)\n",
    "df.describe()\n",
    "synth_df.describe()\n",
    "paper.set_synthetic_dataframe(synth_df)\n",
    "benchmark.eval_soft_findings_each_finding(paper, 5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "synrd1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
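Note that the stored error output predates the committed source: the failing run imported PNGKey, while the first cell now correctly imports PRNGKey. Key handling follows the standard JAX pattern; a minimal sketch:

    from jax.random import PRNGKey, split

    key = PRNGKey(0)          # deterministic root key from an integer seed
    key, subkey = split(key)  # derive an independent subkey per stochastic call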