diff --git a/.gitignore b/.gitignore index c34fe5df..671c67da 100644 --- a/.gitignore +++ b/.gitignore @@ -135,6 +135,7 @@ logs/ archive/ data/ *.out +outputs # generated dot files and tree graphs .gv diff --git a/run_elm.py b/run_elm.py index 64cdab21..6bb8274a 100644 --- a/run_elm.py +++ b/run_elm.py @@ -1,8 +1,8 @@ """ This module gives an example of how to run the main ELM class. -It uses the hydra library to load the config from the -config/elm_sodarace_cfg.yaml file. +It uses the hydra library to load the config from the config dataclasses in +configs.py. This config file demonstrates an example of running ELM with the Sodarace environment, a 2D physics-based environment in which robots specified by @@ -10,27 +10,23 @@ """ import hydra -from hydra.core.config_store import ConfigStore from omegaconf import OmegaConf from openelm import ELM -from openelm.configs import SodaraceELMConfig -cs = ConfigStore.instance() -cs.store(name="config", node=SodaraceELMConfig) - -# Load hydra config from yaml files and command line arguments. @hydra.main( - config_name="config", + config_name="elmconfig", version_base="1.2", ) -def main(cfg): +def main(config): print("----------------- Config ---------------") - print(OmegaConf.to_yaml(cfg)) + print(OmegaConf.to_yaml(config)) print("----------------- End -----------------") - elm = ELM(cfg) - print("Best Individual: ", elm.run()) + config = OmegaConf.to_object(config) + elm = ELM(config) + print("Best Individual: ", elm.run(init_steps=config.qd.init_steps, + total_steps=config.qd.total_steps)) if __name__ == "__main__": diff --git a/run_p3.py b/run_p3.py new file mode 100644 index 00000000..cfa38fd6 --- /dev/null +++ b/run_p3.py @@ -0,0 +1,176 @@ +import logging +import pathlib +import requests +import time +import json +from collections import Counter + +from openelm.environments import p3_long_init_args, p3_med_init_args, P3Problem +from openelm.mutation_model import DiffModel, MutationModel, PromptModel +from openelm.configs import P3Config +from openelm.sandbox.server.sandbox_codex_execute import ExecResult +from openelm.utils.code_eval import pass_at_k +from openelm.codegen.codegen_utilities import set_seed + +import hydra +from omegaconf import OmegaConf + + +class P3: + def __init__(self, cfg: P3Config) -> None: + """ + Evaluate models on P3 dataset + """ + self.cfg: P3Config = cfg + + # Prompt size + if cfg.env.prompt_size == 'long': + env_args = p3_long_init_args + elif cfg.env.prompt_size == 'med': + env_args = p3_med_init_args + else: + raise ValueError('No init args found') + + # Model + if self.cfg.model.model_name == 'prompt': + self.mutation_model: MutationModel = PromptModel(self.cfg.model) + elif self.cfg.model.model_name == 'diff': + self.mutation_model: MutationModel = DiffModel(self.cfg.model) + + self.seed = env_args["seed"] + self.log_dir = 'logs/p3/problems' + + + def run(self): + """ + Query PromptMutationModelForP3 for solutions to programming puzzles + """ + # Get problems + problems = requests.get("https://raw.githubusercontent.com/microsoft/PythonProgrammingPuzzles/v0.2/puzzles/puzzles.json").json() + run_start_time = time.time() + num_problem_errors = 0 + for problem in problems: + problem_start_time = time.time() + problem_dict = {'name': problem['name']} + logging.info(problem['name']) + + problem['problem_func'] = problem['sat'].replace('def sat(', 'def f6(') # prompt form is f6() + problem['solution_preamble'] = problem['sol_header'].replace('def sol(', 'def g6(') # solution form is g6() + if self.cfg.env.prompt_size 
== 'long': + problem['solution_preamble'] = problem['solution_preamble'] + '\n' + problem['sol_docstring'] + + env = P3Problem(seed=self.seed, + config=self.cfg, + mutation_model=self.mutation_model, + problem_func=problem['problem_func'], + solution_preamble=problem['solution_preamble'], + ans_type = problem['ans_type']) + + # Find solutions + # If there is an error during finding a solution, log it and skip this problem + solutions = [] + try: + for i in range(self.cfg.env.solutions_per_problem // self.cfg.model.batch_size): + set_seed(i) # Change seed for each query + + try: + solutions += env.random() + except Exception as e: + logging.error(f'ERROR with solution {i} in {problem["name"]}: {e}') + num_problem_errors += 1 + raise(e) + except Exception as e: + continue + + # Evaluate fitness of solutions + res_sols_list = [] + solved = False + for sol in solutions: + res_sol_dict = {} + res_sol_dict['program_str'] = sol.program_str + + if isinstance(sol.result_obj, ExecResult): + if self.cfg.save_result_obj: res_sol_dict['result_obj'] = sol.result_obj.name + fitness = 0.0 + else: + if self.cfg.save_result_obj: res_sol_dict['result_obj'] = sol.result_obj + fitness = env.fitness(sol) + + res_sol_dict['fitness'] = fitness + res_sols_list.append(res_sol_dict) + if not solved and fitness == 1.0: + solved = True # just want to save if solved at all + + problem_dict['config'] = OmegaConf.to_container(self.cfg) + problem_dict['solutions'] = res_sols_list + problem_dict['solved'] = solved + problem_dict['time_elapsed'] = time.time() - problem_start_time + + # Save results + dir = f'{self.log_dir}/{problem_dict["name"]}/{run_start_time}' + pathlib.Path(dir).mkdir(parents=True, exist_ok=True) + + with open(f'{dir}/results.json', 'w') as file: + file.write(json.dumps(problem_dict)) + + logging.info(f'Successfully ran on {len(problems)}/{len(problems)-num_problem_errors}' + + f' problems and saved results to {self.log_dir}') + + + def eval_pass_at_k(self, timestamp: str, k: int): + """ + pass@k metric over a subset of run logs + + Args: + timestamp (str): (optional) go through all problems with a run generated with timestamp + (if None, go through the latest run for every problem currently in logs) + k (int): k for pass@k + """ + + path = pathlib.Path(self.log_dir) + problem_paths = sorted(list(path.iterdir())) # Get all logged problems + paks = [] + for p in problem_paths: + n = 0 + c = 0 + # Select one of the runs per problem + if len(timestamp) == 0: + # Get latest run + path = pathlib.Path(p) + run_paths = sorted(list(path.iterdir())) # Get all the runs per problem + run_path = run_paths[-1] + else: + # Get 'timestamp' run + run_path = p / timestamp + + with open(f'{run_path}/results.json', 'r') as f: + results = json.load(f) + n += len(results['solutions']) + c += Counter(sol['fitness'] for sol in results['solutions'])[1.0] + + pak = pass_at_k(n=n, c=c, k=k) + paks.append(pak) + + pak_overall = sum(paks) / len(paks) + return pak_overall + + +# Load hydra config from yaml files and command line arguments. 
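For reference, eval_pass_at_k above tallies, per problem, the number of generated solutions n and the number that reach fitness 1.0 c, then delegates to openelm.utils.code_eval.pass_at_k. That helper is assumed here to follow the standard unbiased pass@k estimator, 1 - C(n-c, k)/C(n, k); the sketch below is only an illustration and pass_at_k_sketch is a hypothetical name, not part of the codebase.

import numpy as np

def pass_at_k_sketch(n: int, c: int, k: int) -> float:
    # Unbiased estimate of the probability that at least one of k samples,
    # drawn from n generations of which c are correct, solves the puzzle.
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

The per-problem values are averaged and logged by main() when cfg.eval_k > 0. The Hydra entry point for run_p3.py follows.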
+@hydra.main( + config_name="p3config", + version_base="1.2", +) +def main(cfg): + # Run + logging.info("----------------- Config ---------------") + logging.info(OmegaConf.to_yaml(cfg)) + logging.info("----------------- End -----------------") + p3 = P3(cfg) + + if cfg.eval_k > 0: logging.info(f"PASS@K: {p3.eval_pass_at_k(timestamp=cfg.eval_timestamp, k=cfg.eval_k)}") + else: p3.run() + + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/src/openelm/__init__.py b/src/openelm/__init__.py index 731e512f..19ead3a9 100644 --- a/src/openelm/__init__.py +++ b/src/openelm/__init__.py @@ -3,3 +3,5 @@ from openelm.elm import ELM __version__ = importlib_version("openelm") + +__all__ = ["ELM"] diff --git a/src/openelm/benchmarks/benchmark_bugs.py b/src/openelm/benchmarks/benchmark_bugs.py index f0c99ca2..6b4aaa8d 100644 --- a/src/openelm/benchmarks/benchmark_bugs.py +++ b/src/openelm/benchmarks/benchmark_bugs.py @@ -13,8 +13,7 @@ from openelm.codegen import model_setup, sample, truncate from openelm.configs import BaseConfig -from openelm.utils.code_eval import eval_completions, mutate_code -from openelm.utils.diff_eval import apply_diff, split_diff +from openelm.utils import apply_diff, eval_completions, mutate_code, split_diff @dataclass @@ -54,7 +53,7 @@ def __init__(self, cfg: BenchmarkBugsConfig): os.environ["TOKENIZERS_PARALLELISM"] = "false" - self.device = torch.device("cuda" if cfg.cuda else "cpu") + self.device = torch.device("cuda") self.model, self.tokenizer, self.device = model_setup(cfg, self.device) def benchmark_parity(self, n_bugs, **kwargs): diff --git a/src/openelm/benchmarks/benchmark_crossover.py b/src/openelm/benchmarks/benchmark_crossover.py index d4bf8d60..ab23e226 100644 --- a/src/openelm/benchmarks/benchmark_crossover.py +++ b/src/openelm/benchmarks/benchmark_crossover.py @@ -86,7 +86,7 @@ def __init__(self, cfg: BenchmarkCrossoverConfig): os.environ["TOKENIZERS_PARALLELISM"] = "false" - self.device = torch.device("cuda" if cfg.cuda else "cpu") + self.device = torch.device("cuda") self.model, self.tokenizer, self.device = model_setup(cfg, self.device) def construct_prompt(self, seeds): diff --git a/src/openelm/benchmarks/benchmark_lm_speed.py b/src/openelm/benchmarks/benchmark_lm_speed.py index 857dd2e7..ef3c73f7 100644 --- a/src/openelm/benchmarks/benchmark_lm_speed.py +++ b/src/openelm/benchmarks/benchmark_lm_speed.py @@ -8,11 +8,12 @@ from tqdm import trange from openelm.codegen import model_setup, sample +from openelm.configs import BaseConfig from openelm.environments import SQUARE_SEED @dataclass -class BenchmarkSpeedConfig: +class BenchmarkSpeedConfig(BaseConfig): hydra: Any = field( default_factory=lambda: { "run": {"dir": "logs/benchmarks/lm_speed/${now:%Y-%m-%d-%H-%M-%S}"} diff --git a/src/openelm/codegen/codegen_utilities.py b/src/openelm/codegen/codegen_utilities.py index ce39436b..065ad78d 100644 --- a/src/openelm/codegen/codegen_utilities.py +++ b/src/openelm/codegen/codegen_utilities.py @@ -1,11 +1,14 @@ import os import random import re +from typing import Optional import numpy as np import torch from transformers import AutoModelForCausalLM, AutoTokenizer +from openelm.configs import ModelConfig + def set_seed(seed=None, deterministic=True) -> int: if seed is None: @@ -66,37 +69,46 @@ def find_re(string, pattern, start_pos): return completion -def model_setup(cfg, device=None): +def model_setup(cfg: ModelConfig, device=None, codegen_tokenizer: bool = True): set_seed(cfg.seed, deterministic=True) if device is None: - device = 
torch.device("cuda" if cfg.cuda else "cpu") + device = torch.device("cuda") use_fp16 = True if not cfg.fp16 or device.type == "cpu": use_fp16 = False - if cfg.model.startswith("codegen-16B"): + if "codegen-16B" in cfg.model_path: use_fp16 = True - tokenizer = AutoTokenizer.from_pretrained(cfg.model) - tokenizer.padding_side = "left" - tokenizer.pad_token = 50256 + tokenizer = AutoTokenizer.from_pretrained(cfg.model_path) + if codegen_tokenizer: + tokenizer.padding_side = "left" + tokenizer.pad_token = 50256 - model_path = cfg.model if cfg.gpus > 1: model = torch.nn.DataParallel( - create_model(model_path, fp16=use_fp16), device_ids=list(range(cfg.gpus)) + create_model(cfg.model_path, fp16=use_fp16), + device_ids=list(range(cfg.gpus)), ).to(device) else: - model = create_model(model_path, fp16=use_fp16).to(device) + model = create_model(cfg.model_path, fp16=use_fp16).to(device) return model, tokenizer, device def sample( - batch, cfg, model, tokenizer, decode: bool = True, starting_idx=None, **kwargs + batch, + cfg: ModelConfig, + model, + tokenizer, + decode: bool = True, + starting_idx: Optional[int] = None, + num_return_sequences: Optional[int] = None, + **kwargs ) -> list[str]: """Run a model on a batch of contexts for a particular task.""" - batch_size = kwargs.get("batch_size", cfg.batch_size) - device = kwargs.get("device", torch.device("cuda" if cfg.cuda else "cpu")) + if num_return_sequences is None: + num_return_sequences = cfg.batch_size + device = kwargs.get("device", torch.device("cuda")) temperature = kwargs.get("temperature", cfg.temp) top_p = kwargs.get("top_p", cfg.top_p) gen_max_len = kwargs.get("gen_max_len", cfg.gen_max_len) @@ -111,7 +123,7 @@ def sample( tokens = model.module.generate( **batch, do_sample=True, - num_return_sequences=batch_size, + num_return_sequences=num_return_sequences, temperature=temperature, max_new_tokens=gen_max_len, top_p=top_p, @@ -122,7 +134,7 @@ def sample( tokens = model.generate( **batch, do_sample=True, - num_return_sequences=batch_size, + num_return_sequences=num_return_sequences, temperature=temperature, max_new_tokens=gen_max_len, top_p=top_p, diff --git a/src/openelm/configs.py b/src/openelm/configs.py index ec7b282c..4f6512b9 100644 --- a/src/openelm/configs.py +++ b/src/openelm/configs.py @@ -11,91 +11,147 @@ class BaseConfig: @dataclass -class ConfigClass(BaseConfig): - model: str = MISSING - epochs: int = MISSING - batch_size: int = MISSING - fp16: bool = MISSING - cuda: bool = MISSING - gpus: int = MISSING - seed: int = MISSING - deterministic: bool = MISSING - top_p: float = MISSING - temp: float = MISSING - timeout: float = MISSING - gen_max_len: int = MISSING - evo_init_steps: int = MISSING - evo_n_steps: int = MISSING - behavior_n_bins: int = MISSING - evo_history_length: int = MISSING - evaluation_steps: int = MISSING - env_name: str = MISSING - run_name: str = MISSING - - -@dataclass -class SodaraceELMConfig(BaseConfig): - hydra: Any = field( - default_factory=lambda: { - "run": {"dir": "logs/elm/sodarace/${hydra.job.override_dirname}"} - } - ) - model: str = "Salesforce/codegen-350M-mono" - env_name: str = "sodarace" - batch_size: int = 32 +class ModelConfig(BaseConfig): fp16: bool = True cuda: bool = True gpus: int = 1 seed: Optional[int] = None - debug: bool = False deterministic: bool = False top_p: float = 0.95 temp: float = 0.85 - timeout: float = 5.0 # Seconds - eval_ms: int = 1000 # Milliseconds gen_max_len: int = 768 - evo_init_steps: int = 2 - evo_n_steps: int = 5 - behavior_n_bins: int = 12 - evo_history_length: 
int = 1 + batch_size: int = 32 + model_path: str = MISSING # Can be HF model name or path to local model + + +@dataclass +class PromptModelConfig(ModelConfig): + model_name: str = "prompt" + model_path: str = "Salesforce/codegen-350M-mono" + + +@dataclass +class DiffModelConfig(ModelConfig): + model_name: str = "diff" + model_path: str = "CarperAI/diff-codegen-350m-v2" + + +@dataclass +class QDConfig(BaseConfig): + init_steps: int = 2 + total_steps: int = 5 + + +@dataclass +class MAPElitesConfig(QDConfig): + history_length: int = 1 + save_history: bool = False + map_grid_size: tuple[int, ...] = field(default_factory=lambda: (12,)) + + +@dataclass +class EnvConfig(BaseConfig): + timeout: float = 5.0 # Seconds + sandbox: bool = False + sandbox_server: str = "http://localhost:5000" processes: int = 12 + batch_size: int = 32 # Batch size of MAP-Elites + env_name: str = MISSING + debug: bool = False + + +@dataclass +class SodaraceEnvConfig(EnvConfig): + env_name: str = "sodarace" + eval_ms: int = 1000 # Milliseconds + behavior_space: list[list[float]] = field( + default_factory=lambda: [ + # Height, Width, Mass dimensions + [0, 1000], + [0, 1000], + [0, 2000], + ] + ) + starting_seeds: list[str] = field(default_factory=lambda: ["square"]) + instruction: int = 1 + crossover: bool = False + + +@dataclass +class ImageEnvConfig(EnvConfig): + env_name: str = "image_evolution" + behavior_mode: str = "3-channel" + target: str = "circle" + + +@dataclass +class P3EnvConfig(EnvConfig): + env_name: str = "p3_problem" + solutions_per_problem: int = 128 + prompt_size: str = "long" # med or long + timeout: float = 1.0 + + +defaults_elm = [ + {"model": "prompt"}, + {"qd": "mapelites"}, + {"env": "sodarace"}, + "_self_", +] + + +@dataclass +class ELMConfig(BaseConfig): + hydra: Any = field( + default_factory=lambda: { + "run": {"dir": "logs/elm/${hydra.job.override_dirname}"} + } + ) + defaults: list[Any] = field(default_factory=lambda: defaults_elm) + model: Any = MISSING + qd: Any = MISSING + env: Any = MISSING run_name: Optional[str] = None - sandbox: bool = False + + +defaults_p3 = [ + {"model": "prompt"}, + {"env": "p3_problem"}, + "_self_", +] @dataclass -class ImageELMConfig(BaseConfig): +class P3Config(BaseConfig): hydra: Any = field( default_factory=lambda: { - "run": {"dir": "logs/elm/image/${hydra.job.override_dirname}"} + "run": {"dir": "logs/p3/${hydra.job.override_dirname}"} } ) - model: str = "Salesforce/codegen-350M-mono" - batch_size: int = 32 - fp16: bool = True - cuda: bool = True - gpus: int = 1 - seed: Optional[int] = None - debug: bool = False - deterministic: bool = False - top_p: float = 0.95 - temp: float = 0.85 - timeout: float = 5.0 # Seconds - evaluation_steps: int = 1000 # Milliseconds - gen_max_len: int = 1024 - evo_init_steps: int = 10 - evo_n_steps: int = 15 - behavior_n_bins: int = 12 - evo_history_length: int = 1 - processes: int = 12 + defaults: list[Any] = field(default_factory=lambda: defaults_p3) + model: Any = MISSING + env: Any = MISSING + save_result_obj: bool = False + # set >0, evaluate pass@k of previous runs using this k, instead of doing a new run + eval_k: int = -1 + # optionally provide timestamp of run to eval pass@k, otherwise eval with + # latest run of every problem + eval_timestamp: str = "" run_name: Optional[str] = None - sandbox: bool = False -# TODO: Hierarchy of configs -# e.g. ModelConfig, QDConfig, EnvConfig, etc. 
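Because ELMConfig and P3Config declare Hydra defaults lists over the model / qd / env groups (registered with the ConfigStore at the bottom of this file), a run can be composed and overridden without any YAML files. The snippet below is a hedged sketch using Hydra's compose API, roughly equivalent to what run_elm.py receives via @hydra.main; the override values are illustrative only.

from hydra import compose, initialize
from omegaconf import OmegaConf

import openelm.configs  # noqa: F401  # importing the module registers the config groups

with initialize(version_base="1.2", config_path=None):
    cfg = compose(
        config_name="elmconfig",
        overrides=["model=diff", "qd.init_steps=10", "env.batch_size=16"],
    )
print(OmegaConf.to_yaml(cfg))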
-# Also add base ELMConfig(BaseConfig) +def register_configstore() -> ConfigStore: + """Register configs with Hydra's ConfigStore.""" + cs = ConfigStore.instance() + cs.store(group="env", name="sodarace", node=SodaraceEnvConfig) + cs.store(group="env", name="image_evolution", node=ImageEnvConfig) + cs.store(group="env", name="p3_problem", node=P3EnvConfig) + cs.store(group="qd", name="mapelites", node=MAPElitesConfig) + cs.store(group="model", name="prompt", node=PromptModelConfig) + cs.store(group="model", name="diff", node=DiffModelConfig) + cs.store(name="elmconfig", node=ELMConfig) + cs.store(name="p3config", node=P3Config) + return cs -cs = ConfigStore.instance() -cs.store(name="elm_cfg", node=ConfigClass) +CONFIGSTORE = register_configstore() diff --git a/src/openelm/diff_model.py b/src/openelm/diff_model.py deleted file mode 100644 index e2982e5c..00000000 --- a/src/openelm/diff_model.py +++ /dev/null @@ -1,347 +0,0 @@ -import functools -import json -import os -import re -from abc import ABC, abstractmethod -from dataclasses import dataclass - -import numpy as np -import requests - -from openelm.codegen import model_setup, sample, set_seed, truncate -from openelm.configs import SodaraceELMConfig -from openelm.environments.sodaracer import IMPORTS, SQUARE_PREREQ, Walker -from openelm.utils.code_eval import pool_exec_processes -from openelm.utils.diff_eval import apply_diff, split_diff - - -class MutationModel(ABC): - """Base model class for all mutation models.""" - - @abstractmethod - def generate_program(self, code_batch: list[str]) -> list[dict]: - pass - - -@dataclass -class FunctionTemplate: - """ - A function template for a mutation model. - - Attributes: - func_name: (str) The name of the function that we want to execute. - import_line: (str) The import lines we add to the code. - func_preamble: (str) The function definition, as well as potentially a - few initial lines to generate code. - instruction (str): The instruction we give to the model, before the - preamble. - """ - - func_name: str - import_line: str - func_preamble: str - instruction: str - - -class PromptMutationModel(MutationModel): - """Mutation model that uses prompts to change a seed.""" - - def __init__( - self, - cfg: SodaraceELMConfig, - function_template: FunctionTemplate, - sandbox_server: str = "http://localhost:5000", - ) -> None: - - self.cfg: SodaraceELMConfig = cfg - seed: int = set_seed(self.cfg.seed) - # Use RNG to rotate random seeds during inference. - self.rng = np.random.default_rng(seed=seed) - self.sandbox_server = sandbox_server - os.environ["TOKENIZERS_PARALLELISM"] = "false" - self.model, self.tokenizer, self.device = model_setup(self.cfg) - self.func_template: FunctionTemplate = function_template - - def construct_prompt(self, code: str) -> tuple[str, str]: - """ - Construct a prompt from a code string. - - Args: - code (str): The code string. - - Returns: - A tuple of the prompt string and imports plus instruction. - """ - prompt_str = ( - code + self.func_template.instruction + self.func_template.func_preamble - ) - preamble_str = ( - self.func_template.import_line - + self.func_template.instruction - + self.func_template.func_preamble - ) - return prompt_str, preamble_str - - def generate_program(self, code_batch: list[str]) -> list[dict]: - """ - Generate a new program from a batch of programs. - - Given a piece of code, do prompt mutation, execute the code, - and return the result. - - Args: - code (str): The full code string. 
- - Returns: - A numpy array (if successful) or the exception object. - """ - prompts, preamble_strings = zip(*map(self.construct_prompt, code_batch)) - encodings = self.tokenizer( - list(prompts), - truncation=True, - padding=True, - return_tensors="pt", - ) - completions: list[str] = sample( - encodings, - self.cfg, - self.model, - self.tokenizer, - batch_size=1, - ) - local_scope_exec: bool = len(self.func_template.func_preamble) > 0 - trunc = functools.partial(truncate, only_local_scope=local_scope_exec) - self.truncations: list[str] = [ - preamble_strings[i] + trunc(completions[i]) for i in range(len(completions)) - ] - if self.cfg.sandbox: - results = [] - for code in self.truncations: - resp = self._get_response(code, self.cfg.timeout) - if resp.status_code == 200: - return_dict = json.loads(resp.text) - results.append(return_dict) - else: - results = pool_exec_processes( - self.truncations, - func_name=self.func_template.func_name, - timeout=self.cfg.timeout, - processes=self.cfg.processes, - debug=self.cfg.debug, - ) - return self._post_process(results) - - @abstractmethod - def _get_response(self, code: str, timeout: float) -> requests.models.Response: - raise NotImplementedError - - @abstractmethod - def _post_process(self, results: list) -> list: - raise NotImplementedError - - -class PromptMutationForSodarace(PromptMutationModel): - def __init__(self, cfg, sandbox_server="http://localhost:5000") -> None: - function_template = FunctionTemplate( - func_name="make_walker", - import_line=IMPORTS + SQUARE_PREREQ, - instruction="", - func_preamble="def make_walker():\n", - ) - super().__init__(cfg, function_template, sandbox_server) - - def _get_response(self, code: str, timeout: float) -> requests.models.Response: - return requests.post( - f"{self.sandbox_server}/gen_racer", - json={"code": code, "timeout": timeout}, - timeout=timeout, - ) - - def _post_process(self, results: list) -> list: - if self.cfg.sandbox: - return results - else: - result_list: list = [] - for i, result in enumerate(results): - try: - if isinstance(result, Walker) and result.validate(): - result_list.append( - { - "program_str": self.truncations[i], - "result_obj": result.to_dict(), - } - ) - else: - if self.cfg.debug: - print("Failed execution, type:", result) - print(self.truncations[i]) - except Exception as e: - if self.cfg.debug: - print(type(e), e) - return result_list - - -class PromptMutationForImgTask(PromptMutationModel): - def __init__(self, cfg, sandbox_server="http://localhost:5000") -> None: - func_name = "draw" - func_preamble = ( - f'def {func_name}():\n\t"""Draw a yellow circle.\n' - '\t"""\n\tpic = np.zeros((32, 32, 3))\n' - ) - function_template = FunctionTemplate( - func_name=func_name, - import_line="import math\nimport numpy as np", - func_preamble=func_preamble, - instruction="", - ) - super().__init__(cfg, function_template, sandbox_server) - - def reset_shape(self, shape: tuple): - func_name = self.func_template.func_name - self.func_preamble = f'def {func_name}():\n\t"""Draw a yellow circle.\n\t"""\n\tpic = np.zeros({shape})\n' - - def _get_response(self, code: str, timeout: float) -> requests.models.Response: - func_name = self.func_template.func_name - return requests.post( - f"{self.sandbox_server}/eval_imageoptim_func", - json={"code": code, "func_name": func_name, "timeout": timeout}, - timeout=timeout, - ) - - def _post_process(self, results: list) -> list: - for i in range(len(results)): - results[i]["result_obj"] = np.array(results[i]["result_obj"]) - return results - - 
-class DiffModel(PromptMutationModel): - def __init__( - self, - cfg: SodaraceELMConfig, - function_template: FunctionTemplate, - sandbox_server: str = "http://localhost:5000", - ) -> None: - super().__init__(cfg, function_template, sandbox_server) - - def construct_prompt(self, code: str) -> tuple[str, str]: - prompt_list = [ - " walker.py\n ", - code, - "\n Fixed bugs", - ] - prompt_str = "".join(prompt_list) - prompt_str = ( - code + self.func_template.instruction + self.func_template.func_preamble - ) - preamble_str = ( - self.func_template.import_line - + self.func_template.instruction - + self.func_template.func_preamble - ) - return prompt_str, preamble_str - - def generate_program(self, code_batch: list[str]) -> list[dict]: - """ - Generate a new program for a diff model from a batch of programs. - - Given a piece of code, do prompt mutation, execute the code, - and return the result. - - Args: - code (str): The full code string. - - Returns: - A numpy array (if successful) or the exception object. - """ - prompts, preamble_strings = zip(*map(self.construct_prompt, code_batch)) - encodings = self.tokenizer( - list(prompts), - truncation=True, - padding=True, - return_tensors="pt", - ) - completions: list[str] = sample( - encodings, - self.cfg, - self.model, - self.tokenizer, - batch_size=1, - ) - - local_scope_exec: bool = len(self.func_template.func_preamble) > 0 - end_of_diff = re.compile("\n[^ +-@]+") - trunc = functools.partial(truncate, only_local_scope=local_scope_exec) - self.truncations: list[str] = [ - preamble_strings[i] + trunc(completions[i]) for i in range(len(completions)) - ] - outputs = [] - for i, code in enumerate(self.truncations): - # split the diff text according to , , , . - parsed: dict = split_diff(code) - # truncate the diff hunk at the first line not starting with " ", - # "+", "-", or "@". 
- if parsed and all( - (s in parsed for s in ["name", "file", "message", "diff"]) - ): - diff_hunk: str = end_of_diff.split(parsed["diff"])[0] - nme_idx: int = diff_hunk.find("") - if nme_idx != -1: - diff_hunk = diff_hunk[:nme_idx] - outputs.append(apply_diff(prompts[i], diff_hunk)) - if self.cfg.sandbox: - results = [] - for code in outputs: - resp = self._get_response(code, self.cfg.timeout) - if resp.status_code == 200: - return_dict = json.loads(resp.text) - results.append(return_dict) - else: - results = pool_exec_processes( - outputs, - func_name=self.func_template.func_name, - timeout=self.cfg.timeout, - processes=self.cfg.processes, - debug=self.cfg.debug, - ) - return self._post_process(results) - - -class DiffModelForSodarace(DiffModel): - def __init__(self, cfg, sandbox_server="http://localhost:5000") -> None: - function_template = FunctionTemplate( - func_name="make_walker", - import_line=IMPORTS + SQUARE_PREREQ, - instruction="", - func_preamble="def make_walker():\n", - ) - super().__init__(cfg, function_template, sandbox_server) - - def _get_response(self, code: str, timeout: float) -> requests.models.Response: - return requests.post( - f"{self.sandbox_server}/gen_racer", - json={"code": code, "timeout": timeout}, - timeout=timeout, - ) - - def _post_process(self, results: list) -> list: - if self.cfg.sandbox: - return results - else: - result_list: list = [] - for i, result in enumerate(results): - try: - if isinstance(result, Walker) and result.validate(): - result_list.append( - { - "program_str": self.truncations[i], - "result_obj": result.to_dict(), - } - ) - else: - if self.cfg.debug: - print("Failed execution, type:", result) - print(self.truncations[i]) - except Exception as e: - if self.cfg.debug: - print(type(e), e) - return result_list diff --git a/src/openelm/elm.py b/src/openelm/elm.py index 382b2eaf..7f449c3b 100644 --- a/src/openelm/elm.py +++ b/src/openelm/elm.py @@ -1,17 +1,13 @@ -from openelm.environments import ( - ImageOptim, - Sodarace, - image_init_args, - sodarace_init_args, -) -from openelm.map_elites import MAPElites +from typing import Optional -ENVS_DICT = {"sodarace": Sodarace, "imageoptim": ImageOptim} -ARG_DICT = {"sodarace": sodarace_init_args, "imageoptim": image_init_args} +from openelm.configs import DiffModelConfig, ELMConfig, PromptModelConfig +from openelm.environments import ENVS_DICT +from openelm.map_elites import MAPElites +from openelm.mutation_model import DiffModel, MutationModel, PromptModel class ELM: - def __init__(self, cfg, diff_model_cls=None, env_args: dict = None) -> None: + def __init__(self, config: ELMConfig) -> None: """ The main class of ELM. @@ -19,41 +15,43 @@ def __init__(self, cfg, diff_model_cls=None, env_args: dict = None) -> None: from the passed config. Args: - cfg: The config (e.g. OmegaConf who uses dot to access members). - diff_model_cls: (Optional) The class of diff model. One can apply - alternative models here for comparison. - env_args: (Optional) The argument dict for Environment. + config: The config containing the diff model, environment, and QD algorithm. """ - self.cfg = cfg - - # Get the defaults if `env_args` is not specified. - if env_args is None: - env_args = ARG_DICT[self.cfg.env_name] - env_args["config"] = self.cfg # Override default environment config - - # Override diff model if `diff_model_cls` is specified. 
- if diff_model_cls is not None: - self.diff_model = diff_model_cls(self.cfg) - env_args = {**env_args, "diff_model": self.diff_model} - else: - self.diff_model = None - - self.seed = env_args["seed"] - self.environment = ENVS_DICT[self.cfg.env_name](**env_args) + self.config: ELMConfig = config + env_name: str = self.config.env.env_name + if isinstance(self.config.model, PromptModelConfig): + self.mutation_model: MutationModel = PromptModel(self.config.model) + elif isinstance(self.config.model, DiffModelConfig): + self.mutation_model = DiffModel(self.config.model) + + self.environment = ENVS_DICT[env_name]( + config=self.config.env, + mutation_model=self.mutation_model, + ) self.qd_algorithm = MAPElites( self.environment, - n_bins=self.cfg.behavior_n_bins, - history_length=self.cfg.evo_history_length, + map_grid_size=self.config.qd.map_grid_size, + history_length=self.config.qd.history_length, + save_history=self.config.qd.save_history, ) - def run(self) -> str: + def run( + self, init_steps: Optional[int] = None, total_steps: Optional[int] = None + ) -> str: """ Run the ELM algorithm to evolve the population in the environment. + Args: + init_steps: The number of steps to run the initialisation phase. + total_steps: The number of steps to run the QD algorithm in total, + including init_steps. + Returns: str: A string representing the maximum fitness genotype. The `qd_algorithm` class attribute will be updated. """ - return self.qd_algorithm.search( - initsteps=self.cfg.evo_init_steps, totalsteps=self.cfg.evo_n_steps - ) + if init_steps is None: + init_steps = self.config.qd.init_steps + if total_steps is None: + total_steps = self.config.qd.total_steps + return self.qd_algorithm.search(init_steps=init_steps, total_steps=total_steps) diff --git a/src/openelm/environments/__init__.py b/src/openelm/environments/__init__.py index 58bd70ea..3e7a34b9 100644 --- a/src/openelm/environments/__init__.py +++ b/src/openelm/environments/__init__.py @@ -1,4 +1,4 @@ -import numpy as np +from typing import Any from openelm.environments.environments import ( BaseEnvironment, @@ -6,67 +6,124 @@ Genotype, ImageOptim, MatchString, + P3Problem, Sodarace, ) -from openelm.environments.sodaracer import IMPORTS, SQUARE, SQUARE_PREREQ - -# ----- Generate sample seeds and init args for environments ----- -# They are simple template arguments to initialize several environments. 
-# Sample usage: -# from openelm.environment import sodarace_init_args -# sodarace = Sodarace(**sodarace_init_args, run_name="test") - - -IMAGE_SEED = { - "program_str": """import numpy as np -def draw_blue_rectangle() -> np.ndarray: -\tpic = np.zeros((32, 32, 3)) -\tfor x in range(2, 30): -\t\tfor y in range(2, 30): -\t\t\tpic[x, y] = np.array([0, 0, 255]) -\treturn pic -""", - "result_obj": None, + +P3_MED_SEED = { + "program_str": """from typing import List + +def f1(s: str): + return "Hello " + s == "Hello world" + +def g1(): + return "world" + +assert f1(g1()) + +def f2(s: str): + return "Hello " + s[::-1] == "Hello world" + +def g2(): + return "world"[::-1] + +assert f2(g2()) + +def f3(x: List[int]): + return len(x) == 2 and sum(x) == 3 + +def g3(): + return [1, 2] + +assert f3(g3()) + +def f4(s: List[str]): + return len(set(s)) == 1000 and all( + (x.count("a") > x.count("b")) and ('b' in x) for x in s) + +def g4(): + return ["a"*(i+2)+"b" for i in range(1000)] + +assert f4(g4()) + +def f5(n: int): + return str(n * n).startswith("123456789") + +def g5(): + return int(int("123456789" + "0"*9) ** 0.5) + 1 + +assert f5(g5())""", + "result_obj": {}, } -exec(IMAGE_SEED["program_str"], globals()) -IMAGE_SEED["result_obj"] = globals()["draw_blue_rectangle"]() -target = np.zeros((32, 32, 3)) -for y in range(32): - for x in range(32): - if (y - 16) ** 2 + (x - 16) ** 2 <= 100: # a radius-10 circle - target[y, x] = np.array([1, 1, 0]) - - -SQUARE_SEED = { - "program_str": IMPORTS + SQUARE_PREREQ + SQUARE, - "result_obj": { - "joints": [(0, 0), (0, 10), (10, 10), (10, 0), (5, 5)], - "muscles": [ - [0, 1, {"type": "distance", "amplitude": 0.0, "phase": 0.0}], - [1, 2, {"type": "distance", "amplitude": 0.0, "phase": 0.0}], - [2, 3, {"type": "distance", "amplitude": 0.0, "phase": 0.0}], - [3, 0, {"type": "distance", "amplitude": 0.0, "phase": 0.0}], - [3, 4, {"type": "distance", "amplitude": 0.0, "phase": 0.0}], - [0, 4, {"type": "muscle", "amplitude": 5.0, "phase": 0.0}], - [1, 4, {"type": "muscle", "amplitude": 10.0, "phase": 0.0}], - [2, 4, {"type": "muscle", "amplitude": 2.0, "phase": 0.0}], - ], - }, + +P3_LONG_SEED = { + "program_str": '''from typing import List + +def f1(s: str): + return "Hello " + s == "Hello world" + +def g1(): + """Find a string that when concatenated onto 'Hello ' gives 'Hello world'.""" + return "world" + +assert f1(g1()) + +def f2(s: str): + return "Hello " + s[::-1] == "Hello world" + +def g2(): + """Find a string that when reversed and concatenated onto 'Hello ' gives 'Hello world'.""" + return "world"[::-1] + +assert f2(g2()) + +def f3(x: List[int]): + return len(x) == 2 and sum(x) == 3 + +def g3(): + """Find a list of two integers whose sum is 3.""" + return [1, 2] + +assert f3(g3()) + +def f4(s: List[str]): + return len(set(s)) == 1000 and all( + (x.count("a") > x.count("b")) and ('b' in x) for x in s) + +def g4(): + """Find a list of 1000 distinct strings which each have more 'a's than 'b's and at least one 'b'.""" + return ["a"*(i+2)+"b" for i in range(1000)] + +assert f4(g4()) + +def f5(n: int): + return str(n * n).startswith("123456789") + +def g5(): + """Find an integer whose perfect square begins with 123456789 in its decimal representation.""" + return int(int("123456789" + "0"*9) ** 0.5) + 1 + +assert f5(g5())''', + "result_obj": {}, } -# A sample init args for ImageOptim -image_init_args = { - "seed": IMAGE_SEED, - "config": "openelm/config/elm_image_cfg.yaml", - "target_img": target, +p3_med_init_args = { + "seed": P3_MED_SEED, + "config": 
"openelm/config/elm_p3_cfg.yaml", "diff_model": None, - "behavior_mode": "3-channel", } -# A sample init args for Sodarace -sodarace_init_args = {"seed": SQUARE_SEED, "diff_model": None, "eval_ms": 1000} +p3_long_init_args = { + "seed": P3_LONG_SEED, + "config": "openelm/config/elm_p3_cfg.yaml", + "diff_model": None, +} -# ----- (Sample init args end) ----- +ENVS_DICT: dict[str, Any] = { + "sodarace": Sodarace, + "image_evolution": ImageOptim, + "p3": P3Problem, +} __all__ = [ "Genotype", @@ -75,8 +132,6 @@ def draw_blue_rectangle() -> np.ndarray: "ImageOptim", "MatchString", "Sodarace", - "IMAGE_SEED", - "image_init_args", - "SQUARE_SEED", - "sodarace_init_args", + "ENVS_DICT", + "P3Problem", ] diff --git a/src/openelm/environments/env_utils.py b/src/openelm/environments/env_utils.py new file mode 100644 index 00000000..fbad4a12 --- /dev/null +++ b/src/openelm/environments/env_utils.py @@ -0,0 +1,23 @@ +import numpy as np + + +def get_image_target(name: str) -> np.ndarray: + if name == "circle": + target = np.zeros((32, 32, 3)) + for y in range(32): + for x in range(32): + if (y - 16) ** 2 + (x - 16) ** 2 <= 100: # a radius-10 circle + target[y, x] = np.array([1, 1, 0]) + else: + raise NotImplementedError(f"Image target {name} not implemented") + return target + + +IMAGE_SEED: str = """ +def draw(): +\tpic = np.zeros((32, 32, 3)) +\tfor x in range(2, 30): +\t\tfor y in range(2, 30): +\t\t\tpic[x, y] = np.array([0, 0, 255]) +\treturn pic +""" diff --git a/src/openelm/environments/environments.py b/src/openelm/environments/environments.py index 15b1209e..bbcb1931 100644 --- a/src/openelm/environments/environments.py +++ b/src/openelm/environments/environments.py @@ -1,15 +1,31 @@ +import json import math import string +import sys from abc import ABC, abstractmethod -from dataclasses import is_dataclass -from typing import Generic, Optional, TypeVar, Union +from typing import Generic, Optional, Type, TypeVar, Union import numpy as np -from omegaconf import DictConfig, OmegaConf - -from openelm.configs import BaseConfig, ImageELMConfig, SodaraceELMConfig -from openelm.diff_model import PromptMutationForImgTask, PromptMutationForSodarace -from openelm.environments.sodaracer import SodaraceSimulator +import requests + +from openelm.configs import EnvConfig, ImageEnvConfig, P3EnvConfig, SodaraceEnvConfig +from openelm.environments.env_utils import IMAGE_SEED, get_image_target +from openelm.environments.sodaracer import ( + CIRCLE, + GALLOPER_PREREQ, + IMPORTS, + INSTRUCTIONS, + QUERY_CPPN, + SEEDS_DICT, + SQUARE_PREREQ, + SodaraceSimulator, + Walker, +) +from openelm.mutation_model import MutationModel +from openelm.utils.code_eval import pool_exec_processes, type_check + +sys.set_int_max_str_digits(0) # remove length limitation for int->str conversion +# (model sometimes outputs really long ints) Phenotype = Optional[np.ndarray] @@ -41,6 +57,7 @@ class BaseEnvironment(ABC, Generic[GenoType]): def __init__(self) -> None: self.genotype_space: np.ndarray self.batch_size: int + self.config: EnvConfig @abstractmethod def random(self) -> list[GenoType]: @@ -67,18 +84,6 @@ def behavior_space(self) -> np.ndarray: def behavior_ndim(self) -> int: return self.behavior_space.shape[1] - @staticmethod - def _load_config(config): - # TODO: convert all to dataclass - if isinstance(config, str): - return OmegaConf.load(config) - elif isinstance(config, (dict, DictConfig)): - return DictConfig(config) - elif is_dataclass(config): - return OmegaConf.structured(config) - else: - raise ValueError - class 
ArrayGenotype(Genotype, np.ndarray): def __new__(cls, input_array): @@ -190,106 +195,119 @@ class ImageOptim(BaseEnvironment[ImageGeneration]): Fitness is simply the absolute difference between the returning image and the target image. To map into the behavior space, - if behavior_mode=="3-channel", the image will be divided into blocks + if behavior_ndims=="3-channel", the image will be divided into blocks (specified in `block_size`), and average values of RGB channels in each block will be put together as a point in the behavior space (average-pooling). """ - default_diff_model_cls = PromptMutationForImgTask - # Record different definitions of behavior spaces in a dict. Feel free to add. - behavior_mode_spec = {"3-channel-avg": {"genotype_ndim": 3}} + # Record different definitions of behavior spaces in a dict. + behavior_ndims = {"3-channel": 3} def __init__( self, - seed: dict, - config: Union[str, dict, DictConfig], - target_img: np.ndarray, - diff_model, - behavior_mode: str = "3-channel", - run_name: Optional[str] = None, + config: ImageEnvConfig, + mutation_model: MutationModel, ): - """ - Mutate programs that return images. + self.config: ImageEnvConfig = config + self.batch_size = self.config.batch_size + self.target_img: np.ndarray = get_image_target(self.config.target) + self.seed: str = IMAGE_SEED + self.mutation_model: MutationModel = mutation_model - Fitness is simply the absolute difference between the returning - image and the target image. To map into the behavior space, - if behavior_mode=="3-channel", the image will be divided into blocks - (specified in `block_size`), and average values of RGB channels in each - block will be put together as a point in the behavior space (average-pooling). + self.behavior_mode: str = self.config.behavior_mode + self.genotype_ndim: int = self.behavior_ndims[self.behavior_mode] + self.genotype_space = np.repeat([[0, 255]], self.genotype_ndim, axis=0).T - Args: - seed: the seed dict. - config: the config file path or dict. - target_img: the target image. - diff_model: the diff model (or alternatives). - behavior_mode: (Optional) a string indicating the way an individual - is mapped into behavior space. - run_name: (Optional) override the run_name in config. 
- """ - if isinstance(seed, dict): - self.seed = ImageGeneration(**seed) + def construct_prompt( + self, code_batch: Optional[Union[list[str], str]] = None + ) -> dict[str, str]: + prompt_str: str = "import math\nimport numpy as np\n" + instruction_str: str = """ +def draw(): + \"\"\"Draw a yellow circle.\"\"\" + pic = np.zeros((32, 32, 3)) +""" + import_str: str = prompt_str + if code_batch is None: + # Initialization steps + prompt_str += self.seed else: - raise TypeError - - self.config: ImageELMConfig = self._load_config(config) - if run_name is not None: - self.config.run_name = run_name - - self.target_img = target_img - self.shape = target_img.shape - - if diff_model is None: - self.diff_model = self.default_diff_model_cls(self.config) + # Evolution steps + if isinstance(code_batch, list): + prompt_str += code_batch[0] + elif isinstance(code_batch, str): + prompt_str += code_batch + import_str += instruction_str + prompt_str += instruction_str + return {"prompt": prompt_str, "template": import_str} + + def generate_programs( + self, code_batch: list[dict[str, str]] + ) -> list[ImageGeneration]: + func_name: str = "draw" + generated_programs = self.mutation_model.generate_programs( + code_batch, local_scope_truncate=True + ) + if self.config.sandbox: + results = [] + for code in generated_programs: + resp = requests.post( + f"{self.config.sandbox_server}/eval_imageoptim_func", + json={ + "code": code, + "func_name": func_name, + "timeout": self.config.timeout, + }, + timeout=self.config.timeout, + ) + if resp.status_code == 200: + return_dict = json.loads(resp.text) + results.append(return_dict) + return [ImageGeneration(**p) for p in results] + # for i in range(len(results)): + # results[i]["result_obj"] = np.array(results[i]["result_obj"]) + # return results else: - self.diff_model = diff_model - - self.behavior_mode = behavior_mode - self.genotype_ndim: int = self.behavior_mode_spec[self.behavior_mode][ - "genotype_ndim" - ] - self.genotype_space = np.repeat([[0, 255]], self.genotype_ndim, axis=0).T - - def generate_program(self, code_batch: list[str]) -> list[ImageGeneration]: - """ - Call LM to generate a new program and run it. - - Returns: - An ImageGeneration object containing the code, the resulting image - and the error code. - """ - generated_programs = self.diff_model.generate_program(code_batch) - return [ImageGeneration(**p) for p in generated_programs] + results = pool_exec_processes( + generated_programs, + func_name=func_name, + timeout=self.config.timeout, + processes=self.config.processes, + debug=self.config.debug, + ) + result_list: list = [] + for i, result in enumerate(results): + try: + if isinstance(result, np.ndarray): + result_list.append( + { + "program_str": generated_programs[i], + "result_obj": result, + } + ) + else: + if self.config.debug: + print("Failed execution, type:", result) + print(generated_programs[i]) + except Exception as e: + if self.config.debug: + print(type(e), e) + return [ImageGeneration(**p) for p in result_list] def random(self) -> list[ImageGeneration]: - """ - Randomly generate a batch of codes and evaluate their outputs. - - Returns: - a tuple of the code string and the returning result (None if there - is error). 
- """ - program_str_list = [self.seed.program_str] * self.batch_size - new_images = self.generate_program(program_str_list) + program_list = [self.construct_prompt() for _ in range(self.config.batch_size)] + new_images = self.generate_programs(program_list) return new_images def mutate(self, images_list: list[ImageGeneration]) -> list[ImageGeneration]: - """ - Randomly mutate a batch of codes and evaluate their outputs. - - Args: - x: the individual to be mutated. - - Returns: - a tuple of the code string and the returning result (None if there - is an error). - """ - program_str_list = [sr.program_str for sr in images_list] - new_images = self.generate_program(program_str_list) + images = [img.program_str for img in images_list] + program_list = list(map(self.construct_prompt, images)) + new_images = self.generate_programs(program_list) return new_images def fitness(self, x: ImageGeneration) -> float: - if not x.valid or x.result_obj.shape != self.shape: + if not x.valid or x.result_obj.shape != self.target_img.shape: return -np.inf return -np.abs(x.result_obj - self.target_img).sum() @@ -305,7 +323,6 @@ def __init__(self, program_str: str, result_obj: dict): """ self.program_str: str = program_str self.result_obj: dict = result_obj - # self._fitness: Optional[float] = None # Check whether the Sodaracer is valid. try: @@ -320,6 +337,9 @@ def __init__(self, program_str: str, result_obj: dict): def evaluate(self, eval_ms: int) -> float: self._fitness = self.simulator.evaluate(eval_ms) # if self._fitness is None: + # print(self.valid) + # self.simulator = SodaraceSimulator(body=self.result_obj) + # print(self.evaluate(0)) return self._fitness def __str__(self) -> str: @@ -343,74 +363,279 @@ def fitness(self) -> Optional[float]: class Sodarace(BaseEnvironment[Sodaracer]): - default_diff_model_cls = PromptMutationForSodarace - def __init__( self, - seed: dict, - config: Union[str, dict, DictConfig, BaseConfig], - diff_model, - eval_ms: int, - max_height: int = 1000, - max_width: int = 1000, - max_mass: int = 2000, - ndim: int = 3, - run_name: Optional[str] = None, + config: SodaraceEnvConfig, + mutation_model: MutationModel, ) -> None: """ Sodarace environment. Args: - seed: the seed dict. - config: the config file path or dict. - diff_model: the diff model (or alternatives). - eval_ms: The time in ms for sodaracer evaluation. - max_height: (Optional) the maximal height. - max_width: (Optional) the maximal width. - max_mass: (Optional) the maximal mass. - ndim: (Optional) the dimension of behavior space. - run_name: (Optional) override the run_name in config. + seeds: the seed dict. + config: the environment config. + mutation_model: the mutation model. 
""" - if isinstance(seed, dict): - self.seed = Sodaracer(**seed) + self.config: SodaraceEnvConfig = config + self.batch_size = self.config.batch_size + self.mutation_model: MutationModel = mutation_model + + self.genotype_space = np.array(self.config.behavior_space).T + self.genotype_ndim = self.genotype_space.shape[1] + + self.seed_strs: list[str] = self.config.starting_seeds + + def construct_prompt( + self, code_batch: Optional[Union[list[str], str]] = None + ) -> dict[str, str]: + prompt_str: str = IMPORTS + if "square" in self.seed_strs: + prompt_str += SQUARE_PREREQ + if "galloper" in self.seed_strs: + prompt_str += GALLOPER_PREREQ + if "radial" in self.seed_strs or "wheel" in self.seed_strs: + prompt_str += CIRCLE + if ( + "cppn_fixed" in self.seed_strs + or "cppn_mutable" in self.seed_strs + or "runner" in self.seed_strs + ): + prompt_str += QUERY_CPPN + # For crossover: + # If init steps, combine seeds and prereqs, and use instruction 3 code below. + # For all other steps, prepend all prereqs and ignore instruction 3 code. + # For non-crossover + # Always preprend prereq, and len(code_batch) == 1 + import_str: str = prompt_str + if code_batch is None: + # Initialization steps + seeds = [SEEDS_DICT[seed] for seed in self.seed_strs] + if not self.config.crossover: + # TODO: Sample from seeds randomly + prompt_str += seeds[0] + elif self.config.crossover: + if self.config.instruction == 3: + instruction_str: str = INSTRUCTIONS[self.config.instruction].split( + "," + )[0] + for seed in seeds: + prompt_str += seed + if self.config.instruction == 3: + reverse_seeds: dict[str, str] = { + v: k for k, v in SEEDS_DICT.items() + } + instruction_str += reverse_seeds[seed] + ", " + if self.config.instruction == 3: + instruction_str += INSTRUCTIONS[self.config.instruction].split(",")[ + 1 + ] + raise NotImplementedError else: - raise TypeError - # TODO: rewrite config code to make everything an instance of a dataclass - self.config: SodaraceELMConfig = self._load_config(config) - if run_name is not None: - self.config.run_name = run_name - - if diff_model is None: - self.diff_model = self.default_diff_model_cls(self.config) + # Evolution steps + if not self.config.crossover: + if isinstance(code_batch, list): + # TODO: get nearby genotypes + prompt_str += code_batch[0] + elif isinstance(code_batch, str): + prompt_str += code_batch + elif self.config.crossover: + # Crossover + raise NotImplementedError + instruction_str = INSTRUCTIONS[self.config.instruction] + import_str += instruction_str + prompt_str += instruction_str + return {"prompt": prompt_str, "template": import_str} + + def generate_programs(self, code_batch: list[dict[str, str]]) -> list[Sodaracer]: + """Generate new programs with a mutation model and evaluate them.""" + local_scope_exec: bool = self.config.instruction != 0 + generated_programs = self.mutation_model.generate_programs( + code_batch, local_scope_exec + ) + if self.config.sandbox: + results = [] + for code in generated_programs: + resp = requests.post( + f"{self.config.sandbox_server}/gen_racer", + json={"code": code, "timeout": self.config.timeout}, + timeout=self.config.timeout, + ) + if resp.status_code == 200: + return_dict = json.loads(resp.text) + results.append(return_dict) + return [Sodaracer(**p) for p in results] else: - self.diff_model = diff_model - - self.batch_size = self.config.batch_size - self.eval_ms = eval_ms - self.genotype_ndim = ndim - self.genotype_space = np.array( - [[0, max_height], [0, max_width], [0, max_mass]] - ).T - - def 
generate_program(self, code_batch: list[str]) -> list[Sodaracer]: - # Call LM to generate a new program and run it, returning a dict - # containing the program string and the dict from running it. - generated_programs = self.diff_model.generate_program(code_batch) - return [Sodaracer(**p) for p in generated_programs] + results = pool_exec_processes( + generated_programs, + func_name="make_walker", + timeout=self.config.timeout, + processes=self.config.processes, + debug=self.config.debug, + ) + result_list: list = [] + for i, result in enumerate(results): + try: + if isinstance(result, Walker) and result.validate(): + result_list.append( + { + "program_str": generated_programs[i], + "result_obj": result.to_dict(), + } + ) + else: + if self.config.debug: + print("Failed execution, type:", result) + print(generated_programs[i]) + except Exception as e: + if self.config.debug: + print(type(e), e) + return [Sodaracer(**p) for p in result_list] def random(self) -> list[Sodaracer]: - program_str_list = [self.seed.program_str] * self.batch_size - new_sodaracers = self.generate_program(program_str_list) + program_list = [self.construct_prompt() for _ in range(self.config.batch_size)] + new_sodaracers = self.generate_programs(program_list) return new_sodaracers def mutate(self, sodaracer_list: list[Sodaracer]) -> list[Sodaracer]: - program_str_list = [sr.program_str for sr in sodaracer_list] - new_sodaracers = self.generate_program(program_str_list) + sodaracers = [sr.program_str for sr in sodaracer_list] + program_list = list(map(self.construct_prompt, sodaracers)) + new_sodaracers = self.generate_programs(program_list) return new_sodaracers def fitness(self, x: Sodaracer) -> float: - # Call Sodaracers environment to get the fitness. if x.valid: - return x.evaluate(self.eval_ms) + return x.evaluate(self.config.eval_ms) else: return -np.inf + + +class P3Solution(Genotype): + def __init__(self, program_str: str, result_obj: dict): + """ + Genotype for a programming puzzle solution. + + Args: + program_str: the solution program string (the g6() function). + result_obj: dict. + """ + self.program_str = program_str + self.result_obj = result_obj + + def __str__(self) -> str: + return self.program_str + + def to_phenotype(self) -> Optional[Phenotype]: + return None + + +class P3Problem(BaseEnvironment[P3Solution]): + def __init__( + self, + seed: dict, + config: P3EnvConfig, + mutation_model: MutationModel, + problem_func: str, + solution_preamble: str, + ans_type: Type, + ) -> None: + """ + P3 Environment. + + Args: + seed: the seed dict. + config: the config file path or dict. + mutation_model: the diff model (or alternatives). 
+ problem_func: the f6() function containing the programming problem + solution_preamble: the g6() function definition (must be passed in in order to include params) + ans_type: answer type + """ + if isinstance(seed, dict): + self.seed = seed + else: + raise TypeError + self.mutation_model = mutation_model + self.problem_func = problem_func + self.solution_preamble = solution_preamble + self.config = config + self.batch_size = self.config.batch_size + # The only import that's necessary as of P3 v0.2 + self.import_line = "from typing import List\n" + self.ans_type = ans_type + + def construct_prompt(self) -> dict[str, str]: + prompt_str = ( + self.seed["program_str"] + + f"\n\n{self.problem_func}" # add f6() to the prompt + f"\n\n{self.solution_preamble}" # add g6() preamble + ) + + template = f"{self.import_line}\n{self.solution_preamble}" + return {"prompt": prompt_str, "template": template} + + def generate_program(self, code_batch: list[dict[str, str]]) -> list[P3Solution]: + """Generate new programs with a mutation model and evaluate them.""" + local_scope_exec = True + generated_programs = self.mutation_model.generate_programs( + code_batch, local_scope_exec + ) + + if self.config.sandbox: + results = [] + for code in generated_programs: + resp = requests.post( + f"{self.config.sandbox_server}/eval_p3_solution", + json={"code": code, "timeout": self.config.timeout}, + timeout=self.config.timeout, + ) + if resp.status_code == 200: + return_dict = json.loads(resp.text) + results.append(return_dict) + else: + results = pool_exec_processes( + generated_programs, + func_name="g6", + timeout=self.config.timeout, + processes=self.config.processes, + debug=self.config.debug, + ) + results = [ + {"program_str": gen_prog, "result_obj": res_obj} + for (gen_prog, res_obj) in zip(generated_programs, results) + ] + return [P3Solution(**p) for p in results] + + def fitness(self, sol: P3Solution) -> float: + # If passing the solution to the problem returns True, fitness is 1.0 + # else 0.0 + if not type_check(self.ans_type, sol.result_obj): + return 0.0 + + eval_code = ( + f"{self.import_line}\n" + f"{self.problem_func}\n" + f"def run_eval():\n" + f" return f6({sol.result_obj})" + ) + + result = pool_exec_processes( + eval_code, + func_name="run_eval", + timeout=self.config.timeout, + processes=self.config.processes, + debug=self.config.debug, + ) + if result[0] is True: + return 1.0 + else: + return 0.0 + + def random(self) -> list[P3Solution]: + program_list = [self.construct_prompt() for _ in range(self.config.batch_size)] + new_solutions = self.generate_program(program_list) + return new_solutions + + def mutate(self, x: P3Solution) -> list[P3Solution]: + raise NotImplementedError + + def to_behavior_space(self, x: Sodaracer) -> Optional[Phenotype]: + raise NotImplementedError diff --git a/src/openelm/environments/sodaracer/__init__.py b/src/openelm/environments/sodaracer/__init__.py index 52d77f8d..5087e9af 100644 --- a/src/openelm/environments/sodaracer/__init__.py +++ b/src/openelm/environments/sodaracer/__init__.py @@ -240,6 +240,13 @@ def phase(x1, y1, x2, y2): "runner": RUNNER, } +INSTRUCTIONS = { + 0: "", + 1: "def make_walker():\n", + 2: "#Create a new walker by modifying the starting function above.\ndef make_walker():\n", + 3: "#Combine the ,starting programs above to make a new program.\ndef make_walker():\n", +} + __all__ = [ "IESoRWorld", "SodaraceSimulator", diff --git a/src/openelm/environments/sodaracer/simulator.py b/src/openelm/environments/sodaracer/simulator.py index 
5a9efb49..c8c22435 100644 --- a/src/openelm/environments/sodaracer/simulator.py +++ b/src/openelm/environments/sodaracer/simulator.py @@ -13,6 +13,7 @@ from pathlib import Path from typing import Any +import numpy as np from Box2D import Box2D as b2 from openelm.environments.sodaracer.helpers import ( @@ -629,7 +630,7 @@ def evaluate(self, time: float) -> float: ) return abs(end + self.morphology["offsetX"]) except Exception as e: - # print(e) + print(e) # print(self.world.bone_list) # print(self.world.muscle_list) - return None + return -np.inf diff --git a/src/openelm/map_elites.py b/src/openelm/map_elites.py index 5bd7c7b9..30ccaf7e 100644 --- a/src/openelm/map_elites.py +++ b/src/openelm/map_elites.py @@ -112,7 +112,7 @@ class MAPElites: def __init__( self, env, - n_bins: int, + map_grid_size: tuple[int, ...], init_map: Optional[Map] = None, history_length: int = 1, save_history: bool = False, @@ -125,7 +125,7 @@ def __init__( should be a subclass of `BaseEnvironment`, and should implement methods to generate random solutions, mutate existing solutions, and evaluate solutions for their fitness in the environment. - n_bins (int): Number of bins to partition the behavior space into. + map_grid_size (int): Number of bins to partition the behavior space into. init_map (Map, optional): A map to use for the algorithm. If not passed, a new map will be created. Defaults to None. history_length (int): Length of history to store for each niche (cell) @@ -137,18 +137,19 @@ def __init__( Defaults to False. """ self.env: BaseEnvironment = env - self.n_bins = n_bins + self.map_grid_size = map_grid_size self.history_length = history_length self.save_history = save_history # self.history will be set/reset each time when calling `.search(...)` self.history: dict = defaultdict(list) # discretization of space - self.bins = np.linspace(*env.behavior_space, n_bins + 1)[1:-1].T # type: ignore + # TODO: make this work for any number of dimensions + self.bins = np.linspace(*env.behavior_space, map_grid_size[0] + 1)[1:-1].T # type: ignore # TODO: abstract all maps out to a single class. # perfomance of niches if init_map is None: self.fitnesses: Map = Map( - dims=(n_bins,) * env.behavior_ndim, + dims=map_grid_size * env.behavior_ndim, fill_value=-np.inf, dtype=float, history_length=history_length, @@ -183,7 +184,7 @@ def random_selection(self) -> MapIndex: ix = np.random.choice(np.flatnonzero(self.nonzero.array)) return np.unravel_index(ix, self.nonzero.dims) - def search(self, initsteps: int, totalsteps: int, atol: float = 1.0) -> str: + def search(self, init_steps: int, total_steps: int, atol: float = 1.0) -> str: """ Run the MAP-Elites search algorithm. @@ -200,16 +201,17 @@ def search(self, initsteps: int, totalsteps: int, atol: float = 1.0) -> str: best performing solution object can be accessed via the `current_max_genome` class attribute. """ - tbar = trange(int(totalsteps)) + tbar = trange(int(total_steps)) max_fitness = -np.inf max_genome = None if self.save_history: self.history = defaultdict(list) for n_steps in tbar: - if n_steps < initsteps or self.genomes.empty: + if n_steps < init_steps or self.genomes.empty: # Initialise by generating initsteps random solutions. # If map is still empty: force to do generation instead of mutation. + # TODO: use a separate sampler, move batch size to qd config. new_individuals: list[Genotype] = self.env.random() else: # Randomly select a batch of elites from the map. 
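The bins computed above hold, per behaviour dimension, the interior edges of a uniform grid over env.behavior_space; the to_mapindex helper used in the search loop below can then reduce a phenotype to a cell index by digitising each dimension. A small illustration with the Sodarace behaviour space from SodaraceEnvConfig (to_mapindex_sketch is a stand-in name, not the class's actual method):

import numpy as np

behavior_space = np.array([[0, 0, 0], [1000, 1000, 2000]])  # height, width, mass bounds
map_grid_size = (12,)

# Interior edges, one row per behaviour dimension: shape (ndim, map_grid_size[0] - 1).
bins = np.linspace(*behavior_space, map_grid_size[0] + 1)[1:-1].T

def to_mapindex_sketch(phenotype: np.ndarray) -> tuple:
    # Each dimension is digitised independently into one of 12 cells.
    return tuple(int(np.digitize(x, b)) for x, b in zip(phenotype, bins))

print(to_mapindex_sketch(np.array([100.0, 900.0, 1550.0])))  # (1, 10, 9)

With the MAPElitesConfig default of (12,) and a 3-dimensional behaviour space, dims = map_grid_size * env.behavior_ndim gives a 12 x 12 x 12 map.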
@@ -225,6 +227,9 @@ def search(self, initsteps: int, totalsteps: int, atol: float = 1.0) -> str: # TODO: account for the case where multiple new individuals are # placed in the same niche, for saving histories. for individual in new_individuals: + fitness = self.env.fitness(individual) + if np.isinf(fitness): + continue map_ix = self.to_mapindex(individual.to_phenotype()) # if the return is None, the individual is invalid and is thrown # into the recycle bin. @@ -238,7 +243,6 @@ def search(self, initsteps: int, totalsteps: int, atol: float = 1.0) -> str: self.history[map_ix].append(individual) self.nonzero[map_ix] = True - fitness = self.env.fitness(individual) # If new fitness greater than old fitness in niche, replace. if fitness > self.fitnesses[map_ix]: self.fitnesses[map_ix] = fitness diff --git a/src/openelm/mutation_model.py b/src/openelm/mutation_model.py new file mode 100644 index 00000000..ac2eb5f6 --- /dev/null +++ b/src/openelm/mutation_model.py @@ -0,0 +1,117 @@ +import functools +import os +import re +from abc import ABC, abstractmethod + +import numpy as np + +from openelm.codegen import model_setup, sample, set_seed, truncate +from openelm.configs import ModelConfig +from openelm.utils.diff_eval import apply_diff, split_diff + + +class MutationModel(ABC): + """Base model class for all mutation models.""" + + def __init__(self) -> None: + self.config: ModelConfig + + @abstractmethod + def generate_programs(self, *args, **kwargs) -> list[str]: + raise NotImplementedError + + +class PromptModel(MutationModel): + """Mutation model that uses prompts to change a seed.""" + + def __init__(self, config: ModelConfig) -> None: + self.config: ModelConfig = config + seed: int = set_seed(self.config.seed) + # Use RNG to rotate random seeds during inference. + self.rng = np.random.default_rng(seed=seed) + os.environ["TOKENIZERS_PARALLELISM"] = "false" + self.model, self.tokenizer, self.device = model_setup(self.config) + + def generate_programs( + self, prompt_dicts: list[dict[str, str]], local_scope_truncate: bool, **kwargs + ) -> list[str]: + """ + Generate new programs from a batch of programs. + + Given a piece of code, do prompt mutation, execute the code, + and return the result. + + Args: + prompt_dicts (list[dict[str, str]]): A list of dictionaries containing + the prompt and template for each program. + local_scope_truncate (bool): Whether or not to truncate the code to + the local scope. + + Returns: + A list of code strings. + """ + prompts = [prompt_dict["prompt"] for prompt_dict in prompt_dicts] + templates = [prompt_dict["template"] for prompt_dict in prompt_dicts] + encodings = self.tokenizer( + prompts, + truncation=True, + padding=True, + return_tensors="pt", + ) + completions: list[str] = sample( + encodings, + self.config, + self.model, + self.tokenizer, + num_return_sequences=1, + ) + trunc = functools.partial(truncate, only_local_scope=local_scope_truncate) + truncations: list[str] = [ + templates[i] + trunc(completions[i]) for i in range(len(completions)) + ] + return truncations
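Note: PromptModel.generate_programs above consumes {"prompt", "template"} pairs: the model completes the prompt, the completion is truncated to the local scope, and the template is prepended to form the new program. A minimal sketch of that flow with the model call stubbed out (the completion string and the truncation helper are stand-ins, not the openelm.codegen implementations):

    # The instruction text reuses INSTRUCTIONS[2] from the Sodarace environment above.
    prompt_dict = {
        "prompt": "#Create a new walker by modifying the starting function above.\ndef make_walker():\n",
        "template": "def make_walker():\n",
    }

    # Stand-in for a completion returned by sample().
    completion = (
        "    wc = walker_creator()\n"
        "    return wc.get_walker()\n"
        "print('top-level text the model rambled on with')\n"
    )

    def truncate_local_scope(text: str) -> str:
        # Stand-in for openelm.codegen.truncate(only_local_scope=True): keep only
        # the lines that are still indented, i.e. inside the function body.
        kept = []
        for line in text.splitlines(keepends=True):
            if line.strip() and not line.startswith((" ", "\t")):
                break
            kept.append(line)
        return "".join(kept)

    new_program = prompt_dict["template"] + truncate_local_scope(completion)
    print(new_program)
    # def make_walker():
    #     wc = walker_creator()
    #     return wc.get_walker()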
+ """ + prompts = [prompt_dict["prompt"] for prompt_dict in prompt_dicts] + templates = [prompt_dict["template"] for prompt_dict in prompt_dicts] + encodings = self.tokenizer( + prompts, + truncation=True, + padding=True, + return_tensors="pt", + ) + completions: list[str] = sample( + encodings, + self.config, + self.model, + self.tokenizer, + num_return_sequences=1, + ) + trunc = functools.partial(truncate, only_local_scope=local_scope_truncate) + truncations: list[str] = [ + templates[i] + trunc(completions[i]) for i in range(len(completions)) + ] + return truncations + + +class DiffModel(PromptModel): + def __init__(self, config: ModelConfig) -> None: + super().__init__(config) + + def generate_programs( + self, prompt_dicts: list[dict[str, str]], local_scope_truncate: bool, **kwargs + ) -> list[str]: + prompts = [prompt_dict["prompt"] for prompt_dict in prompt_dicts] + templates = [prompt_dict["template"] for prompt_dict in prompt_dicts] + encodings = self.tokenizer( + prompts, + truncation=True, + padding=True, + return_tensors="pt", + ) + completions: list[str] = sample( + encodings, + self.config, + self.model, + self.tokenizer, + num_return_sequences=1, + ) + + end_of_diff = re.compile("\n[^ +-@]+") + trunc = functools.partial(truncate, only_local_scope=local_scope_truncate) + truncations: list[str] = [ + templates[i] + trunc(completions[i]) for i in range(len(completions)) + ] + outputs: list[str] = [] + for i, code in enumerate(truncations): + # split the diff text according to , , , . + parsed: dict = split_diff(code) + # truncate the diff hunk at the first line not starting with " ", + # "+", "-", or "@". + if parsed and all( + (s in parsed for s in ["name", "file", "message", "diff"]) + ): + diff_hunk: str = end_of_diff.split(parsed["diff"])[0] + nme_idx: int = diff_hunk.find("") + if nme_idx != -1: + diff_hunk = diff_hunk[:nme_idx] + outputs.append(apply_diff(prompts[i], diff_hunk)) + return outputs diff --git a/src/openelm/sandbox/server/index.py b/src/openelm/sandbox/server/index.py index 09a18dd7..4d4ac4d1 100644 --- a/src/openelm/sandbox/server/index.py +++ b/src/openelm/sandbox/server/index.py @@ -2,7 +2,7 @@ from numpy import ndarray from .environments.walker.walk_creator import Walker -from .sandbox_codex_execute import unsafe_execute +from .sandbox_codex_execute import ExecResult, unsafe_execute app = Flask(__name__) @@ -11,9 +11,11 @@ def bad_request(message, **kwargs): return {"message": message, **kwargs}, 500 -def generate_racer(code_str, timeout): +def generate_racer(code_str: str, timeout: float): try: - execution_result = unsafe_execute(code_str, "make_walker", timeout) + execution_result = unsafe_execute( + code_str, func_name="make_walker", timeout=timeout + ) except Exception: return bad_request( "Failed to execute code", unsafe_execute_error_code=6 @@ -32,9 +34,10 @@ def generate_racer(code_str, timeout): walker=execution_result.to_dict(), unsafe_execute_error_code=1, ) - elif isinstance(execution_result, int): + elif isinstance(execution_result, ExecResult): return bad_request( - "Failed sandbox_unsafe_execute", unsafe_execute_error_code=execution_result + "Failed sandbox_unsafe_execute", + unsafe_execute_error_code=execution_result.name, ) else: return bad_request( @@ -57,20 +60,22 @@ def gen_racer(): @app.route("/eval_imageoptim_func", methods=["POST"]) def evaluate_function(): - req_json = request.get_json() + req_json: dict = request.get_json() try: execution_result = unsafe_execute( - req_json["code"], req_json["func_name"], req_json["timeout"] + 
code_str=req_json["code"], + func_name=req_json["func_name"], + timeout=req_json["timeout"], ) if isinstance(execution_result, ndarray): return { "program_str": req_json["code"], "result_obj": execution_result.tolist().__repr__(), }, 200 - elif isinstance(execution_result, int): + elif isinstance(execution_result, ExecResult): return bad_request( "Failed sandbox_unsafe_execute", - unsafe_execute_error_code=execution_result, + unsafe_execute_error_code=execution_result.name, ) else: bad_request( @@ -80,3 +85,24 @@ def evaluate_function(): return bad_request( "Failed to execute code", unsafe_execute_error_code=6 ) # 6: Other errors. + + +@app.route("/eval_p3_solution", methods=["POST"]) +def evaluate_p3_solution(): + req_json = request.get_json() + try: + execution_result = unsafe_execute( + req_json["code"], req_json["func_name"], req_json["timeout"] + ) + if isinstance(execution_result, ExecResult): + return bad_request( + "Failed sandbox_unsafe_execute", + unsafe_execute_error_code=execution_result.name, + ) + return { + "program_str": req_json["code"], + "result_obj": execution_result.__repr__(), + }, 200 + except Exception: + return bad_request("Failed to execute code", unsafe_execute_error_code=6) + # 6: Other errors. diff --git a/src/openelm/sandbox/server/sandbox_codex_execute.py b/src/openelm/sandbox/server/sandbox_codex_execute.py index e7ad0fac..9828e8fa 100644 --- a/src/openelm/sandbox/server/sandbox_codex_execute.py +++ b/src/openelm/sandbox/server/sandbox_codex_execute.py @@ -66,9 +66,18 @@ def unsafe_execute( exec(code_str, code_dct) if ground_truth is None: if args is None: - return code_dct[func_name]() + result = code_dct[func_name]() elif args is not None: - return code_dct[func_name](**args) + result = code_dct[func_name](**args) + + # Multiprocessing.pool.map + # (in utils.code_eval.pool_exec_processes()) + # cannot return 'generators' + # (this may not catch all 'invalid' generator uses) + if isinstance(result, range): + result = list(result) + + return result elif ground_truth is not None: if all( [ @@ -76,7 +85,7 @@ def unsafe_execute( for arguments, res in ground_truth.items() ] ): - return 0 + return ExecResult(0) else: return ExecResult(1) except Exception as e: diff --git a/src/openelm/utils/__init__.py b/src/openelm/utils/__init__.py index e69de29b..4b58d3fe 100644 --- a/src/openelm/utils/__init__.py +++ b/src/openelm/utils/__init__.py @@ -0,0 +1,12 @@ +from openelm.utils.code_eval import eval_completions, mutate_code, pool_exec_processes +from openelm.utils.diff_eval import apply_diff, split_diff +from openelm.utils.utils import validate_config + +__all__ = [ + "pool_exec_processes", + "eval_completions", + "mutate_code", + "apply_diff", + "split_diff", + "validate_config", +] diff --git a/src/openelm/utils/code_eval.py b/src/openelm/utils/code_eval.py index 6f8c3b6f..d6dc1eb2 100644 --- a/src/openelm/utils/code_eval.py +++ b/src/openelm/utils/code_eval.py @@ -3,6 +3,8 @@ import multiprocessing as mp from typing import Any, Iterable, Optional, Union +import numpy as np + from openelm.sandbox.server.sandbox_codex_execute import ExecResult, unsafe_execute @@ -144,3 +146,46 @@ def parity_reference(b1, b2, b3, b4): def quadratic(a, b, c, x): """Return quadratic: a,b,c are coefficients and x is the independent variable.""" return a * x**2 + b * x + c + + +def pass_at_k(n, c, k): + """ + :param n: total number of samples + :param c: number of correct samples + :param k: k in pass@k + """ + if n - c < k: return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, 
n + 1)) + +def type_check(typ, obj): + """ + Checks the object is the correct type. Supports only bool, int, float, str, and (possibly nested) lists of these + + From: https://github.com/microsoft/PythonProgrammingPuzzles/blob/v0.2/puzzle_generator.py + """ + type_s = type_str(typ) # convert to string if necessary + + nest_depth = type_s.count("List") + assert type_s.count("[") == nest_depth, "type_check only supports List for now, no Sets, Dicts, Tuples, ..." + + assert type_s.startswith("List[" * nest_depth) and type_s.endswith("]" * nest_depth) + base_type = {"bool": bool, "int": int, "float": float, "str": str}[type_s[5 * nest_depth:len(type_s) - nest_depth]] + + def helper(depth, o): + if depth == 0: + return type(o) is base_type + else: + return type(o) is list and all(helper(depth - 1, i) for i in o) + + return helper(nest_depth, obj) + +def type_str(ty: type) -> str: + """ + Convert type ty to string. + :param ty: str, typing.List[int] , typing.List[typing.List[bool]], etc. + :return: string form of type, "str", "List[int]" , "List[List[bool]]", etc. + + From: https://github.com/microsoft/PythonProgrammingPuzzles/blob/v0.2/puzzle_generator.py + """ + type_str = str(ty).replace("typing.", "") + return type_str[8:-2] if type_str.startswith("