Release/0.2.1 (#45)

* Check for ExecResult error in sandbox (#44)
* Big refactor
* Fix env bugs
* Refactor diff model
* Update image evo environment
* P3 prompting (#41)
* Check for ExecResult error in sandbox
* Add PromptMutationForP3
* Add p3 run script
* Add long prompt; Prompt mutation v0.2 workflow
* pass@k
* Update to new API and configs
---------
Co-authored-by: Herbie Bradley <[email protected]>
* Improvements to integrate P3
* Benchmark config fix
---------
Co-authored-by: Daniel Scott <[email protected]>
CarperAI · Mar 8, 2023 · e6402a0 · e6402a0
1 parent 98e5c22
commit e6402a0
Show file tree

Hide file tree

Showing 25 changed files with 1,157 additions and 715 deletions.
diff --git a/.gitignore b/.gitignore
@@ -135,6 +135,7 @@ logs/
 archive/
 data/
 *.out
+outputs
 
 # generated dot files and tree graphs
 .gv

diff --git a/run_elm.py b/run_elm.py
@@ -1,36 +1,32 @@
 """
 This module gives an example of how to run the main ELM class.
 
-It uses the hydra library to load the config from the
-config/elm_sodarace_cfg.yaml file.
+It uses the hydra library to load the config from the config dataclasses in
+configs.py.
 
 This config file demonstrates an example of running ELM with the Sodarace
 environment, a 2D physics-based environment in which robots specified by
 Python dictionaries are evolved over.
 
 """
 import hydra
-from hydra.core.config_store import ConfigStore
 from omegaconf import OmegaConf
 
 from openelm import ELM
-from openelm.configs import SodaraceELMConfig
 
-cs = ConfigStore.instance()
-cs.store(name="config", node=SodaraceELMConfig)
 
-
-# Load hydra config from yaml files and command line arguments.
 @hydra.main(
-    config_name="config",
+    config_name="elmconfig",
     version_base="1.2",
 )
-def main(cfg):
+def main(config):
     print("----------------- Config ---------------")
-    print(OmegaConf.to_yaml(cfg))
+    print(OmegaConf.to_yaml(config))
     print("-----------------  End -----------------")
-    elm = ELM(cfg)
-    print("Best Individual: ", elm.run())
+    config = OmegaConf.to_object(config)
+    elm = ELM(config)
+    print("Best Individual: ", elm.run(init_steps=config.qd.init_steps,
+                                       total_steps=config.qd.total_steps))
 
 
 if __name__ == "__main__":

diff --git a/run_p3.py b/run_p3.py
@@ -0,0 +1,176 @@
+import logging
+import pathlib
+import requests
+import time
+import json
+from collections import Counter
+
+from openelm.environments import p3_long_init_args, p3_med_init_args, P3Problem
+from openelm.mutation_model import DiffModel, MutationModel, PromptModel
+from openelm.configs import P3Config
+from openelm.sandbox.server.sandbox_codex_execute import ExecResult
+from openelm.utils.code_eval import pass_at_k
+from openelm.codegen.codegen_utilities import set_seed
+
+import hydra
+from omegaconf import OmegaConf
+
+
+class P3:
+    """Evaluate a mutation model on the P3 (Python Programming Puzzles) dataset.
+
+    ``run`` samples candidate solutions for every puzzle and writes one
+    ``results.json`` per puzzle under ``self.log_dir``; ``eval_pass_at_k``
+    reads those files back and averages pass@k over the logged puzzles.
+    """
+
+    def __init__(self, cfg: P3Config) -> None:
+        """
+        Evaluate models on P3 dataset
+
+        Args:
+            cfg (P3Config): run configuration; this class reads
+                ``cfg.env.prompt_size``, ``cfg.env.solutions_per_problem``,
+                ``cfg.model.model_name``, ``cfg.model.batch_size`` and
+                ``cfg.save_result_obj``.
+
+        Raises:
+            ValueError: if ``cfg.env.prompt_size`` is neither 'long' nor 'med'.
+        """
+        self.cfg: P3Config = cfg
+
+        # Prompt size
+        if cfg.env.prompt_size == 'long':
+            env_args = p3_long_init_args
+        elif cfg.env.prompt_size == 'med':
+            env_args = p3_med_init_args
+        else:
+            raise ValueError('No init args found')
+
+        # Model
+        # NOTE(review): any other model_name leaves `mutation_model` unset, so
+        # `run` would fail with AttributeError; left as-is because
+        # `eval_pass_at_k` does not need a model.
+        if self.cfg.model.model_name == 'prompt':
+            self.mutation_model: MutationModel = PromptModel(self.cfg.model)
+        elif self.cfg.model.model_name == 'diff':
+            self.mutation_model: MutationModel = DiffModel(self.cfg.model)
+
+        self.seed = env_args["seed"]
+        self.log_dir = 'logs/p3/problems'
+
+    def run(self):
+        """
+        Query the mutation model for solutions to programming puzzles
+
+        Downloads the v0.2 puzzle list, samples
+        ``solutions_per_problem // batch_size`` batches per puzzle, scores
+        every sampled solution and writes the outcome to
+        ``<log_dir>/<puzzle name>/<run start time>/results.json``.
+        A puzzle whose sampling raises is logged, counted and skipped.
+
+        Raises:
+            requests.RequestException: if the puzzle list cannot be downloaded.
+        """
+        # Get problems
+        response = requests.get(
+            "https://raw.githubusercontent.com/microsoft/PythonProgrammingPuzzles"
+            "/v0.2/puzzles/puzzles.json",
+            timeout=60,  # never hang forever on a stalled connection
+        )
+        response.raise_for_status()  # fail loudly instead of parsing an error page
+        problems = response.json()
+
+        # Loop-invariant; computing it up front also surfaces a zero
+        # batch_size as an immediate error rather than a silent skip.
+        num_queries = self.cfg.env.solutions_per_problem // self.cfg.model.batch_size
+
+        run_start_time = time.time()
+        num_problem_errors = 0
+        for problem in problems:
+            problem_start_time = time.time()
+            problem_dict = {'name': problem['name']}
+            logging.info(problem['name'])
+
+            # prompt form is f6(), solution form is g6()
+            problem_func = problem['sat'].replace('def sat(', 'def f6(')
+            solution_preamble = problem['sol_header'].replace('def sol(', 'def g6(')
+            if self.cfg.env.prompt_size == 'long':
+                solution_preamble = solution_preamble + '\n' + problem['sol_docstring']
+
+            env = P3Problem(seed=self.seed,
+                            config=self.cfg,
+                            mutation_model=self.mutation_model,
+                            problem_func=problem_func,
+                            solution_preamble=solution_preamble,
+                            ans_type=problem['ans_type'])
+
+            # Find solutions
+            # If there is an error during finding a solution, log it and skip this problem
+            solutions = []
+            try:
+                for i in range(num_queries):
+                    set_seed(i)  # Change seed for each query
+                    solutions += env.random()
+            except Exception as e:
+                logging.error(f'ERROR with solution {i} in {problem["name"]}: {e}')
+                num_problem_errors += 1
+                continue
+
+            # Evaluate fitness of solutions
+            res_sols_list = []
+            solved = False
+            for sol in solutions:
+                res_sol_dict = {'program_str': sol.program_str}
+
+                if isinstance(sol.result_obj, ExecResult):
+                    # An ExecResult in place of a value is treated as a failed
+                    # execution: store its name and score it zero.
+                    result_obj = sol.result_obj.name
+                    fitness = 0.0
+                else:
+                    result_obj = sol.result_obj
+                    fitness = env.fitness(sol)
+
+                if self.cfg.save_result_obj:
+                    res_sol_dict['result_obj'] = result_obj
+                res_sol_dict['fitness'] = fitness
+                res_sols_list.append(res_sol_dict)
+                if fitness == 1.0:
+                    solved = True  # just want to save if solved at all
+
+            problem_dict['config'] = OmegaConf.to_container(self.cfg)
+            problem_dict['solutions'] = res_sols_list
+            problem_dict['solved'] = solved
+            problem_dict['time_elapsed'] = time.time() - problem_start_time
+
+            # Save results
+            out_dir = pathlib.Path(self.log_dir) / problem_dict['name'] / str(run_start_time)
+            out_dir.mkdir(parents=True, exist_ok=True)
+
+            with open(out_dir / 'results.json', 'w') as file:
+                json.dump(problem_dict, file)
+
+        # Report successes over total (the two numbers used to be swapped).
+        logging.info(f'Successfully ran on {len(problems)-num_problem_errors}/{len(problems)}' +
+                        f' problems and saved results to {self.log_dir}')
+
+    def eval_pass_at_k(self, timestamp: str, k: int):
+        """
+        pass@k metric over a subset of run logs
+
+        Args:
+            timestamp (str): (optional) go through all problems with a run generated with timestamp
+                (if None or empty, go through the latest run for every problem currently in logs)
+            k (int): k for pass@k
+
+        Returns:
+            The mean of the per-problem pass@k values.
+
+        Raises:
+            ValueError: if no problems are logged under ``self.log_dir``.
+        """
+        log_root = pathlib.Path(self.log_dir)
+        problem_paths = sorted(log_root.iterdir())  # Get all logged problems
+        paks = []
+        for problem_path in problem_paths:
+            # Select one of the runs per problem
+            if not timestamp:
+                # Get latest run: run directories are named by start time, so
+                # the last one in sorted order is taken as the most recent.
+                run_path = sorted(problem_path.iterdir())[-1]
+            else:
+                # Get 'timestamp' run
+                run_path = problem_path / timestamp
+
+            with open(run_path / 'results.json', 'r') as f:
+                results = json.load(f)
+
+            # n = number of sampled solutions, c = number scoring a perfect 1.0
+            n = len(results['solutions'])
+            c = sum(1 for sol in results['solutions'] if sol['fitness'] == 1.0)
+            paks.append(pass_at_k(n=n, c=c, k=k))
+
+        if not paks:
+            raise ValueError(f'No logged problems found in {self.log_dir}')
+
+        pak_overall = sum(paks) / len(paks)
+        return pak_overall
+
+
+# Load hydra config from yaml files and command line arguments.
+@hydra.main(version_base="1.2", config_name="p3config")
+def main(cfg):
+    """Log the resolved config, then either score existing logs or start a run.
+
+    A positive ``cfg.eval_k`` selects evaluation: pass@k is computed from the
+    saved logs (for ``cfg.eval_timestamp``) and logged. Otherwise a fresh P3
+    run is started.
+    """
+    # Echo the full config so every run log records its own settings.
+    logging.info("----------------- Config ---------------")
+    logging.info(OmegaConf.to_yaml(cfg))
+    logging.info("-----------------  End -----------------")
+    evaluator = P3(cfg)
+
+    if cfg.eval_k <= 0:
+        evaluator.run()
+        return
+
+    logging.info(f"PASS@K: {evaluator.eval_pass_at_k(timestamp=cfg.eval_timestamp, k=cfg.eval_k)}")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/src/openelm/__init__.py b/src/openelm/__init__.py
@@ -3,3 +3,5 @@
 from openelm.elm import ELM
 
 __version__ = importlib_version("openelm")
+
+__all__ = ["ELM"]
diff --git a/src/openelm/benchmarks/benchmark_bugs.py b/src/openelm/benchmarks/benchmark_bugs.py
@@ -13,8 +13,7 @@
 
 from openelm.codegen import model_setup, sample, truncate
 from openelm.configs import BaseConfig
-from openelm.utils.code_eval import eval_completions, mutate_code
-from openelm.utils.diff_eval import apply_diff, split_diff
+from openelm.utils import apply_diff, eval_completions, mutate_code, split_diff
 
 
 @dataclass
@@ -54,7 +53,7 @@ def __init__(self, cfg: BenchmarkBugsConfig):
 
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-        self.device = torch.device("cuda" if cfg.cuda else "cpu")
+        self.device = torch.device("cuda")
         self.model, self.tokenizer, self.device = model_setup(cfg, self.device)
 
     def benchmark_parity(self, n_bugs, **kwargs):

diff --git a/src/openelm/benchmarks/benchmark_crossover.py b/src/openelm/benchmarks/benchmark_crossover.py
@@ -86,7 +86,7 @@ def __init__(self, cfg: BenchmarkCrossoverConfig):
 
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-        self.device = torch.device("cuda" if cfg.cuda else "cpu")
+        self.device = torch.device("cuda")
         self.model, self.tokenizer, self.device = model_setup(cfg, self.device)
 
     def construct_prompt(self, seeds):

diff --git a/src/openelm/benchmarks/benchmark_lm_speed.py b/src/openelm/benchmarks/benchmark_lm_speed.py
@@ -8,11 +8,12 @@
 from tqdm import trange
 
 from openelm.codegen import model_setup, sample
+from openelm.configs import BaseConfig
 from openelm.environments import SQUARE_SEED
 
 
 @dataclass
-class BenchmarkSpeedConfig:
+class BenchmarkSpeedConfig(BaseConfig):
     hydra: Any = field(
         default_factory=lambda: {
             "run": {"dir": "logs/benchmarks/lm_speed/${now:%Y-%m-%d-%H-%M-%S}"}

diff --git a/src/openelm/codegen/codegen_utilities.py b/src/openelm/codegen/codegen_utilities.py
@@ -1,11 +1,14 @@
 import os
 import random
 import re
+from typing import Optional
 
 import numpy as np
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from openelm.configs import ModelConfig
+
 
 def set_seed(seed=None, deterministic=True) -> int:
     if seed is None:
@@ -66,37 +69,46 @@ def find_re(string, pattern, start_pos):
         return completion
 
 
-def model_setup(cfg, device=None):
+def model_setup(cfg: ModelConfig, device=None, codegen_tokenizer: bool = True):
     set_seed(cfg.seed, deterministic=True)
     if device is None:
-        device = torch.device("cuda" if cfg.cuda else "cpu")
+        device = torch.device("cuda")
     use_fp16 = True
     if not cfg.fp16 or device.type == "cpu":
         use_fp16 = False
 
-    if cfg.model.startswith("codegen-16B"):
+    if "codegen-16B" in cfg.model_path:
         use_fp16 = True
 
-    tokenizer = AutoTokenizer.from_pretrained(cfg.model)
-    tokenizer.padding_side = "left"
-    tokenizer.pad_token = 50256
+    tokenizer = AutoTokenizer.from_pretrained(cfg.model_path)
+    if codegen_tokenizer:
+        tokenizer.padding_side = "left"
+        tokenizer.pad_token = 50256
 
-    model_path = cfg.model
     if cfg.gpus > 1:
         model = torch.nn.DataParallel(
-            create_model(model_path, fp16=use_fp16), device_ids=list(range(cfg.gpus))
+            create_model(cfg.model_path, fp16=use_fp16),
+            device_ids=list(range(cfg.gpus)),
         ).to(device)
     else:
-        model = create_model(model_path, fp16=use_fp16).to(device)
+        model = create_model(cfg.model_path, fp16=use_fp16).to(device)
     return model, tokenizer, device
 
 
 def sample(
-    batch, cfg, model, tokenizer, decode: bool = True, starting_idx=None, **kwargs
+    batch,
+    cfg: ModelConfig,
+    model,
+    tokenizer,
+    decode: bool = True,
+    starting_idx: Optional[int] = None,
+    num_return_sequences: Optional[int] = None,
+    **kwargs
 ) -> list[str]:
     """Run a model on a batch of contexts for a particular task."""
-    batch_size = kwargs.get("batch_size", cfg.batch_size)
-    device = kwargs.get("device", torch.device("cuda" if cfg.cuda else "cpu"))
+    if num_return_sequences is None:
+        num_return_sequences = cfg.batch_size
+    device = kwargs.get("device", torch.device("cuda"))
     temperature = kwargs.get("temperature", cfg.temp)
     top_p = kwargs.get("top_p", cfg.top_p)
     gen_max_len = kwargs.get("gen_max_len", cfg.gen_max_len)
@@ -111,7 +123,7 @@ def sample(
             tokens = model.module.generate(
                 **batch,
                 do_sample=True,
-                num_return_sequences=batch_size,
+                num_return_sequences=num_return_sequences,
                 temperature=temperature,
                 max_new_tokens=gen_max_len,
                 top_p=top_p,
@@ -122,7 +134,7 @@ def sample(
             tokens = model.generate(
                 **batch,
                 do_sample=True,
-                num_return_sequences=batch_size,
+                num_return_sequences=num_return_sequences,
                 temperature=temperature,
                 max_new_tokens=gen_max_len,
                 top_p=top_p,
-Original file line number
+Diff line change
@@ Expand Up / @@ -135,6 +135,7 @@ logs/ @@
     archive/
     data/
     *.out
+    outputs
     # generated dot files and tree graphs
     .gv
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,3 +3,5 @@
		from openelm.elm import ELM

		__version__ = importlib_version("openelm")

		__all__ = ["ELM"]