Skip to content

Commit

Permalink
Added fixed_split helper, .gitignore, and a script to download the MMLU-Pro dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
heyjustinai committed Jan 29, 2025
1 parent 8d3a047 commit e19b9e9
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 1 deletion.
1 change: 1 addition & 0 deletions end-to-end-use-cases/prompt-migration/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/mmlu_pro_data
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from datasets import load_dataset
import pandas as pd
import os

def download_mmlu_pro(output_dir: str = "mmlu_pro_data") -> None:
    """Download the TIGER-Lab/MMLU-Pro dataset and save each split as CSV.

    Args:
        output_dir: Directory to write the CSV files into (created if
            missing). Defaults to "mmlu_pro_data", matching the path
            ignored by the repository's .gitignore.

    Side effects:
        Writes one ``mmlu_pro_<split>.csv`` file per dataset split and
        prints a progress line for each.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load the dataset from the Hugging Face Hub (requires network access)
    dataset = load_dataset("TIGER-Lab/MMLU-Pro")

    # Convert each split (e.g. "test", "validation") to CSV
    for split in dataset.keys():
        # Convert the split to a pandas DataFrame for easy CSV export
        df = pd.DataFrame(dataset[split])

        # index=False: row numbers are not meaningful, keep the CSV clean
        output_path = os.path.join(output_dir, f"mmlu_pro_{split}.csv")
        df.to_csv(output_path, index=False)
        print(f"Saved {split} split to {output_path}")
        print(f"Number of examples in {split}: {len(df)}")

def main() -> None:
    """Entry point: fetch MMLU-Pro and report progress on stdout."""
    print("Downloading MMLU-Pro dataset...")
    download_mmlu_pro()
    print("Download complete!")


if __name__ == "__main__":
    main()
28 changes: 28 additions & 0 deletions end-to-end-use-cases/prompt-migration/benchmarks/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,31 @@ def train_val_test_split(
valset=list(map(mapper, validation_docs)),
testset=list(map(mapper, test_docs)),
)


def fixed_split(
    dataset: "Dataset",
    mapper: t.Callable[[dict], "dspy.Example"],
    train_size: int = 1000,
    validation_size: int = 200,
) -> TaskDatasets:
    """Split dataset by taking the first N examples instead of random sampling.

    The first ``train_size`` rows become the train set, the next
    ``validation_size`` rows the validation set, and everything after
    that the test set.

    Args:
        dataset: Input dataset (must support ``select`` and ``len``).
        mapper: Function to map dataset examples to dspy.Example.
        train_size: Number of examples to use for training (default: 1000).
        validation_size: Number of examples to use for validation (default: 200).

    Returns:
        TaskDatasets containing train, validation and test splits.

    Raises:
        ValueError: If the dataset has fewer than
            ``train_size + validation_size`` examples, which would
            otherwise surface as an opaque index error from ``select``.
    """
    required = train_size + validation_size
    if len(dataset) < required:
        raise ValueError(
            f"fixed_split needs at least {required} examples "
            f"(train_size={train_size} + validation_size={validation_size}), "
            f"but dataset has only {len(dataset)}"
        )

    train_docs = dataset.select(range(train_size))
    validation_docs = dataset.select(range(train_size, train_size + validation_size))
    test_docs = dataset.select(range(train_size + validation_size, len(dataset)))

    return TaskDatasets(
        trainset=list(map(mapper, train_docs)),
        valset=list(map(mapper, validation_docs)),
        testset=list(map(mapper, test_docs)),
    )
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ def _task_doc_example(doc: TaskDoc) -> dspy.Example:
# Build a dspy.Example from a raw task document, renaming the dataset's
# fields to the example's question/options/answer slots.
example = dspy.Example(
    question=doc["input_question"],
    options=doc["input_choice_list"],
    # Empty placeholder — presumably the model fills in its chain-of-thought
    # at prediction time; confirm against the dspy signature used downstream.
    reasoning="",
    answer=doc["output_parsed_answer"],
)
# NOTE(review): _input_keys/_output_keys are private dspy.Example attributes;
# they appear to mark which fields are model inputs vs. expected outputs.
# "reasoning" is listed as an output so it is treated as a label field,
# not fed in as input — verify against the dspy.Example API.
example._input_keys = {"question", "options"}
example._output_keys = {"reasoning", "answer"}
return example


Expand Down

0 comments on commit e19b9e9

Please sign in to comment.