Add open_orca preprocessing steps (#1664)

* Add open_orca preprocessing steps * Typo
mlcommons · May 21, 2024 · f274a55 · f274a55
1 parent e6a493d
commit f274a55
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 2 deletions.
diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
@@ -112,7 +112,7 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/open_orca ./open_orca -P
 
 ### Unprocessed
 
-You can also download and process the dataset yourself as follows:
+You can also download and process the dataset yourself by following the commands below:
 
 ```
 # First get the `open-orca` parquet from huggingface
@@ -129,6 +129,12 @@ python3 processorca.py --dataset_pq_path=${OPENORCA_PARQUET} --model_dir=${CHECK
 mv ${EXPORT_DIR}/open_orca_gpt4_tokenized_llama.sampled_24576.pkl ${DATASET_PATH}
 ```
 
+The script will perform the following steps on the original open_orca GPT4 dataset:
+- filter out all queries with non-ascii characters, except for normal unicode quotes and hyphens.
+- filter out all queries with out-of-bound input/output sequence lengths
+- filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
+- filter out all queries with prompts that generate bad output texts using Llama2 models
+- sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) and form the final dataset.
 
 ## Run Performance Benchmarks
 

diff --git a/language/llama2-70b/processorca.py b/language/llama2-70b/processorca.py
@@ -24,6 +24,15 @@
 from transformers import LlamaTokenizerFast
 from typing import Dict
 
+__doc__ = """
+This script takes the open_orca GPT4 dataset parquet and performs the following preprocessing and filtering steps:
+1. filter out all queries with non-ascii characters, except for normal unicode quotes and hyphens.
+2. filter out all queries with out-of-bound input/output sequence lengths
+3. filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
+4. filter out all queries with prompts that generate bad output texts using Llama2 models
+5. sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) and form the final dataset.
+"""
+
 llama_prompt_system = "<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]"
 llama_prompt_no_system = "<s>[INST] {} [/INST]"
 
@@ -105,7 +114,7 @@ def filter_seqlen_oob(self, df: pd.DataFrame) -> pd.DataFrame:
         df['tok_input_length'] = df['tok_input'].apply(lambda x: len(x))
         df['tok_output_length'] = df['tok_output'].apply(lambda x: len(x))
 
-        # Filter based on sequence length (2048, 2048)
+        # Filter based on sequence length
         df = df[df["tok_input_length"] < self.io_token_limit]
         df = df[df["tok_output_length"] < self.io_token_limit]
         return df.reset_index(drop=True)