RAFT fine-tuning technique with Deep Lake Dataloader #568

Open
wants to merge 3 commits into main
1 change: 1 addition & 0 deletions requirements.txt
@@ -19,3 +19,4 @@ chardet
openai
typing-extensions==4.8.0
tabulate
deeplake
11 changes: 10 additions & 1 deletion src/llama_recipes/configs/datasets.py
@@ -31,4 +31,13 @@ class custom_dataset:
    dataset: str = "custom_dataset"
    file: str = "examples/custom_dataset.py"
    train_split: str = "train"
    test_split: str = "validation"


@dataclass
class raft_dataset:
    dataset: str = "raft_dataset"
    train_split: str = "train"
    test_split: str = "validation"
    dataset_path: str = "hub://manufe/raft_format_dataset_financebench"
    val_quantity: int = 1000
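
Not part of the diff: a minimal sketch of how the new config could be overridden programmatically, assuming the package layout shown above; the val_quantity value below is a placeholder, not a recommendation.

# Illustrative sketch only (not in this PR): override raft_dataset fields the
# same way the other dataset configs are overridden; values are placeholders.
from llama_recipes.configs.datasets import raft_dataset

cfg = raft_dataset(
    dataset_path="hub://manufe/raft_format_dataset_financebench",  # Deep Lake path
    val_quantity=500,  # hold out the first 500 rows for validation
)
print(cfg.dataset, cfg.train_split, cfg.test_split)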
4 changes: 3 additions & 1 deletion src/llama_recipes/datasets/__init__.py
@@ -1,6 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.


from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
from llama_recipes.datasets.raft_dataset import InstructionRAFTDataset as get_raft_dataset
71 changes: 71 additions & 0 deletions src/llama_recipes/datasets/raft_dataset.py
@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import copy

import deeplake
import torch
from torch.utils.data import Dataset

PROMPT = """Given the instruction containing context and the question, provide the logical reasoning that led you to the answer.
Please use the format of: ##Reason: reason ##Answer: answer.
###Instruction: {instruction}\n\n### Response:"""


class InstructionRAFTDataset(Dataset):
    def __init__(self, dataset_config, tokenizer, partition="train"):
        self.tokenizer = tokenizer

        val_quantity = dataset_config.val_quantity
        ds = deeplake.load(dataset_config.dataset_path)

        if partition == "train":
            # The first `val_quantity` rows are held out for validation;
            # everything after them is used for training.
            ds = ds[val_quantity:]
        else:
            ds = ds[:val_quantity]
        self.ann = ds

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, index):
        if index < len(self):

            IGNORE_INDEX = -100  # The default ignore_index in CrossEntropyLoss

            # Read every tensor (column) of the sample as text,
            # e.g. {"instruction": "...", "cot_answer": "..."}.
            column_map = self.ann.tensors.keys()
            ann = {}
            for el in column_map:
                ann[el] = self.ann[el][index].text().astype(str)

            prompt = PROMPT.format_map(ann)

            # The full example is the prompt followed by the chain-of-thought answer.
            example = prompt + ann["cot_answer"]
            prompt = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
            example = self.tokenizer.encode(example)
            example.append(self.tokenizer.eos_token_id)
            example = torch.tensor(example, dtype=torch.int64)
            # Mask the prompt tokens so the loss is computed only on the answer.
            labels = copy.deepcopy(example)
            labels[: len(prompt)] = -1
            example_mask = example.ge(0)
            label_mask = labels.ge(0)
            example[~example_mask] = 0
            labels[~label_mask] = IGNORE_INDEX

            return {
                "input_ids": example.tolist(),
                "labels": labels.tolist(),
                "attention_mask": example_mask.tolist(),
            }
        else:
            raise IndexError
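
For reviewers, a hedged usage sketch of the class added above; the tokenizer checkpoint is an assumption, and loading the dataset requires network access to the Deep Lake hub path set in the config.

# Sketch, not part of the diff: instantiate InstructionRAFTDataset directly.
# The tokenizer checkpoint below is assumed, not prescribed by this PR.
from transformers import AutoTokenizer

from llama_recipes.configs.datasets import raft_dataset
from llama_recipes.datasets.raft_dataset import InstructionRAFTDataset

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
config = raft_dataset()

train_set = InstructionRAFTDataset(config, tokenizer, partition="train")
sample = train_set[0]
print(len(train_set), len(sample["input_ids"]), len(sample["labels"]))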
2 changes: 2 additions & 0 deletions src/llama_recipes/utils/dataset_utils.py
@@ -11,6 +11,7 @@
    get_grammar_dataset,
    get_alpaca_dataset,
    get_samsum_dataset,
    get_raft_dataset,
)


@@ -54,6 +55,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
"grammar_dataset": get_grammar_dataset,
"samsum_dataset": get_samsum_dataset,
"custom_dataset": get_custom_dataset,
"raft_dataset": get_raft_dataset,
}


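
With the "raft_dataset" key registered in DATASET_PREPROC, the dataset should also be reachable through the existing get_preprocessed_dataset helper; the call below assumes the current (tokenizer, dataset_config, split) signature in llama_recipes.utils.dataset_utils and is only a sketch.

# Sketch assuming the existing helper signature in dataset_utils.py;
# verify against the codebase before relying on it.
from transformers import AutoTokenizer

from llama_recipes.configs.datasets import raft_dataset
from llama_recipes.utils.dataset_utils import get_preprocessed_dataset

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed checkpoint
dataset_config = raft_dataset()

train_set = get_preprocessed_dataset(tokenizer, dataset_config, split="train")
eval_set = get_preprocessed_dataset(tokenizer, dataset_config, split="test")
print(len(train_set), len(eval_set))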