From a535a2b80c02fcb62a4248cc3b4dbae36766bfb7 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 09:16:49 +0200
Subject: [PATCH 1/3] raft dataset

---
 src/llama_recipes/configs/datasets.py      | 11 +++-
 src/llama_recipes/datasets/__init__.py     |  4 +-
 src/llama_recipes/datasets/raft_dataset.py | 72 ++++++++++++++++++++++
 src/llama_recipes/utils/dataset_utils.py   |  2 +
 4 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 src/llama_recipes/datasets/raft_dataset.py

diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py
index 0c41d0a4d..b5e0dc8bd 100644
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -31,4 +31,13 @@ class custom_dataset:
     dataset: str = "custom_dataset"
     file: str = "examples/custom_dataset.py"
     train_split: str = "train"
-    test_split: str = "validation"
\ No newline at end of file
+    test_split: str = "validation"
+
+
+@dataclass
+class raft_dataset:
+    dataset: str = "raft_dataset"
+    train_split: str = "train"
+    test_split: str = "validation"
+    dataset_path: str = "hub://manufe/raft_format_dataset_finance"
+    val_quantity: int = 1000

diff --git a/src/llama_recipes/datasets/__init__.py b/src/llama_recipes/datasets/__init__.py
index 57d237688..490848a7f 100644
--- a/src/llama_recipes/datasets/__init__.py
+++ b/src/llama_recipes/datasets/__init__.py
@@ -1,6 +1,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 
 from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
 from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
-from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
\ No newline at end of file
+from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
+from llama_recipes.datasets.raft_dataset import InstructionRAFTDataset as get_raft_dataset

diff --git a/src/llama_recipes/datasets/raft_dataset.py b/src/llama_recipes/datasets/raft_dataset.py
new file mode 100644
index 000000000..ec5119c53
--- /dev/null
+++ b/src/llama_recipes/datasets/raft_dataset.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from torch.utils.data import Dataset
+import copy
+import json
+import tqdm
+import torch
+from torch.utils.data import Dataset
+import deeplake
+
+PROMPT = """Given the instruction containing context and the question, provide the logical reasoning that led you to the answer.
+    Please use the format of: ##Reason: reason ##Answer: answer. 
+    ###Instruction: {instruction}\n\n### Response:"""
+
+
+class InstructionRAFTDataset(Dataset):
+    def __init__(self, dataset_config, tokenizer, partition="train"):
+        self.tokenizer = tokenizer
+
+        val_quantity = dataset_config.val_quantity
+        ds = deeplake.dataset(dataset_config.dataset_path)
+
+        if partition == "train":
+            ds = ds[val_quantity:]
+            self.ann = ds
+        else:
+            ds = ds[:val_quantity]
+            self.ann = ds
+
+    def __len__(self):
+        return len(self.ann)
+
+    def __getitem__(self, index):
+        if index < len(self):
+
+            IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
+
+            column_map = self.ann.tensors.keys()
+            ann = {}
+            for el in column_map:  # {"column_name" : "value"}
+                ann[el] = self.ann[el][index].text().astype(str)
+
+            prompt = PROMPT.format_map(ann)
+
+            example = prompt + ann["cot_answer"]
+            prompt_ids = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
+            example_ids = self.tokenizer.encode(example)
+            example_ids.append(self.tokenizer.eos_token_id)
+            example_ids = torch.tensor(example_ids, dtype=torch.int64)
+
+            labels = copy.deepcopy(example_ids)
+            labels[: len(prompt_ids)] = -1
+
+            example_mask = example_ids.ge(0)
+            label_mask = labels.ge(0)
+
+            example_ids[~example_mask] = 0
+            labels[~label_mask] = IGNORE_INDEX
+
+            return {
+                "input_ids": example_ids.tolist(),
+                "labels": labels.tolist(),
+                "attention_mask": example_mask.tolist(),
+            }
+        else:
+            raise IndexError
diff --git a/src/llama_recipes/utils/dataset_utils.py b/src/llama_recipes/utils/dataset_utils.py
index 47baeefe0..1e7226965 100644
--- a/src/llama_recipes/utils/dataset_utils.py
+++ b/src/llama_recipes/utils/dataset_utils.py
@@ -11,6 +11,7 @@
     get_grammar_dataset,
     get_alpaca_dataset,
     get_samsum_dataset,
+    get_raft_dataset
 )
 
 
@@ -54,6 +55,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
     "grammar_dataset": get_grammar_dataset,
     "samsum_dataset": get_samsum_dataset,
     "custom_dataset": get_custom_dataset,
+    "raft_dataset": get_raft_dataset,
 }

From 8d7699e24923b9c55e1471874fc28c6b6dd44f19 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 13:40:31 +0200
Subject: [PATCH 2/3] Deep Lake package

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index df2c66fd7..6ad082b7c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ chardet
 openai
 typing-extensions==4.8.0
 tabulate
+deeplake
\ No newline at end of file

From a7777fe7acf7558878070d2133bea8342b576722 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 14:59:16 +0200
Subject: [PATCH 3/3] changed variable names

---
 src/llama_recipes/configs/datasets.py      |  2 +-
 src/llama_recipes/datasets/raft_dataset.py | 25 +++++++++++-----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py
index b5e0dc8bd..5c1c2c80f 100644
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -39,5 +39,5 @@ class raft_dataset:
     dataset: str = "raft_dataset"
     train_split: str = "train"
     test_split: str = "validation"
-    dataset_path: str = "hub://manufe/raft_format_dataset_finance"
+    dataset_path: str = "hub://manufe/raft_format_dataset_financebench"
     val_quantity: int = 1000

diff --git a/src/llama_recipes/datasets/raft_dataset.py b/src/llama_recipes/datasets/raft_dataset.py
index ec5119c53..aaecb3aee 100644
--- a/src/llama_recipes/datasets/raft_dataset.py
+++ b/src/llama_recipes/datasets/raft_dataset.py
@@ -24,7 +24,7 @@ def __init__(self, dataset_config, tokenizer, partition="train"):
         self.tokenizer = tokenizer
 
         val_quantity = dataset_config.val_quantity
-        ds = deeplake.dataset(dataset_config.dataset_path)
+        ds = deeplake.load(dataset_config.dataset_path)
 
         if partition == "train":
             ds = ds[val_quantity:]
@@ -49,22 +49,21 @@ def __getitem__(self, index):
             prompt = PROMPT.format_map(ann)
 
             example = prompt + ann["cot_answer"]
-            prompt_ids = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
-            example_ids = self.tokenizer.encode(example)
-            example_ids.append(self.tokenizer.eos_token_id)
-            example_ids = torch.tensor(example_ids, dtype=torch.int64)
-
-            labels = copy.deepcopy(example_ids)
-            labels[: len(prompt_ids)] = -1
-
-            example_mask = example_ids.ge(0)
+            prompt = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
+            example = self.tokenizer.encode(example)
+            example.append(self.tokenizer.eos_token_id)
+            example = torch.tensor(
+                example, dtype=torch.int64
+            )
+            labels = copy.deepcopy(example)
+            labels[: len(prompt)] = -1
+            example_mask = example.ge(0)
             label_mask = labels.ge(0)
-
-            example_ids[~example_mask] = 0
+            example[~example_mask] = 0
             labels[~label_mask] = IGNORE_INDEX
 
             return {
-                "input_ids": example_ids.tolist(),
+                "input_ids": example.tolist(),
                 "labels": labels.tolist(),
                 "attention_mask": example_mask.tolist(),
             }
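
Usage note (not part of the patches above): a minimal sketch of how the new dataset can be selected once the series is applied. It goes through get_preprocessed_dataset, the existing llama-recipes helper that dispatches on dataset_config.dataset via the DATASET_PREPROC map extended in PATCH 1/3; the tokenizer checkpoint below is an assumption for illustration, not something this series pins down.

    # Sketch, assuming the three patches above are applied and deeplake is installed.
    from transformers import AutoTokenizer

    from llama_recipes.configs.datasets import raft_dataset
    from llama_recipes.utils.dataset_utils import get_preprocessed_dataset

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed checkpoint
    dataset_config = raft_dataset()  # defaults from configs/datasets.py: dataset_path, val_quantity=1000
    train_ds = get_preprocessed_dataset(tokenizer, dataset_config, split="train")

    sample = train_ds[0]  # dict with input_ids, labels, attention_mask
    print(len(sample["input_ids"]))

With val_quantity=1000, the first 1000 Deep Lake rows become the validation split and the rest the train split. Prompt tokens are set to -1 and then mapped to IGNORE_INDEX (-100) in labels, so the loss is computed only on the chain-of-thought answer.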