From a535a2b80c02fcb62a4248cc3b4dbae36766bfb7 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 09:16:49 +0200
Subject: [PATCH 1/3] raft dataset

---
 src/llama_recipes/configs/datasets.py      | 11 +++-
 src/llama_recipes/datasets/__init__.py     |  4 +-
 src/llama_recipes/datasets/raft_dataset.py | 72 ++++++++++++++++++++++
 src/llama_recipes/utils/dataset_utils.py   |  2 +
 4 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 src/llama_recipes/datasets/raft_dataset.py

diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py
index 0c41d0a4d..b5e0dc8bd 100644
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -31,4 +31,13 @@ class custom_dataset:
     dataset: str = "custom_dataset"
     file: str = "examples/custom_dataset.py"
     train_split: str = "train"
-    test_split: str = "validation"
\ No newline at end of file
+    test_split: str = "validation"
+
+
+@dataclass
+class raft_dataset:
+    dataset: str = "raft_dataset"
+    train_split: str = "train"
+    test_split: str = "validation"
+    dataset_path: str = "hub://manufe/raft_format_dataset_finance"
+    val_quantity: int = 1000

diff --git a/src/llama_recipes/datasets/__init__.py b/src/llama_recipes/datasets/__init__.py
index 57d237688..490848a7f 100644
--- a/src/llama_recipes/datasets/__init__.py
+++ b/src/llama_recipes/datasets/__init__.py
@@ -1,6 +1,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 
 from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
 from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
-from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
\ No newline at end of file
+from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
+from llama_recipes.datasets.raft_dataset import InstructionRAFTDataset as get_raft_dataset

diff --git a/src/llama_recipes/datasets/raft_dataset.py b/src/llama_recipes/datasets/raft_dataset.py
new file mode 100644
index 000000000..ec5119c53
--- /dev/null
+++ b/src/llama_recipes/datasets/raft_dataset.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from torch.utils.data import Dataset
+import copy
+import json
+import tqdm
+import torch
+from torch.utils.data import Dataset
+import deeplake
+
+PROMPT = """Given the instruction containing context and the question, provide the logical reasoning that led you to the answer.
+    Please use the format of: ##Reason: reason ##Answer: answer. 
+    ###Instruction: {instruction}\n\n### Response:"""
+
+
+class InstructionRAFTDataset(Dataset):
+    def __init__(self, dataset_config, tokenizer, partition="train"):
+        self.tokenizer = tokenizer
+
+        val_quantity = dataset_config.val_quantity
+        ds = deeplake.dataset(dataset_config.dataset_path)
+
+        if partition == "train":
+            ds = ds[val_quantity:]
+            self.ann = ds
+        else:
+            ds = ds[:val_quantity]
+            self.ann = ds
+
+    def __len__(self):
+        return len(self.ann)
+
+    def __getitem__(self, index):
+        if index < len(self):
+
+            IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
+
+            column_map = self.ann.tensors.keys()
+            ann = {}
+            for el in column_map:  # {"column_name" : "value"}
+                ann[el] = self.ann[el][index].text().astype(str)
+
+            prompt = PROMPT.format_map(ann)
+
+            example = prompt + ann["cot_answer"]
+            prompt_ids = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
+            example_ids = self.tokenizer.encode(example)
+            example_ids.append(self.tokenizer.eos_token_id)
+            example_ids = torch.tensor(example_ids, dtype=torch.int64)
+
+            labels = copy.deepcopy(example_ids)
+            labels[: len(prompt_ids)] = -1
+
+            example_mask = example_ids.ge(0)
+            label_mask = labels.ge(0)
+
+            example_ids[~example_mask] = 0
+            labels[~label_mask] = IGNORE_INDEX
+
+            return {
+                "input_ids": example_ids.tolist(),
+                "labels": labels.tolist(),
+                "attention_mask": example_mask.tolist(),
+            }
+        else:
+            raise IndexError
diff --git a/src/llama_recipes/utils/dataset_utils.py b/src/llama_recipes/utils/dataset_utils.py
index 47baeefe0..1e7226965 100644
--- a/src/llama_recipes/utils/dataset_utils.py
+++ b/src/llama_recipes/utils/dataset_utils.py
@@ -11,6 +11,7 @@
     get_grammar_dataset,
     get_alpaca_dataset,
     get_samsum_dataset,
+    get_raft_dataset
 )
 
 
@@ -54,6 +55,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
     "grammar_dataset": get_grammar_dataset,
     "samsum_dataset": get_samsum_dataset,
     "custom_dataset": get_custom_dataset,
+    "raft_dataset": get_raft_dataset,
 }

From 8d7699e24923b9c55e1471874fc28c6b6dd44f19 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 13:40:31 +0200
Subject: [PATCH 2/3] Deep Lake package

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index df2c66fd7..6ad082b7c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ chardet
 openai
 typing-extensions==4.8.0
 tabulate
+deeplake
\ No newline at end of file

From a7777fe7acf7558878070d2133bea8342b576722 Mon Sep 17 00:00:00 2001
From: efenocchi
Date: Mon, 17 Jun 2024 14:59:16 +0200
Subject: [PATCH 3/3] changed variable names

---
 src/llama_recipes/configs/datasets.py      |  2 +-
 src/llama_recipes/datasets/raft_dataset.py | 25 +++++++++++-----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py
index b5e0dc8bd..5c1c2c80f 100644
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -39,5 +39,5 @@ class raft_dataset:
     dataset: str = "raft_dataset"
     train_split: str = "train"
     test_split: str = "validation"
-    dataset_path: str = "hub://manufe/raft_format_dataset_finance"
+    dataset_path: str = "hub://manufe/raft_format_dataset_financebench"
     val_quantity: int = 1000

diff --git a/src/llama_recipes/datasets/raft_dataset.py b/src/llama_recipes/datasets/raft_dataset.py
index ec5119c53..aaecb3aee 100644
--- a/src/llama_recipes/datasets/raft_dataset.py
+++ b/src/llama_recipes/datasets/raft_dataset.py
@@ -24,7 +24,7 @@ def __init__(self, dataset_config, tokenizer, partition="train"):
         self.tokenizer = tokenizer
 
         val_quantity = dataset_config.val_quantity
-        ds = deeplake.dataset(dataset_config.dataset_path)
+        ds = deeplake.load(dataset_config.dataset_path)
 
         if partition == "train":
             ds = ds[val_quantity:]
@@ -49,22 +49,21 @@ def __getitem__(self, index):
             prompt = PROMPT.format_map(ann)
 
             example = prompt + ann["cot_answer"]
-            prompt_ids = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
-            example_ids = self.tokenizer.encode(example)
-            example_ids.append(self.tokenizer.eos_token_id)
-            example_ids = torch.tensor(example_ids, dtype=torch.int64)
-
-            labels = copy.deepcopy(example_ids)
-            labels[: len(prompt_ids)] = -1
-
-            example_mask = example_ids.ge(0)
+            prompt = torch.tensor(self.tokenizer.encode(prompt), dtype=torch.int64)
+            example = self.tokenizer.encode(example)
+            example.append(self.tokenizer.eos_token_id)
+            example = torch.tensor(
+                example, dtype=torch.int64
+            )
+            labels = copy.deepcopy(example)
+            labels[: len(prompt)] = -1
+            example_mask = example.ge(0)
             label_mask = labels.ge(0)
-
-            example_ids[~example_mask] = 0
+            example[~example_mask] = 0
             labels[~label_mask] = IGNORE_INDEX
 
             return {
-                "input_ids": example_ids.tolist(),
+                "input_ids": example.tolist(),
                 "labels": labels.tolist(),
                 "attention_mask": example_mask.tolist(),
             }
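
Usage note (not part of the patches above): a minimal sketch of how the new dataset can be selected once the series is applied. It goes through get_preprocessed_dataset, the existing llama-recipes helper that dispatches on dataset_config.dataset via the DATASET_PREPROC map extended in PATCH 1/3; the tokenizer checkpoint below is an assumption for illustration, not something this series pins down.

    # Sketch, assuming the three patches above are applied and deeplake is installed.
    from transformers import AutoTokenizer

    from llama_recipes.configs.datasets import raft_dataset
    from llama_recipes.utils.dataset_utils import get_preprocessed_dataset

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed checkpoint
    dataset_config = raft_dataset()  # defaults from configs/datasets.py: dataset_path, val_quantity=1000
    train_ds = get_preprocessed_dataset(tokenizer, dataset_config, split="train")

    sample = train_ds[0]  # dict with input_ids, labels, attention_mask
    print(len(sample["input_ids"]))

With val_quantity=1000, the first 1000 Deep Lake rows become the validation split and the rest the train split. Prompt tokens are set to -1 and then mapped to IGNORE_INDEX (-100) in labels, so the loss is computed only on the chain-of-thought answer.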