-
Notifications
You must be signed in to change notification settings - Fork 485
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from PacktPublishing/generate-instruct-dataset
feat: Add new dataset instruction generation logic
- Loading branch information
Showing
19 changed files
with
496 additions
and
942 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
parameters: | ||
|
||
test_split_size: 0.1 | ||
push_to_huggingface: true | ||
dataset_id: pauliusztin/llmtwin | ||
mock: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Canned response text: a JSON array of three placeholder instruction/answer pairs.
# NOTE(review): presumably substituted for a real model response when mocking is enabled — confirm against the caller.
MOCKED_RESPONSE = """ | ||
[ | ||
{"instruction": "<mocked generated instruction> 1", "answer": "<mocked generated answer> 1"}, | ||
{"instruction": "<mocked generated instruction> 2", "answer": "<mocked generated answer> 2"}, | ||
{"instruction": "<mocked generated instruction> 3", "answer": "<mocked generated answer> 3"} | ||
] | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from sklearn.model_selection import train_test_split | ||
|
||
from llm_engineering.domain.dataset import InstructDataset, InstructDatasetSample, TrainTestSplit | ||
from llm_engineering.domain.types import DataCategory | ||
|
||
|
||
def create_train_test_split(
    data: dict[DataCategory, InstructDataset], test_size=0.2, random_state=42
) -> TrainTestSplit:
    """Split each category's instruct dataset into a train part and a test part.

    Every category in ``data`` is split independently with scikit-learn's
    ``train_test_split``, using the same ``test_size`` and ``random_state``
    for all of them.

    Args:
        data: Mapping from data category to the dataset holding its samples.
        test_size: Forwarded to ``train_test_split`` and also recorded on the
            result as ``test_split_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``TrainTestSplit`` whose ``train`` and ``test`` mappings contain one
        ``InstructDataset`` per input category.
    """

    def _to_dataset(category, rows):
        # Rebuild sample models from their dumped dict form.
        return InstructDataset(
            category=category,
            samples=[InstructDatasetSample(**row) for row in rows],
        )

    train_by_category = {}
    test_by_category = {}

    for category, dataset in data.items():
        # The split operates on plain dicts rather than on the model instances.
        rows = [sample.model_dump() for sample in dataset.samples]
        train_rows, test_rows = train_test_split(
            rows, test_size=test_size, random_state=random_state
        )

        train_by_category[category] = _to_dataset(category, train_rows)
        test_by_category[category] = _to_dataset(category, test_rows)

    return TrainTestSplit(
        train=train_by_category,
        test=test_by_category,
        test_split_size=test_size,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 2 additions & 1 deletion
3
llm_engineering/application/preprocessing/operations/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
from .chunking import chunk_text | ||
from .chunking import chunk_article, chunk_text | ||
from .cleaning import clean_text | ||
|
||
# Names exported by this package: the chunking and cleaning helpers imported above.
__all__ = [ | ||
"chunk_article", | ||
"chunk_text", | ||
"clean_text", | ||
] |
Oops, something went wrong.