From 4a90acb29a365684fd3311f8394e6fd0fc337173 Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Thu, 21 Dec 2023 22:14:02 +0530
Subject: [PATCH 1/4] Add Hugging Face dataset download class

---
 .../abstract_model_provider.py                |  2 +-
 .../storage_init_container/hugging_face.py    | 38 ++++++++++++++++++-
 .../storage_init_container/requirements.txt   |  1 +
 .../storage_init_container/storage.py         |  6 ++-
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/sdk/python/kubeflow/storage_init_container/abstract_model_provider.py b/sdk/python/kubeflow/storage_init_container/abstract_model_provider.py
index 0863c07e6c..392478a346 100644
--- a/sdk/python/kubeflow/storage_init_container/abstract_model_provider.py
+++ b/sdk/python/kubeflow/storage_init_container/abstract_model_provider.py
@@ -7,5 +7,5 @@ def load_config(self):
         pass
 
     @abstractmethod
-    def download_model(self):
+    def download_model_and_tokenizer(self):
         pass
diff --git a/sdk/python/kubeflow/storage_init_container/hugging_face.py b/sdk/python/kubeflow/storage_init_container/hugging_face.py
index da26859eaa..ef8dfc1341 100644
--- a/sdk/python/kubeflow/storage_init_container/hugging_face.py
+++ b/sdk/python/kubeflow/storage_init_container/hugging_face.py
@@ -1,7 +1,7 @@
 from abstract_model_provider import modelProvider
+from abstract_dataset_provider import datasetProvider
 from dataclasses import dataclass, field
 from typing import Literal
-import transformers
 from urllib.parse import urlparse
 import json
 from typing import Dict, Any
@@ -45,6 +45,8 @@ def load_config(self, serialised_args):
     def download_model_and_tokenizer(self):
         # implementation for downloading the model
         print("downloading model")
+        import transformers
+
         transformer_type_class = getattr(transformers, self.config.transformer_type)
         parsed_uri = urlparse(self.config.model_uri)
         self.model = parsed_uri.netloc + parsed_uri.path
@@ -57,3 +59,37 @@ def download_model_and_tokenizer(self):
         transformers.AutoTokenizer.from_pretrained(
             self.model, cache_dir=self.config.download_dir
         )
+
+
+@dataclass
+class HfDatasetParams:
+    repo_id: str
+    access_token: str = None
+    allow_patterns: list[str] = None
+    ignore_patterns: list[str] = None
+    download_dir: str = field(default="/workspace/datasets")
+
+    def __post_init__(self):
+        # Custom checks or validations can be added here
+        if self.repo_id is None:
+            raise ValueError("repo_id is None")
+
+
+class HuggingFaceDataset(datasetProvider):
+    def load_config(self, serialised_args):
+        self.config = HfDatasetParams(**json.loads(serialised_args))
+
+    def download_dataset(self):
+        print("downloading dataset")
+        import huggingface_hub
+        from huggingface_hub import snapshot_download
+
+        if self.config.access_token:
+            huggingface_hub.login(self.config.access_token)
+        snapshot_download(
+            repo_id=self.config.repo_id,
+            repo_type="dataset",
+            allow_patterns=self.config.allow_patterns,
+            ignore_patterns=self.config.ignore_patterns,
+            local_dir=self.config.download_dir,
+        )
diff --git a/sdk/python/kubeflow/storage_init_container/requirements.txt b/sdk/python/kubeflow/storage_init_container/requirements.txt
index 741e8ab077..cc9fcce7fb 100644
--- a/sdk/python/kubeflow/storage_init_container/requirements.txt
+++ b/sdk/python/kubeflow/storage_init_container/requirements.txt
@@ -4,3 +4,4 @@ torchaudio==2.1.1
 einops==0.7.0
 transformers_stream_generator==0.0.4
 boto3==1.33.9
+huggingface_hub
\ No newline at end of file
diff --git a/sdk/python/kubeflow/storage_init_container/storage.py b/sdk/python/kubeflow/storage_init_container/storage.py
index 703a6d78c5..6f5150a3f8 100644
--- a/sdk/python/kubeflow/storage_init_container/storage.py
+++ b/sdk/python/kubeflow/storage_init_container/storage.py
@@ -1,5 +1,5 @@
 import argparse
-from hugging_face import HuggingFace
+from hugging_face import HuggingFace, HuggingFaceDataset
 from s3 import S3
 
 
@@ -19,6 +19,10 @@ def dataset_factory(dataset_provider, dataset_provider_args):
             s3 = S3()
             s3.load_config(dataset_provider_args)
             s3.download_dataset()
+        case "hf":
+            hf = HuggingFaceDataset()
+            hf.load_config(dataset_provider_args)
+            hf.download_dataset()
         case _:
             return "This is the default case"
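For context, storage.py hands each provider its parameters as a JSON string, and load_config rebuilds the dataclass from it. A minimal sketch of exercising the patch 1/4 dataset path directly; the repo id and patterns below are placeholders, not values from this series:

    import json
    from hugging_face import HuggingFaceDataset

    # Serialise HfDatasetParams fields the same way storage.py passes them in.
    params = json.dumps(
        {
            "repo_id": "imdb",             # hypothetical public dataset repo
            "allow_patterns": ["*.json"],  # optional filter forwarded to snapshot_download
        }
    )

    hf = HuggingFaceDataset()
    hf.load_config(params)  # json.loads(...) -> HfDatasetParams(**...)
    hf.download_dataset()   # snapshot_download into /workspace/datasets by default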
From 5af363d17ecd1fe900e252f75b3834136d13655f Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Fri, 5 Jan 2024 01:46:50 +0530
Subject: [PATCH 2/4] Code review changes

---
 .../storage_init_container/hugging_face.py    | 63 ++++++++++++-------
 .../storage_init_container/requirements.txt   |  5 ++-
 .../kubeflow/storage_init_container/s3.py     |  6 +-
 .../storage_init_container/storage.py         | 26 ++++---
 4 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/sdk/python/kubeflow/storage_init_container/hugging_face.py b/sdk/python/kubeflow/storage_init_container/hugging_face.py
index ef8dfc1341..68c39dbaa1 100644
--- a/sdk/python/kubeflow/storage_init_container/hugging_face.py
+++ b/sdk/python/kubeflow/storage_init_container/hugging_face.py
@@ -3,38 +3,47 @@
 from dataclasses import dataclass, field
 from typing import Literal
 from urllib.parse import urlparse
-import json
-from typing import Dict, Any
+import json, os
+from typing import Dict, Any, Union
+from datasets import load_dataset
+from peft import LoraConfig
+import transformers
+from transformers import TrainingArguments
+import enum
+import huggingface_hub
 
-TRANSFORMER_TYPES = [
-    "AutoModelForSequenceClassification",
-    "AutoModelForTokenClassification",
-    "AutoModelForQuestionAnswering",
-    "AutoModelForCausalLM",
-    "AutoModelForMaskedLM",
-    "AutoModelForImageClassification",
-]
+
+class TRANSFORMER_TYPES(str, enum.Enum):
+    """Types of Transformers."""
+
+    AutoModelForSequenceClassification = "AutoModelForSequenceClassification"
+    AutoModelForTokenClassification = "AutoModelForTokenClassification"
+    AutoModelForQuestionAnswering = "AutoModelForQuestionAnswering"
+    AutoModelForCausalLM = "AutoModelForCausalLM"
+    AutoModelForMaskedLM = "AutoModelForMaskedLM"
+    AutoModelForImageClassification = "AutoModelForImageClassification"
+
+
+INIT_CONTAINER_MOUNT_PATH = "/workspace"
 
 
 @dataclass
 class HuggingFaceModelParams:
-    access_token: str
     model_uri: str
-    transformer_type: Literal[*TRANSFORMER_TYPES]
-    download_dir: str = field(default="/workspace/models")
+    transformer_type: TRANSFORMER_TYPES
+    access_token: str = None
+    download_dir: str = field(default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "models"))
 
     def __post_init__(self):
         # Custom checks or validations can be added here
-        if self.transformer_type not in TRANSFORMER_TYPES:
-            raise ValueError("transformer_type must be one of %s", TRANSFORMER_TYPES)
-        if self.model_uri is None:
-            raise ValueError("model_uri cannot be none.")
+        if self.model_uri == "":
+            raise ValueError("model_uri cannot be empty.")
 
 
 @dataclass
 class HuggingFaceTrainParams:
-    additional_data: Dict[str, Any] = field(default_factory=dict)
-    peft_config: Dict[str, Any] = field(default_factory=dict)
+    training_parameters: TrainingArguments = field(default_factory=TrainingArguments)
+    lora_config: LoraConfig = field(default_factory=LoraConfig)
 
 
 class HuggingFace(modelProvider):
@@ -45,8 +54,6 @@ def load_config(self, serialised_args):
     def download_model_and_tokenizer(self):
         # implementation for downloading the model
         print("downloading model")
-        import transformers
-
         transformer_type_class = getattr(transformers, self.config.transformer_type)
         parsed_uri = urlparse(self.config.model_uri)
         self.model = parsed_uri.netloc + parsed_uri.path
@@ -67,13 +74,15 @@ class HfDatasetParams:
     access_token: str = None
     allow_patterns: list[str] = None
     ignore_patterns: list[str] = None
-    download_dir: str = field(default="/workspace/datasets")
+    download_dir: str = field(
+        default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "datasets")
+    )
 
     def __post_init__(self):
         # Custom checks or validations can be added here
         if self.repo_id is None:
             raise ValueError("repo_id is None")
 
 
 class HuggingFaceDataset(datasetProvider):
     def load_config(self, serialised_args):
@@ -81,15 +90,8 @@ def load_config(self, serialised_args):
 
     def download_dataset(self):
         print("downloading dataset")
-        import huggingface_hub
-        from huggingface_hub import snapshot_download
 
         if self.config.access_token:
             huggingface_hub.login(self.config.access_token)
-        snapshot_download(
-            repo_id=self.config.repo_id,
-            repo_type="dataset",
-            allow_patterns=self.config.allow_patterns,
-            ignore_patterns=self.config.ignore_patterns,
-            local_dir=self.config.download_dir,
-        )
+
+        load_dataset(self.config.repo_id, cache_dir=self.config.download_dir)
diff --git a/sdk/python/kubeflow/storage_init_container/requirements.txt b/sdk/python/kubeflow/storage_init_container/requirements.txt
index cc9fcce7fb..32f19848ca 100644
--- a/sdk/python/kubeflow/storage_init_container/requirements.txt
+++ b/sdk/python/kubeflow/storage_init_container/requirements.txt
@@ -4,4 +4,7 @@ torchaudio==2.1.1
 einops==0.7.0
 transformers_stream_generator==0.0.4
 boto3==1.33.9
-huggingface_hub
\ No newline at end of file
+transformers>=4.35.2
+peft>=0.7.0
+datasets
+huggingface_hub==0.19.4
\ No newline at end of file
diff --git a/sdk/python/kubeflow/storage_init_container/s3.py b/sdk/python/kubeflow/storage_init_container/s3.py
index dd69f08e6c..cb5cb91cc0 100644
--- a/sdk/python/kubeflow/storage_init_container/s3.py
+++ b/sdk/python/kubeflow/storage_init_container/s3.py
@@ -1,6 +1,6 @@
 from abstract_dataset_provider import datasetProvider
 from dataclasses import dataclass, field
-import json
+import json, os
 import boto3
 from urllib.parse import urlparse
 
@@ -50,6 +50,8 @@ def download_dataset(self):
 
         # Download the file
         s3_client.download_file(
-            self.config.bucket_name, self.config.file_key, self.config.download_dir
+            self.config.bucket_name,
+            self.config.file_key,
+            os.path.join(self.config.download_dir, self.config.file_key),
         )
         print(f"File downloaded to: {self.config.download_dir}")
diff --git a/sdk/python/kubeflow/storage_init_container/storage.py b/sdk/python/kubeflow/storage_init_container/storage.py
index 6f5150a3f8..173b81c591 100644
--- a/sdk/python/kubeflow/storage_init_container/storage.py
+++ b/sdk/python/kubeflow/storage_init_container/storage.py
@@ -3,23 +3,23 @@
 from s3 import S3
 
 
-def model_factory(model_provider, model_provider_args):
+def model_factory(model_provider, model_provider_parameters):
     match model_provider:
         case "hf":
             hf = HuggingFace()
-            hf.load_config(model_provider_args)
+            hf.load_config(model_provider_parameters)
             hf.download_model_and_tokenizer()
         case _:
             return "This is the default case"
 
 
-def dataset_factory(dataset_provider, dataset_provider_args):
+def dataset_factory(dataset_provider, dataset_provider_parameters):
     match dataset_provider:
         case "s3":
             s3 = S3()
-            s3.load_config(dataset_provider_args)
+            s3.load_config(dataset_provider_parameters)
             s3.download_dataset()
         case "hf":
             hf = HuggingFaceDataset()
-            hf.load_config(dataset_provider_args)
+            hf.load_config(dataset_provider_parameters)
             hf.download_dataset()
@@ -31,16 +31,20 @@ def dataset_factory(dataset_provider, dataset_provider_parameters):
     parser = argparse.ArgumentParser(
         description="script for downloading model and datasets to PVC."
     )
-    parser.add_argument("model_provider", type=str, help="name of model provider")
+    parser.add_argument("--model_provider", type=str, help="name of model provider")
     parser.add_argument(
-        "model_provider_args", type=str, help="model provider serialised arguments"
+        "--model_provider_parameters",
+        type=str,
+        help="model provider serialised arguments",
     )
-    parser.add_argument("dataset_provider", type=str, help="name of dataset provider")
+    parser.add_argument("--dataset_provider", type=str, help="name of dataset provider")
     parser.add_argument(
-        "dataset_provider_args", type=str, help="dataset provider serialised arguments"
+        "--dataset_provider_parameters",
+        type=str,
+        help="dataset provider serialised arguments",
     )
     args = parser.parse_args()
 
-    model_factory(args.model_provider, args.model_provider_args)
-    dataset_factory(args.dataset_provider, args.dataset_provider_args)
+    model_factory(args.model_provider, args.model_provider_parameters)
+    dataset_factory(args.dataset_provider, args.dataset_provider_parameters)
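After this patch, transformer_type is constrained by the str-backed enum rather than a Literal, and the same JSON round-trip still applies. A sketch under those assumptions; the model URI is illustrative, not taken from the patch:

    import json
    from dataclasses import asdict
    from hugging_face import HuggingFace, HuggingFaceModelParams, TRANSFORMER_TYPES

    params = HuggingFaceModelParams(
        model_uri="hf://openai-community/gpt2",  # hypothetical model URI
        transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM,
    )

    hf = HuggingFace()
    # TRANSFORMER_TYPES subclasses str, so asdict(...) serialises to JSON cleanly.
    hf.load_config(json.dumps(asdict(params)))
    # urlparse() drops the hf:// scheme, leaving "openai-community/gpt2";
    # the model and tokenizer are cached under /workspace/models.
    hf.download_model_and_tokenizer()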
From d5ec8eb8a6d32c098c3899c338dcf9a407bde19e Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Fri, 5 Jan 2024 03:01:27 +0530
Subject: [PATCH 3/4] Fix imports

---
 sdk/python/kubeflow/storage_init_container/hugging_face.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/kubeflow/storage_init_container/hugging_face.py b/sdk/python/kubeflow/storage_init_container/hugging_face.py
index 68c39dbaa1..c136b265d2 100644
--- a/sdk/python/kubeflow/storage_init_container/hugging_face.py
+++ b/sdk/python/kubeflow/storage_init_container/hugging_face.py
@@ -1,5 +1,5 @@
-from abstract_model_provider import modelProvider
-from abstract_dataset_provider import datasetProvider
+from kubeflow.storage_init_container.abstract_model_provider import modelProvider
+from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider
 from dataclasses import dataclass, field
 from typing import Literal
 from urllib.parse import urlparse
From 8b7f27627318949874fe0c911c7971fac5837547 Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Fri, 5 Jan 2024 03:15:25 +0530
Subject: [PATCH 4/4] Resolve merge conflict

---
 sdk/python/kubeflow/storage_init_container/hugging_face.py | 6 +++---
 sdk/python/kubeflow/storage_init_container/s3.py           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sdk/python/kubeflow/storage_init_container/hugging_face.py b/sdk/python/kubeflow/storage_init_container/hugging_face.py
index c136b265d2..b15297c4cf 100644
--- a/sdk/python/kubeflow/storage_init_container/hugging_face.py
+++ b/sdk/python/kubeflow/storage_init_container/hugging_face.py
@@ -36,7 +36,7 @@ class HuggingFaceModelParams:
 
     def __post_init__(self):
         # Custom checks or validations can be added here
-        if self.model_uri == "":
+        if self.model_uri == "" or self.model_uri is None:
             raise ValueError("model_uri cannot be empty.")
 
 
@@ -80,7 +80,7 @@ class HfDatasetParams:
 
     def __post_init__(self):
         # Custom checks or validations can be added here
-        if self.repo_id is None:
-            raise ValueError("repo_id is None")
+        if self.repo_id == "" or self.repo_id is None:
+            raise ValueError("repo_id cannot be empty.")
 
 
ValueError("model_uri cannot be empty.") @property @@ -88,7 +88,7 @@ class HfDatasetParams: def __post_init__(self): # Custom checks or validations can be added here - if self.repo_id is None: + if self.repo_id == "" or self.repo_id is None: raise ValueError("repo_id is None") @property diff --git a/sdk/python/kubeflow/storage_init_container/s3.py b/sdk/python/kubeflow/storage_init_container/s3.py index cb5cb91cc0..5b1919644b 100644 --- a/sdk/python/kubeflow/storage_init_container/s3.py +++ b/sdk/python/kubeflow/storage_init_container/s3.py @@ -1,4 +1,4 @@ -from abstract_dataset_provider import datasetProvider +from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider from dataclasses import dataclass, field import json, os import boto3