Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding hugging face dataset download class #1970

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ def load_config(self):
pass

@abstractmethod
def download_model(self):
def download_model_and_tokenizer(self):
pass
96 changes: 75 additions & 21 deletions sdk/python/kubeflow/storage_init_container/hugging_face.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,57 @@
from abstract_model_provider import modelProvider
from kubeflow.storage_init_container.abstract_model_provider import modelProvider
from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider
from dataclasses import dataclass, field
from typing import Literal
import transformers
from urllib.parse import urlparse
import json
from typing import Dict, Any
import json, os
from typing import Dict, Any, Union
from datasets import load_dataset
from peft import LoraConfig
import transformers
from transformers import TrainingArguments
import enum
import huggingface_hub


class TRANSFORMER_TYPES(str, enum.Enum):
    """Supported Hugging Face ``transformers`` Auto* model classes.

    Inherits from ``str`` so members compare equal to (and serialize as)
    their plain string names, which keeps JSON round-tripping simple.
    """

    AutoModelForSequenceClassification = "AutoModelForSequenceClassification"
    AutoModelForTokenClassification = "AutoModelForTokenClassification"
    AutoModelForQuestionAnswering = "AutoModelForQuestionAnswering"
    AutoModelForCausalLM = "AutoModelForCausalLM"
    AutoModelForMaskedLM = "AutoModelForMaskedLM"
    AutoModelForImageClassification = "AutoModelForImageClassification"


# Base mount path used by the storage init container; models and datasets
# are downloaded into subdirectories of this path.
INIT_CONTAINER_MOUNT_PATH = "/workspace"


@dataclass
class HuggingFaceModelParams:
    """Parameters for downloading a model and tokenizer from the HF Hub.

    Raises:
        ValueError: if ``model_uri`` is empty or ``None``.
    """

    # URI identifying the model to download (e.g. "hf://org/model").
    model_uri: str
    # Which transformers Auto* class should load the model.
    transformer_type: TRANSFORMER_TYPES
    # Optional token for private/gated repositories.
    access_token: str = None
    # Destination directory inside the init-container mount.
    download_dir: str = field(
        default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "models")
    )

    def __post_init__(self):
        # Fail fast on a missing model reference.
        if self.model_uri == "" or self.model_uri is None:
            raise ValueError("model_uri cannot be empty.")

    # NOTE(review): the previous read-only ``download_dir`` property was
    # removed — a @property whose getter returns ``self.download_dir``
    # recurses infinitely, and its setter raised AttributeError from the
    # dataclass-generated __init__, making the class impossible to
    # instantiate.


@dataclass
class HuggingFaceTrainParams:
    """Training configuration passed through to the HF trainer.

    Attributes hold library-native config objects rather than loose dicts
    so they validate on construction.
    """

    # Standard transformers.TrainingArguments for the fine-tuning run.
    training_parameters: TrainingArguments = field(default_factory=TrainingArguments)
    # PEFT LoRA configuration for parameter-efficient fine-tuning.
    lora_config: LoraConfig = field(default_factory=LoraConfig)


class HuggingFace(modelProvider):
Expand All @@ -57,3 +74,40 @@ def download_model_and_tokenizer(self):
transformers.AutoTokenizer.from_pretrained(
self.model, cache_dir=self.config.download_dir
)


@dataclass
class HfDatasetParams:
    """Parameters for downloading a dataset from the Hugging Face Hub.

    Raises:
        ValueError: if ``repo_id`` is empty or ``None``.
    """

    # Hub dataset repository id, e.g. "imdb" or "org/dataset".
    repo_id: str
    # Optional token for private/gated datasets.
    access_token: str = None
    # Optional glob patterns restricting which files are downloaded.
    allow_patterns: list[str] = None
    ignore_patterns: list[str] = None
    # Destination directory inside the init-container mount.
    download_dir: str = field(
        default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "datasets")
    )

    def __post_init__(self):
        # Fail fast on a missing repository id (message previously said
        # "repo_id is None" even when it was an empty string).
        if self.repo_id == "" or self.repo_id is None:
            raise ValueError("repo_id cannot be empty.")

    # NOTE(review): the previous read-only ``download_dir`` property was
    # removed — its getter recursed infinitely and its setter raised
    # AttributeError from the dataclass-generated __init__, so the class
    # could never be instantiated.


class HuggingFaceDataset(datasetProvider):
    """Dataset provider that downloads a dataset from the Hugging Face Hub."""

    def load_config(self, serialised_args):
        """Deserialize JSON-encoded parameters into a validated config."""
        self.config = HfDatasetParams(**json.loads(serialised_args))

    def download_dataset(self):
        """Download the configured dataset into ``config.download_dir``."""
        print("downloading dataset")

        # Log in only when a token is supplied — required for
        # private/gated repositories, unnecessary for public ones.
        if self.config.access_token:
            huggingface_hub.login(self.config.access_token)

        # load_dataset fetches and caches the dataset under download_dir.
        load_dataset(self.config.repo_id, cache_dir=self.config.download_dir)
3 changes: 3 additions & 0 deletions sdk/python/kubeflow/storage_init_container/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ torchaudio==2.1.1
einops==0.7.0
transformers_stream_generator==0.0.4
boto3==1.33.9
transformers>=4.35.2
peft>=0.7.0
huggingface_hub==0.19.4
8 changes: 5 additions & 3 deletions sdk/python/kubeflow/storage_init_container/s3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abstract_dataset_provider import datasetProvider
from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider
from dataclasses import dataclass, field
import json
import json, os
import boto3
from urllib.parse import urlparse

Expand Down Expand Up @@ -50,6 +50,8 @@ def download_dataset(self):

# Download the file
s3_client.download_file(
self.config.bucket_name, self.config.file_key, self.config.download_dir
self.config.bucket_name,
self.config.file_key,
os.path.join(self.config.download_dir, self.config.file_key),
)
print(f"File downloaded to: {self.config.download_dir}")
30 changes: 19 additions & 11 deletions sdk/python/kubeflow/storage_init_container/storage.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
import argparse
from hugging_face import HuggingFace
from hugging_face import HuggingFace, HuggingFaceDataset
from s3 import S3


def model_factory(model_provider, model_provider_args):
def model_factory(model_provider, model_provider_parameters):
match model_provider:
case "hf":
hf = HuggingFace()
hf.load_config(model_provider_args)
hf.load_config(model_provider_parameters)
hf.download_model_and_tokenizer()
case _:
return "This is the default case"


def dataset_factory(dataset_provider, dataset_provider_args):
def dataset_factory(dataset_provider, dataset_provider_parameters):
match dataset_provider:
case "s3":
s3 = S3()
s3.load_config(dataset_provider_args)
s3.load_config(dataset_provider_parameters)
s3.download_dataset()
case "hf":
hf = HuggingFaceDataset()
hf.load_config(dataset_provider_parameters)
deepanker13 marked this conversation as resolved.
Show resolved Hide resolved
hf.download_dataset()
case _:
return "This is the default case"

Expand All @@ -27,16 +31,20 @@ def dataset_factory(dataset_provider, dataset_provider_args):
# CLI entry point: parses provider names plus their JSON-serialised
# parameters, then downloads the model and dataset into the PVC mount.
parser = argparse.ArgumentParser(
    description="script for downloading model and datasets to PVC."
)
parser.add_argument("--model_provider", type=str, help="name of model provider")
parser.add_argument(
    "--model_provider_parameters",
    type=str,
    help="model provider serialised arguments",
)

parser.add_argument("--dataset_provider", type=str, help="name of dataset provider")
parser.add_argument(
    "--dataset_provider_parameters",
    type=str,
    help="dataset provider serialised arguments",
)
args = parser.parse_args()

model_factory(args.model_provider, args.model_provider_parameters)
dataset_factory(args.dataset_provider, args.dataset_provider_parameters)