Skip to content

Commit

Permalink
only annotate LoadHF with splits. LoadFromHFSpace learns only from it…
Browse files Browse the repository at this point in the history
…s mapping, and can do that at runtime

Signed-off-by: dafnapension <[email protected]>
  • Loading branch information
dafnapension committed Feb 8, 2025
1 parent 19f81dc commit 6f1b886
Show file tree
Hide file tree
Showing 27 changed files with 36 additions and 98 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
},
splits=["model_answer", "questions"],
),
preprocess_steps=[
# region Question file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -70,7 +69,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -55,7 +54,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -48,7 +47,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@
"data_files": {
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
},
"splits": [
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
24 changes: 18 additions & 6 deletions src/unitxt/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def limit_dataset(
return dataset

# returns Dict when split names are not known in advance, and just the the single split dataset - if known
# flake8: noqa: C901
def load_dataset(
self, split: str, streaming=None, disable_memory_caching=False
) -> Union[IterableDatasetDict, IterableDataset, Dataset, DatasetDict]:
Expand All @@ -279,6 +280,10 @@ def load_dataset(
if streaming is None:
streaming = self.is_streaming()

# try to optimize when not too dangerous
if self.get_limit() <= 100:
streaming = True

with tempfile.TemporaryDirectory() as dir_to_be_deleted:
if settings.disable_hf_datasets_cache:
cache_dir = dir_to_be_deleted
Expand All @@ -299,14 +304,25 @@ def load_dataset(
"num_proc": self.num_proc,
}
try:
# load the dataset and verify that it is useful
dataset = hf_load_dataset(**kwargs)
if isinstance(dataset, (Dataset, IterableDataset)):
next(iter(dataset))
else:
for k in dataset.keys():
next(iter(dataset[k]))
break
except ValueError as e:
if "trust_remote_code" in str(e):
raise ValueError(
f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
) from e

except NotImplementedError:
except:
current_streaming = kwargs["streaming"]
logger.info(
f"needed to swap streaming from {current_streaming} to {not current_streaming} for path {self.path}"
)
# try the opposite way of streaming
kwargs["streaming"] = not kwargs["streaming"]
dataset = hf_load_dataset(**kwargs)
Expand Down Expand Up @@ -358,8 +374,6 @@ def load_iterables(
# log once for all splits, as they are limited the same
if self.get_limit() is not None:
self.log_limited_loading()
if self.get_limit() < 100:
self.streaming = True
if self.filtering_lambda is not None:
self.log_filter_load()

Expand Down Expand Up @@ -408,7 +422,6 @@ class LoadCSV(Loader):

files: Dict[str, str]
chunksize: int = 1000
loader_limit: Optional[int] = None
streaming: bool = True
sep: str = ","
compression: Optional[str] = None
Expand Down Expand Up @@ -901,7 +914,7 @@ class LoadFromHFSpace(LoadHF):
token_env: Optional[str] = None
requirements_list: List[str] = ["huggingface_hub"]

streaming: bool = True
streaming = True

def _get_token(self) -> Optional[Union[bool, str]]:
if self.token_env:
Expand Down Expand Up @@ -1069,7 +1082,6 @@ class LoadFromAPI(Loader):

urls: Dict[str, str]
chunksize: int = 100000
loader_limit: Optional[int] = None
streaming: bool = False
api_key_env_var: str = "SQL_API_KEY"
headers: Optional[Dict[str, Any]] = None
Expand Down
4 changes: 2 additions & 2 deletions utils/.secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@
"filename": "src/unitxt/loaders.py",
"hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
"is_verified": false,
"line_number": 608,
"line_number": 621,
"is_secret": false
}
],
Expand Down Expand Up @@ -184,5 +184,5 @@
}
]
},
"generated_at": "2025-02-06T23:16:13Z"
"generated_at": "2025-02-08T13:05:57Z"
}
Loading

0 comments on commit 6f1b886

Please sign in to comment.