Skip to content

Commit

Permalink
only annotate LoadHF with splits. LoadFromHFSpace learns only from it…
Browse files Browse the repository at this point in the history
…s mapping, and can do that at runtime

Signed-off-by: dafnapension <[email protected]>
  • Loading branch information
dafnapension committed Feb 8, 2025
1 parent 19f81dc commit 6f1b886
Show file tree
Hide file tree
Showing 27 changed files with 36 additions and 98 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
},
splits=["model_answer", "questions"],
),
preprocess_steps=[
# region Question file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -70,7 +69,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -55,7 +54,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -48,7 +47,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
splits=["judgment", "model_answer", "questions"],
),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@
"data_files": {
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
},
"splits": [
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.arena_hard_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.pairwise_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
"questions": "data/mt_bench/question.jsonl",
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl"
},
"splits": [
"judgment",
"model_answer",
"questions"
]
}
},
"preprocess_steps": [
"operators.mt_bench.rating_hf_space_processing_steps",
Expand Down
24 changes: 18 additions & 6 deletions src/unitxt/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def limit_dataset(
return dataset

# returns Dict when split names are not known in advance, and just the the single split dataset - if known
# flake8: noqa: C901
def load_dataset(
self, split: str, streaming=None, disable_memory_caching=False
) -> Union[IterableDatasetDict, IterableDataset, Dataset, DatasetDict]:
Expand All @@ -279,6 +280,10 @@ def load_dataset(
if streaming is None:
streaming = self.is_streaming()

# try to optimize when not too dangerous
if self.get_limit() <= 100:
streaming = True

with tempfile.TemporaryDirectory() as dir_to_be_deleted:
if settings.disable_hf_datasets_cache:
cache_dir = dir_to_be_deleted
Expand All @@ -299,14 +304,25 @@ def load_dataset(
"num_proc": self.num_proc,
}
try:
# load the dataset and verify that it is useful
dataset = hf_load_dataset(**kwargs)
if isinstance(dataset, (Dataset, IterableDataset)):
next(iter(dataset))
else:
for k in dataset.keys():
next(iter(dataset[k]))
break
except ValueError as e:
if "trust_remote_code" in str(e):
raise ValueError(
f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
) from e

except NotImplementedError:
except:
current_streaming = kwargs["streaming"]
logger.info(
f"needed to swap streaming from {current_streaming} to {not current_streaming} for path {self.path}"
)
# try the opposite way of streaming
kwargs["streaming"] = not kwargs["streaming"]
dataset = hf_load_dataset(**kwargs)
Expand Down Expand Up @@ -358,8 +374,6 @@ def load_iterables(
# log once for all splits, as they are limited the same
if self.get_limit() is not None:
self.log_limited_loading()
if self.get_limit() < 100:
self.streaming = True
if self.filtering_lambda is not None:
self.log_filter_load()

Expand Down Expand Up @@ -408,7 +422,6 @@ class LoadCSV(Loader):

files: Dict[str, str]
chunksize: int = 1000
loader_limit: Optional[int] = None
streaming: bool = True
sep: str = ","
compression: Optional[str] = None
Expand Down Expand Up @@ -901,7 +914,7 @@ class LoadFromHFSpace(LoadHF):
token_env: Optional[str] = None
requirements_list: List[str] = ["huggingface_hub"]

streaming: bool = True
streaming = True

def _get_token(self) -> Optional[Union[bool, str]]:
if self.token_env:
Expand Down Expand Up @@ -1069,7 +1082,6 @@ class LoadFromAPI(Loader):

urls: Dict[str, str]
chunksize: int = 100000
loader_limit: Optional[int] = None
streaming: bool = False
api_key_env_var: str = "SQL_API_KEY"
headers: Optional[Dict[str, Any]] = None
Expand Down
4 changes: 2 additions & 2 deletions utils/.secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@
"filename": "src/unitxt/loaders.py",
"hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
"is_verified": false,
"line_number": 608,
"line_number": 621,
"is_secret": false
}
],
Expand Down Expand Up @@ -184,5 +184,5 @@
}
]
},
"generated_at": "2025-02-06T23:16:13Z"
"generated_at": "2025-02-08T13:05:57Z"
}
Loading

0 comments on commit 6f1b886

Please sign in to comment.