From 06c3235a640d00bf59223ebabf3cb489a2891767 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:31:40 +0100 Subject: [PATCH] update load_dataset doctring (#7301) * update load_dataset doctring * style * minor * drop python 3.8 --- .github/workflows/ci.yml | 6 +- .github/workflows/release-conda.yml | 2 +- setup.py | 3 +- src/datasets/arrow_dataset.py | 2 +- src/datasets/load.py | 121 ++++++++++++++-------------- tests/test_arrow_dataset.py | 41 ++++++---- tests/test_hub.py | 7 +- tests/test_py_utils.py | 7 +- tests/test_search.py | 16 ++-- tests/test_upstream_hub.py | 5 +- 10 files changed, 111 insertions(+), 99 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2951be28289..f5049514706 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -44,10 +44,10 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Upgrade pip run: python -m pip install --upgrade pip - name: Pin setuptools-scm diff --git a/.github/workflows/release-conda.yml b/.github/workflows/release-conda.yml index fb18ca77cac..c561d57f080 100644 --- a/.github/workflows/release-conda.yml +++ b/.github/workflows/release-conda.yml @@ -25,7 +25,7 @@ jobs: auto-update-conda: true auto-activate-base: false activate-environment: "build-datasets" - python-version: 3.8 + python-version: 3.9 channels: huggingface - name: Setup conda env diff --git a/setup.py b/setup.py index 5038035d4ea..71dfcef3ca3 100644 --- a/setup.py +++ b/setup.py @@ -251,7 +251,7 @@ "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"], }, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, - python_requires=">=3.8.0", + python_requires=">=3.9.0", install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, classifiers=[ @@ -262,7 +262,6 @@ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 57f3024e53b..589deade653 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -372,7 +372,7 @@ def to_tf_dataset( a small buffer of batches for training. Improves performance by allowing data to be loaded in the background while the model is training. num_workers (`int`, defaults to `0`): - Number of workers to use for loading the dataset. Only supported on Python versions >= 3.8. + Number of workers to use for loading the dataset. num_test_batches (`int`, defaults to `20`): Number of batches to use to infer the output signature of the dataset. The higher this number, the more accurate the signature will be, but the longer it will take to diff --git a/src/datasets/load.py b/src/datasets/load.py index ebdafafcd5f..2f516253db7 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -242,9 +242,11 @@ def __reduce__(self): # to make dynamically created class pickable, see _Initia def get_dataset_builder_class( dataset_module: "DatasetModule", dataset_name: Optional[str] = None ) -> Type[DatasetBuilder]: - with lock_importable_file( - dataset_module.importable_file_path - ) if dataset_module.importable_file_path else nullcontext(): + with ( + lock_importable_file(dataset_module.importable_file_path) + if dataset_module.importable_file_path + else nullcontext() + ): builder_cls = import_main_class(dataset_module.module_path) if dataset_module.builder_configs_parameters.builder_configs: dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name") @@ -1751,42 +1753,36 @@ def load_dataset_builder( _require_default_config_name=True, **config_kwargs, ) -> DatasetBuilder: - """Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.) - without downloading the dataset itself. - - You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`]. + """Load a dataset builder which can be used to: - A dataset is a directory that contains: + - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.) + - Download and prepare the dataset as Arrow files in the cache + - Get a streaming dataset without downloading or caching anything - - some data files in generic formats (JSON, CSV, Parquet, text, etc.) - - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures. + You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`]. - Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online. + A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly + in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.) Args: path (`str`): Path or name of the dataset. - Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory. - For local datasets: + - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`]) + -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.) + e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files. - - if `path` is a local directory (containing data files only) - -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory + - if `path` is a local directory + -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.) e.g. `'./path/to/directory/with/my/csv/data'`. - - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory) - -> load the dataset builder from the dataset script - e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`. - - For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`]) - - if `path` is a dataset repository on the HF hub (containing data files only) - -> load a generic dataset builder (csv, text etc.) based on the content of the repository - e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files. - - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory) - -> load the dataset builder from the dataset script in the dataset repository - e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`. + - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified + (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder") + -> load the dataset builder from the files in `data_files` or `data_dir` + e.g. `'parquet'`. + It can also point to a local dataset script but this is not recommended. name (`str`, *optional*): Defining the name of the dataset configuration. data_dir (`str`, *optional*): @@ -1837,7 +1833,7 @@ def load_dataset_builder( ```py >>> from datasets import load_dataset_builder - >>> ds_builder = load_dataset_builder('rotten_tomatoes') + >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.info.features {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} @@ -1931,61 +1927,55 @@ def load_dataset( You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`]. - A dataset is a directory that contains: - - - some data files in generic formats (JSON, CSV, Parquet, text, etc.). - - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures. - - Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online. + A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly + in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.) This function does the following under the hood: - 1. Download and import in the library the dataset script from `path` if it's not already cached inside the library. + 1. Load a dataset builder: - If the dataset has no dataset script, then a generic dataset script is imported instead (JSON, CSV, Parquet, text, etc.) + * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.) + * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration + * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet"). - Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset, - contain the path or URL to the original data files and the code to load examples from the original data files. + 2. Run the dataset builder: - You can find the complete list of datasets in the Datasets [Hub](https://huggingface.co/datasets). + In the general case: - 2. Run the dataset script which will: - - * Download the dataset file from the original URL (see the script) if it's not already available locally or cached. + * Download the data files from the dataset if they are not already available locally or cached. * Process and cache the dataset in typed Arrow tables for caching. Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types. They can be directly accessed from disk, loaded in RAM or even streamed over the web. + In the streaming case: + + * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it. + 3. Return a dataset built from the requested splits in `split` (default: all). - It also allows to load a dataset from a local directory or a dataset repository on the Hugging Face Hub without dataset script. - In this case, it automatically loads all the data files from the directory or the dataset repository. + It can also use a custom dataset builder if the dataset contains a dataset script, but this feature is mostly for backward compatibility. + In this case the dataset script file must be named after the dataset repository or directory and end with ".py". Args: path (`str`): Path or name of the dataset. - Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory. - For local datasets: + - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`]) + -> load the dataset from supported files in the repository (csv, json, parquet, etc.) + e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files. - - if `path` is a local directory (containing data files only) - -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory + - if `path` is a local directory + -> load the dataset from supported files in the directory (csv, json, parquet, etc.) e.g. `'./path/to/directory/with/my/csv/data'`. - - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory) - -> load the dataset builder from the dataset script - e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`. - For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`]) - - - if `path` is a dataset repository on the HF hub (containing data files only) - -> load a generic dataset builder (csv, text etc.) based on the content of the repository - e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files. - - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory) - -> load the dataset builder from the dataset script in the dataset repository - e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`. + - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified + (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder") + -> load the dataset from the files in `data_files` or `data_dir` + e.g. `'parquet'`. + It can also point to a local dataset script but this is not recommended. name (`str`, *optional*): Defining the name of the dataset configuration. data_dir (`str`, *optional*): @@ -2072,11 +2062,18 @@ def load_dataset( ```py >>> from datasets import load_dataset - >>> ds = load_dataset('rotten_tomatoes', split='train') + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train') - # Map data files to splits + # Load a subset or dataset configuration (here 'sst2') + >>> from datasets import load_dataset + >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train') + + # Manual mapping of data files to splits >>> data_files = {'train': 'train.csv', 'test': 'test.csv'} >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files) + + # Manual selection of a directory to load + >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name') ``` Load a local dataset: @@ -2090,7 +2087,7 @@ def load_dataset( >>> from datasets import load_dataset >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json') - # Load from a local loading script + # Load from a local loading script (not recommended) >>> from datasets import load_dataset >>> ds = load_dataset('path/to/local/loading_script/loading_script.py', split='train') ``` @@ -2099,7 +2096,7 @@ def load_dataset( ```py >>> from datasets import load_dataset - >>> ds = load_dataset('rotten_tomatoes', split='train', streaming=True) + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True) ``` Load an image dataset with the `ImageFolder` dataset builder: diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 1e08862031b..6cf8898ce67 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory): import tensorflow as tf import torch - with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset( - in_memory, tmp_dir - ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset: + with ( + tempfile.TemporaryDirectory() as tmp_dir, + self._create_dummy_dataset(in_memory, tmp_dir) as dset, + dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset, + ): columns = dset.column_names self.assertIsNotNone(dset[0]) @@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory): import tensorflow as tf import torch - with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset( - in_memory, tmp_dir - ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset: + with ( + tempfile.TemporaryDirectory() as tmp_dir, + self._create_dummy_dataset(in_memory, tmp_dir) as dset, + dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset, + ): columns = dset.column_names self.assertIsNotNone(dset[0]) @@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory): import tensorflow as tf import torch - with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset( - in_memory, tmp_dir - ) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset: + with ( + tempfile.TemporaryDirectory() as tmp_dir, + self._create_dummy_dataset(in_memory, tmp_dir) as dset, + dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset, + ): self.assertDictEqual( dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}}) ) @@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self): info1 = DatasetInfo(description="Dataset1") info2 = DatasetInfo(description="Dataset2") with tempfile.TemporaryDirectory() as tmp_dir: - with Dataset.from_dict(data1, info=info1).map( - cache_file_name=os.path.join(tmp_dir, "d1.arrow") - ) as dset1, Dataset.from_dict(data2, info=info2).map( - cache_file_name=os.path.join(tmp_dir, "d2.arrow") - ) as dset2, Dataset.from_dict(data3) as dset3: + with ( + Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1, + Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2, + Dataset.from_dict(data3) as dset3, + ): with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset: self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3)) self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"]) @@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path): ) def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file): method, args, kwargs = method_and_params - with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file( - arrow_file, in_memory=in_memory - ) as reference_dataset: + with ( + Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, + Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset, + ): out = getattr(dataset, method)(*args, **kwargs) dataset = out if out is not None else dataset pickled_dataset = pickle.dumps(dataset) diff --git a/tests/test_hub.py b/tests/test_hub.py index 13c496e0f6f..6890eff7e7e 100644 --- a/tests/test_hub.py +++ b/tests/test_hub.py @@ -84,7 +84,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_ - name: train num_bytes: 55 num_examples: 5 - download_size: 726 + download_size: 717 dataset_size: 55 {METADATA_CONFIGS_FIELD}: - config_name: first @@ -105,7 +105,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_ - name: train num_bytes: 60 num_examples: 5 - download_size: 732 + download_size: 723 dataset_size: 60 {METADATA_CONFIGS_FIELD}: - config_name: second @@ -115,6 +115,9 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_ --- """), ] + if PYARROW_VERSION < version.parse("18.1.0"): + expected_readmes[0] = expected_readmes[0].replace("download_size: 717", "download_size: 726") + expected_readmes[1] = expected_readmes[1].replace("download_size: 723", "download_size: 732") if PYARROW_VERSION < version.parse("18.0.0"): expected_readmes[0] = expected_readmes[0].replace("download_size: 726", "download_size: 790") expected_readmes[1] = expected_readmes[1].replace("download_size: 732", "download_size: 798") diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index b768ad54ecd..d9d95969aff 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -116,9 +116,10 @@ class Foo: ], ) def test_map_nested_num_proc(iterable_length, num_proc, expected_num_proc): - with patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, patch( - "datasets.parallel.parallel.Pool" - ) as mock_multiprocessing_pool: + with ( + patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, + patch("datasets.parallel.parallel.Pool") as mock_multiprocessing_pool, + ): data_struct = {f"{i}": i for i in range(iterable_length)} _ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16) if expected_num_proc == 1: diff --git a/tests/test_search.py b/tests/test_search.py index e1e324949c5..142c92522ea 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -88,9 +88,11 @@ def test_add_elasticsearch_index(self): from elasticsearch import Elasticsearch dset: Dataset = self._create_dummy_dataset() - with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch( - "elasticsearch.client.IndicesClient.create" - ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk: + with ( + patch("elasticsearch.Elasticsearch.search") as mocked_search, + patch("elasticsearch.client.IndicesClient.create") as mocked_index_create, + patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk, + ): mocked_index_create.return_value = {"acknowledged": True} mocked_bulk.return_value([(True, None)] * 30) mocked_search.return_value = {"hits": {"hits": [{"_score": 1, "_id": 29}]}} @@ -198,9 +200,11 @@ class ElasticSearchIndexTest(TestCase): def test_elasticsearch(self): from elasticsearch import Elasticsearch - with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch( - "elasticsearch.client.IndicesClient.create" - ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk: + with ( + patch("elasticsearch.Elasticsearch.search") as mocked_search, + patch("elasticsearch.client.IndicesClient.create") as mocked_index_create, + patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk, + ): es_client = Elasticsearch() mocked_index_create.return_value = {"acknowledged": True} index = ElasticSearchIndex(es_client=es_client) diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index dc59c160264..d8350c9e685 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -242,8 +242,9 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo): with temporary_repo() as ds_name: self._api.create_repo(ds_name, token=self._token, repo_type="dataset") num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token)) - with patch("datasets.config.MAX_SHARD_SIZE", "16KB"), patch( - "datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1 + with ( + patch("datasets.config.MAX_SHARD_SIZE", "16KB"), + patch("datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1), ): local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload")