From 5b4123444a7075362c2729a8032632686ca32d7e Mon Sep 17 00:00:00 2001
From: jinzr
Date: Wed, 14 Aug 2024 10:40:26 +0800
Subject: [PATCH] minor updates

---
 lhotse/recipes/spatial_librispeech.py | 91 ++++++++++++++-------------
 setup.py                              |  4 --
 2 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/lhotse/recipes/spatial_librispeech.py b/lhotse/recipes/spatial_librispeech.py
index e32b495cc..6c82a931f 100644
--- a/lhotse/recipes/spatial_librispeech.py
+++ b/lhotse/recipes/spatial_librispeech.py
@@ -9,20 +9,6 @@
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download
 
-try:
-    import requests
-except ImportError:
-    raise ImportError(
-        "The Spatial LibriSpeech recipe requires requests dependency to download the dataset. You can install the dependency using: pip install requests"
-    )
-
-try:
-    import pandas as pd
-except ImportError:
-    raise ImportError(
-        "The Spatial LibriSpeech recipe requires pandas, pyarrow and fastparquet dependency to parse parquet formatted metadata. You can install the dependencies using: pip install pandas pyarrow fastparquet"
-    )
-
 SPATIAL_LIBRISPEECH = ("train", "test")
 BASE_URL = "https://docs-assets.developer.apple.com/ml-research/datasets/spatial-librispeech/v1"
 META_DATA_URL = "https://docs-assets.developer.apple.com/ml-research/datasets/spatial-librispeech/v1/metadata.parquet"
@@ -40,6 +26,14 @@ def _download_file(url: str) -> bytes:
     Returns:
         file_content (bytes): The file content downloaded from the url
     """
+
+    try:
+        import requests
+    except ImportError:
+        raise ImportError(
+            "The Spatial LibriSpeech recipe requires requests dependency to download the dataset. You can install the dependency using: pip install requests"
+        )
+
     try:
         file_content = requests.get(url, allow_redirects=True).content
         return file_content
@@ -66,37 +60,6 @@ def _save_audio_content(target_file: str, file_content: bytes):
     _save_audio_content(target_file, file_content)
 
 
-def _download_spatial_librispeech_audio_files(
-    target_dir: Pathlike,
-    dataset_parts: Sequence[str],
-    metadata: pd.DataFrame,
-    base_url: str,
-    force_download: bool = False,
-    num_jobs: int = 1,
-):
-    target_dir = Path(target_dir)
-    target_dir.mkdir(parents=True, exist_ok=True)
-
-    audio_url = f"{base_url}/ambisonics"
-    from concurrent.futures.thread import ThreadPoolExecutor
-
-    for part in dataset_parts:
-        part_dir = target_dir / part
-        part_dir.mkdir(parents=True, exist_ok=True)
-
-    with ThreadPoolExecutor(num_jobs) as ex:
-        for sample_id, split in tqdm(
-            zip(metadata["sample_id"], metadata["split"]),
-            total=len(metadata["sample_id"]),
-        ):
-            if split not in dataset_parts:
-                continue
-            recording_path = target_dir / split / f"{sample_id:06}.flac"
-            recording_url = f"{audio_url}/{sample_id:06}.flac"
-            if not recording_path.exists() or force_download:
-                ex.submit(_download_and_save_audio, recording_path, recording_url)
-
-
 def download_spatial_librispeech(
     target_dir: Pathlike = ".",
     dataset_parts: Union[str, Sequence[str]] = SPATIAL_LIBRISPEECH,
@@ -113,6 +76,44 @@ def download_spatial_librispeech(
     :param base_url: str, the url of the resource.
     :return: the path to downloaded and extracted directory with data.
     """
+
+    try:
+        import pandas as pd
+    except ImportError:
+        raise ImportError(
+            "The Spatial LibriSpeech recipe requires pandas, pyarrow and fastparquet dependency to parse parquet formatted metadata. You can install the dependencies using: pip install pandas pyarrow fastparquet"
+        )
+
+    def _download_spatial_librispeech_audio_files(
+        target_dir: Pathlike,
+        dataset_parts: Sequence[str],
+        metadata: pd.DataFrame,
+        base_url: str,
+        force_download: bool = False,
+        num_jobs: int = 1,
+    ):
+        target_dir = Path(target_dir)
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+        audio_url = f"{base_url}/ambisonics"
+        from concurrent.futures.thread import ThreadPoolExecutor
+
+        for part in dataset_parts:
+            part_dir = target_dir / part
+            part_dir.mkdir(parents=True, exist_ok=True)
+
+        with ThreadPoolExecutor(num_jobs) as ex:
+            for sample_id, split in tqdm(
+                zip(metadata["sample_id"], metadata["split"]),
+                total=len(metadata["sample_id"]),
+            ):
+                if split not in dataset_parts:
+                    continue
+                recording_path = target_dir / split / f"{sample_id:06}.flac"
+                recording_url = f"{audio_url}/{sample_id:06}.flac"
+                if not recording_path.exists() or force_download:
+                    ex.submit(_download_and_save_audio, recording_path, recording_url)
+
     target_dir = Path(target_dir)
     target_dir.mkdir(parents=True, exist_ok=True)
     if dataset_parts == "all":
diff --git a/setup.py b/setup.py
index ca93f5aac..b96a3e36e 100644
--- a/setup.py
+++ b/setup.py
@@ -203,10 +203,6 @@ def mark_lhotse_version(version: str) -> None:
     "black==22.3.0",
     "isort==5.10.1",
     "pre-commit>=2.17.0,<=2.19.0",
-    "pandas>2.0.0,<2.3.0",
-    "pyarrow==17.0.0",
-    "fastparquet>2023.10.0,<2024.6.0",
-    "requests==2.31.0",
 ]
 orjson_requires = ["orjson>=3.6.6"]
 webdataset_requires = ["webdataset==0.2.5"]
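Note on the effect of this patch: the requests and pandas/pyarrow/fastparquet checks now run lazily inside the functions that need them, so `import lhotse` no longer requires these packages and the ImportError is raised at call time instead of import time. A minimal usage sketch under that assumption; the target path and job count are illustrative values, and `num_jobs` is assumed to be a parameter of the public function that is forwarded to the ThreadPoolExecutor-based helper:

# Assumes the optional dependencies were installed beforehand:
#   pip install requests pandas pyarrow fastparquet
# With this patch, a missing dependency surfaces here, when the download
# is invoked, rather than when lhotse.recipes is imported.
from lhotse.recipes.spatial_librispeech import download_spatial_librispeech

corpus_dir = download_spatial_librispeech(
    target_dir="data",    # illustrative; FLACs land in data/train and data/test
    dataset_parts="all",  # expanded to ("train", "test") inside the function
    num_jobs=4,           # assumed: number of parallel ambisonics downloads
)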