From eb7fda0095fde5548805469134c2dce9f0069647 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 1 Nov 2024 14:38:13 -0700 Subject: [PATCH 01/10] Add API for parsing butler dataset URIs (butler and ivo) --- python/lsst/daf/butler/_butler.py | 80 +++++++++++++++++++++++++++++++ tests/test_simpleButler.py | 47 +++++++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 2fa502bb78..8b12776e8b 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -29,6 +29,9 @@ __all__ = ["Butler"] +import os +import urllib.parse +import uuid from abc import abstractmethod from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence from contextlib import AbstractContextManager @@ -526,6 +529,83 @@ def get_known_repos(cls) -> set[str]: """ return ButlerRepoIndex.get_known_repos() + @classmethod + def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: + """Extract the butler label and dataset ID from a dataset URI. + + Parameters + ---------- + uri : `str` + The dataset URI to parse. + + Returns + ------- + label : `str` + The label associated with the butler repository from which this + dataset originates. + dataset_id : `DatasetId` + The ID of the dataset. + + Notes + ----- + Supports dataset URIs of the forms ``ivo://rubin/butler_label/UUID`` + and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label + can include ``/`` and the leading ``/`` is always stripped. If the + repository label starts with ``/`` then it must be doubled up. e.g., + + ivo://rubin//repo/main/82d79caa-0823-4300-9874-67b737367ee0 + + will return a label of ``/repo/main``. + + This method does not attempt to check that the dataset exists in the + labeled butler. + """ + parsed = urllib.parse.urlparse(uri) + if parsed.scheme == "ivo": + # TODO: Validate netloc component. + label, id_ = os.path.split(parsed.path) + # Strip the leading /. + label = label[1:] + elif parsed.scheme == "butler": + label = parsed.netloc + # Need to strip the leading /. + id_ = parsed.path[1:] + else: + raise ValueError(f"Unrecognized URI scheme: {uri!r}") + if not label: + raise ValueError(f"No butler repository label found in uri {uri!r}") + try: + dataset_id = uuid.UUID(hex=id_) + except Exception as e: + e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}") + raise + + return label, dataset_id + + @classmethod + def get_dataset_from_uri(cls, uri: str) -> DatasetRef | None: + """Get the dataset associated with the given dataset URI. + + Parameters + ---------- + uri : `str` + The URI associated with a dataset. + + Returns + ------- + ref : `DatasetRef` or `None` + The dataset associated with that URI, or `None` if the UUID + is valid but the dataset is not known to this butler. + + Notes + ----- + It might be possible to pass in an optional ``LabeledButlerFactory`` + but how would a caller know the right access token to supply? + """ + label, dataset_id = cls.parse_dataset_uri(uri) + butler = cls.from_config(label) + return butler.get_dataset(dataset_id) + @abstractmethod def _caching_context(self) -> AbstractContextManager[None]: """Context manager that enables caching.""" diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 595b08957f..0a44c39da5 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -54,7 +54,7 @@ from lsst.daf.butler.datastore.file_templates import FileTemplate from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory from lsst.daf.butler.tests import DatastoreMock -from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir +from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir try: from lsst.daf.butler.tests.server import create_test_server @@ -882,10 +882,55 @@ def makeButler(self, writeable: bool = False) -> Butler: registryConfig = RegistryConfig(config.get("registry")) _RegistryFactory(registryConfig).create_from_config() + # Write the YAML file so that some tests can recreate butler from it. + config.dumpToUri(os.path.join(self.root, "butler.yaml")) butler = Butler.from_config(config, writeable=writeable) DatastoreMock.apply(butler) return butler + def test_dataset_uris(self): + """Test that dataset URIs can be parsed and retrieved.""" + butler = self.makeButler(writeable=True) + butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml")) + butler.import_(filename=os.path.join(TESTDIR, "data", "registry", self.datasetsImportFile)) + + butler.registry.defaults = RegistryDefaults(collections=["imported_g"]) + ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G") + self.assertIsInstance(ref, DatasetRef) + + # Get the butler root for the URI. + config_dir = butler._config["root"] + + # Read it via a repo label and a path. + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file: + label = "test_repo" + index_file.write(f"{label}: {config_dir}\n") + index_file.flush() + with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}): + for dataset_uri in ( + f"ivo://rubin/{config_dir}/{ref.id}", + f"ivo://rubin/{config_dir}/butler.yaml/{ref.id}", + f"butler://{label}/{ref.id}", + f"ivo://rubin/{label}/{ref.id}", + ): + ref2 = Butler.get_dataset_from_uri(dataset_uri) + self.assertEqual(ref, ref2) + + # Non existent dataset. + missing_id = str(ref.id).replace("2", "3") + no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") + self.assertIsNone(no_ref) + + # Test some failure modes. + with self.assertRaises(ValueError): + Butler.parse_dataset_uri("ivo://rubin/1234") + with self.assertRaises(ValueError): + Butler.parse_dataset_uri("butler://label/1234") + with self.assertRaises(ValueError): + Butler.parse_dataset_uri("butler://1234") + with self.assertRaises(ValueError): + Butler.parse_dataset_uri("https://something.edu/1234") + class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase): """Run tests against DirectButler implementation using the From babd1fba898b790fdc2fa12324fd13750c58729a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 7 Nov 2024 17:28:27 -0700 Subject: [PATCH 02/10] Add support for using a LabeledButlerFactory to retrieve the butler --- python/lsst/daf/butler/_butler.py | 23 +++++++++++++------ .../daf/butler/_labeled_butler_factory.py | 15 +++++++++++- tests/test_simpleButler.py | 7 ++++++ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 8b12776e8b..e6cecf32d7 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -63,6 +63,7 @@ from ._dataset_type import DatasetType from ._deferredDatasetHandle import DeferredDatasetHandle from ._file_dataset import FileDataset + from ._labeled_butler_factory import LabeledButlerFactoryProtocol from ._storage_class import StorageClass from ._timespan import Timespan from .datastore import DatasetRefURIs @@ -583,27 +584,35 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: return label, dataset_id @classmethod - def get_dataset_from_uri(cls, uri: str) -> DatasetRef | None: + def get_dataset_from_uri( + cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None + ) -> DatasetRef | None: """Get the dataset associated with the given dataset URI. Parameters ---------- uri : `str` The URI associated with a dataset. + factory : `LabeledButlerFactoryProtocol` or `None`, optional + Bound factory function that will be given the butler label + and receive a `Butler`. Returns ------- ref : `DatasetRef` or `None` The dataset associated with that URI, or `None` if the UUID is valid but the dataset is not known to this butler. - - Notes - ----- - It might be possible to pass in an optional ``LabeledButlerFactory`` - but how would a caller know the right access token to supply? """ label, dataset_id = cls.parse_dataset_uri(uri) - butler = cls.from_config(label) + butler: Butler | None = None + if factory is not None: + # If the label is not recognized, it might be a path. + try: + butler = factory(label) + except KeyError: + pass + if butler is None: + butler = cls.from_config(label) return butler.get_dataset(dataset_id) @abstractmethod diff --git a/python/lsst/daf/butler/_labeled_butler_factory.py b/python/lsst/daf/butler/_labeled_butler_factory.py index 40f887ed0a..3941722527 100644 --- a/python/lsst/daf/butler/_labeled_butler_factory.py +++ b/python/lsst/daf/butler/_labeled_butler_factory.py @@ -25,9 +25,10 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -__all__ = ("LabeledButlerFactory",) +__all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol") from collections.abc import Callable, Mapping +from typing import Protocol from lsst.resources import ResourcePathExpression @@ -42,6 +43,12 @@ instance.""" +class LabeledButlerFactoryProtocol(Protocol): + """Callable to retrieve a butler from a label.""" + + def __call__(self, label: str) -> Butler: ... + + class LabeledButlerFactory: """Factory for efficiently instantiating Butler instances from the repository index file. This is intended for use from long-lived services @@ -83,6 +90,12 @@ def __init__(self, repositories: Mapping[str, str] | None = None) -> None: # This may be overridden by unit tests. self._preload_direct_butler_cache = True + def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol: + def create(label: str) -> Butler: + return self.create_butler(label=label, access_token=access_token) + + return create + def create_butler(self, *, label: str, access_token: str | None) -> Butler: """Create a Butler instance. diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 0a44c39da5..ef8275e166 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -48,6 +48,7 @@ DatasetId, DatasetRef, DatasetType, + LabeledButlerFactory, StorageClass, Timespan, ) @@ -907,6 +908,9 @@ def test_dataset_uris(self): index_file.write(f"{label}: {config_dir}\n") index_file.flush() with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}): + butler_factory = LabeledButlerFactory() + factory = butler_factory.bind(access_token=None) + for dataset_uri in ( f"ivo://rubin/{config_dir}/{ref.id}", f"ivo://rubin/{config_dir}/butler.yaml/{ref.id}", @@ -916,6 +920,9 @@ def test_dataset_uris(self): ref2 = Butler.get_dataset_from_uri(dataset_uri) self.assertEqual(ref, ref2) + ref2 = Butler.get_dataset_from_uri(dataset_uri, factory=factory) + self.assertEqual(ref, ref2) + # Non existent dataset. missing_id = str(ref.id).replace("2", "3") no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") From e5a22011e66fd0dba324f25a9b7d95f9736fa8a2 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 26 Nov 2024 12:46:35 -0700 Subject: [PATCH 03/10] Switch IVOID to using a query string for the dataset UUID --- python/lsst/daf/butler/_butler.py | 15 ++++++++------- tests/test_simpleButler.py | 25 ++++++++++++++----------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index e6cecf32d7..971a624c33 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -549,12 +549,13 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: Notes ----- - Supports dataset URIs of the forms ``ivo://rubin/butler_label/UUID`` + Supports dataset URIs of the forms + ``ivo://rubin.lsst/datasets?butler_label/UUID`` and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label - can include ``/`` and the leading ``/`` is always stripped. If the - repository label starts with ``/`` then it must be doubled up. e.g., + can include ``/`` and the trailing ``/`` before the UUID is always + stripped. - ivo://rubin//repo/main/82d79caa-0823-4300-9874-67b737367ee0 + ivo://rubin.lsst/datasets?/repo/main/UUID will return a label of ``/repo/main``. @@ -564,9 +565,9 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: parsed = urllib.parse.urlparse(uri) if parsed.scheme == "ivo": # TODO: Validate netloc component. - label, id_ = os.path.split(parsed.path) - # Strip the leading /. - label = label[1:] + if parsed.path != "/datasets": + raise ValueError(f"Unrecognized path in IVOID {uri}. Expected 'datasets' got {parsed.path!r}") + label, id_ = os.path.split(parsed.query) elif parsed.scheme == "butler": label = parsed.netloc # Need to strip the leading /. diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index ef8275e166..ad077c9ecb 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -912,10 +912,10 @@ def test_dataset_uris(self): factory = butler_factory.bind(access_token=None) for dataset_uri in ( - f"ivo://rubin/{config_dir}/{ref.id}", - f"ivo://rubin/{config_dir}/butler.yaml/{ref.id}", + f"ivo://rubin.lsst/datasets?{config_dir}/{ref.id}", + f"ivo://rubin.lsst/datasets?{config_dir}/butler.yaml/{ref.id}", f"butler://{label}/{ref.id}", - f"ivo://rubin/{label}/{ref.id}", + f"ivo://rubin.lsst/datasets?{label}/{ref.id}", ): ref2 = Butler.get_dataset_from_uri(dataset_uri) self.assertEqual(ref, ref2) @@ -929,14 +929,17 @@ def test_dataset_uris(self): self.assertIsNone(no_ref) # Test some failure modes. - with self.assertRaises(ValueError): - Butler.parse_dataset_uri("ivo://rubin/1234") - with self.assertRaises(ValueError): - Butler.parse_dataset_uri("butler://label/1234") - with self.assertRaises(ValueError): - Butler.parse_dataset_uri("butler://1234") - with self.assertRaises(ValueError): - Butler.parse_dataset_uri("https://something.edu/1234") + for dataset_uri in ( + "butler://label/1234", # Bad UUID. + "butler://1234", # No label. + "ivo://rubin/1234", # No query part and bad UUID and no label. + "ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0", # No query part. + "ivo://rubin/datasets?dr1/1234", # Bad UUID. + "ivo://rubin.lsst/butler?dr1/82d79caa-0823-4300-9874-67b737367ee0", # Not datasets. + "https://something.edu/1234", # Wrong scheme. + ): + with self.assertRaises(ValueError): + Butler.parse_dataset_uri(dataset_uri) class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase): From dd8c18426ac71225489fe77d888f860b0e46ddc1 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 26 Nov 2024 13:06:29 -0700 Subject: [PATCH 04/10] Change det_dataset_from_uri so it returns a Butler Otherwise there is no easy way for the caller to get the dataset: butler, ref = Butler.get_dataset_from_uri(uri) thing = butler.get(ref) --- python/lsst/daf/butler/_butler.py | 9 ++++++--- tests/test_simpleButler.py | 14 +++++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 971a624c33..27d55ce318 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -587,7 +587,7 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: @classmethod def get_dataset_from_uri( cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None - ) -> DatasetRef | None: + ) -> tuple[Butler, DatasetRef | None]: """Get the dataset associated with the given dataset URI. Parameters @@ -596,10 +596,13 @@ def get_dataset_from_uri( The URI associated with a dataset. factory : `LabeledButlerFactoryProtocol` or `None`, optional Bound factory function that will be given the butler label - and receive a `Butler`. + and receive a `Butler`. If this is not provided the label + will be tried directly. Returns ------- + butler : `Butler` + Butler object associated with this URI. ref : `DatasetRef` or `None` The dataset associated with that URI, or `None` if the UUID is valid but the dataset is not known to this butler. @@ -614,7 +617,7 @@ def get_dataset_from_uri( pass if butler is None: butler = cls.from_config(label) - return butler.get_dataset(dataset_id) + return butler, butler.get_dataset(dataset_id) @abstractmethod def _caching_context(self) -> AbstractContextManager[None]: diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index ad077c9ecb..012ff0c39a 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -917,15 +917,23 @@ def test_dataset_uris(self): f"butler://{label}/{ref.id}", f"ivo://rubin.lsst/datasets?{label}/{ref.id}", ): - ref2 = Butler.get_dataset_from_uri(dataset_uri) + new_butler, ref2 = Butler.get_dataset_from_uri(dataset_uri) self.assertEqual(ref, ref2) + # The returned butler needs to have the datastore mocked. + DatastoreMock.apply(new_butler) + dataset_id, _ = butler.get(ref2) + self.assertEqual(dataset_id, ref.id) - ref2 = Butler.get_dataset_from_uri(dataset_uri, factory=factory) + factory_butler, ref2 = Butler.get_dataset_from_uri(dataset_uri, factory=factory) self.assertEqual(ref, ref2) + # The returned butler needs to have the datastore mocked. + DatastoreMock.apply(factory_butler) + dataset_id, _ = factory_butler.get(ref2) + self.assertEqual(dataset_id, ref.id) # Non existent dataset. missing_id = str(ref.id).replace("2", "3") - no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") + _, no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") self.assertIsNone(no_ref) # Test some failure modes. From ea001a3715ee0fb81cb8e679ca4ba1d75487b755 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 3 Dec 2024 15:18:17 -0700 Subject: [PATCH 05/10] Change IVO parser to use from described in DMTN-302 --- python/lsst/daf/butler/_butler.py | 31 +++++++++++++++++++++---------- tests/test_simpleButler.py | 15 +++++++++------ 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 27d55ce318..74bb22ed16 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -29,7 +29,6 @@ __all__ = ["Butler"] -import os import urllib.parse import uuid from abc import abstractmethod @@ -550,30 +549,42 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: Notes ----- Supports dataset URIs of the forms - ``ivo://rubin.lsst/datasets?butler_label/UUID`` - and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label - can include ``/`` and the trailing ``/`` before the UUID is always - stripped. + ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see + DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is + deprecated and can not include ``/`` in the label string. ``ivo`` URIs + can include anything supported by the `Butler` constructor, including + paths to repositories and alias labels. - ivo://rubin.lsst/datasets?/repo/main/UUID + ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID will return a label of ``/repo/main``. This method does not attempt to check that the dataset exists in the labeled butler. + + Since the IVOID can be issued by any publisher to represent a Butler + dataset there is no validation of the path or netloc component of the + URI. The only requirement is that there are ``id`` and ``repo`` keys + in the ``ivo`` URI query component. """ parsed = urllib.parse.urlparse(uri) if parsed.scheme == "ivo": - # TODO: Validate netloc component. - if parsed.path != "/datasets": - raise ValueError(f"Unrecognized path in IVOID {uri}. Expected 'datasets' got {parsed.path!r}") - label, id_ = os.path.split(parsed.query) + # Do not validate the netloc or the path values. + qs = urllib.parse.parse_qs(parsed.query) + if "repo" not in qs or "id" not in qs: + raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.") + if len(qs["repo"]) != 1 or len(qs["id"]) != 1: + raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}") + label = qs["repo"][0] + id_ = qs["id"][0] elif parsed.scheme == "butler": label = parsed.netloc # Need to strip the leading /. id_ = parsed.path[1:] else: raise ValueError(f"Unrecognized URI scheme: {uri!r}") + # Strip trailing/leading whitespace from label. + label = label.strip() if not label: raise ValueError(f"No butler repository label found in uri {uri!r}") try: diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 012ff0c39a..5ad73ceab8 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -912,10 +912,10 @@ def test_dataset_uris(self): factory = butler_factory.bind(access_token=None) for dataset_uri in ( - f"ivo://rubin.lsst/datasets?{config_dir}/{ref.id}", - f"ivo://rubin.lsst/datasets?{config_dir}/butler.yaml/{ref.id}", + f"ivo://org.rubinobs/usdac/test?repo={config_dir}&id={ref.id}", + f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}/butler.yaml&id={ref.id}", f"butler://{label}/{ref.id}", - f"ivo://rubin.lsst/datasets?{label}/{ref.id}", + f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}", ): new_butler, ref2 = Butler.get_dataset_from_uri(dataset_uri) self.assertEqual(ref, ref2) @@ -939,11 +939,14 @@ def test_dataset_uris(self): # Test some failure modes. for dataset_uri in ( "butler://label/1234", # Bad UUID. - "butler://1234", # No label. + "butler://1234", # No UUID. + "butler:///1234", # No label. "ivo://rubin/1234", # No query part and bad UUID and no label. "ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0", # No query part. - "ivo://rubin/datasets?dr1/1234", # Bad UUID. - "ivo://rubin.lsst/butler?dr1/82d79caa-0823-4300-9874-67b737367ee0", # Not datasets. + "ivo://org.rubinobs/datasets?repo=dr1&id=1234", # Bad UUID. + "ivo://org.rubinobs/butler?release=dr1&id=82d79caa-0823-4300-9874-67b737367ee0", # No repo key. + "ivo://org.rubinobs/butler?repo=dr1&repo=dr2&id=82d79caa-0823-4300-9874-67b737367ee0", # 2 vals. + "ivo://org.rubinobs/something?repo=%20&id=82d79caa-0823-4300-9874-67b737367ee0", # no repo. "https://something.edu/1234", # Wrong scheme. ): with self.assertRaises(ValueError): From a5155b4883b1108a0919a823f3d750268d6e5225 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 3 Dec 2024 16:07:20 -0700 Subject: [PATCH 06/10] Add news fragment --- doc/changes/DM-47325.feature.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/changes/DM-47325.feature.rst diff --git a/doc/changes/DM-47325.feature.rst b/doc/changes/DM-47325.feature.rst new file mode 100644 index 0000000000..3fe2ac51b8 --- /dev/null +++ b/doc/changes/DM-47325.feature.rst @@ -0,0 +1,5 @@ +Added two new APIs for handling Butler dataset URIs. +``Butler.parse_dataset_uri`` parses a URI and returns the butler repository label and associated UUID. +``Butler.get_dataset_from_uri`` will parse a URI and attempt to retrieve the ``DatasetRef``. +URIs should be of the form IVOA identifiers as described in `DMTN-302 `_. +Deprecated ``butler://`` URIs are still supported but should not be used in new systems. From 3cd5774743f0bcfb4405b9aa055b10d357cfdd76 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 5 Dec 2024 13:50:31 -0700 Subject: [PATCH 07/10] Allow IVO:// and BUTLER:// URI schemes to be used The URI scheme and netloc are meant to be case insensitive. Since the butler URI scheme relies on butler labels which are not case-insensitive, the netloc is not downcased in that situation. --- python/lsst/daf/butler/_butler.py | 7 ++++--- python/lsst_daf_butler.dist-info/METADATA | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 python/lsst_daf_butler.dist-info/METADATA diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 74bb22ed16..382ea4871e 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -568,7 +568,8 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: in the ``ivo`` URI query component. """ parsed = urllib.parse.urlparse(uri) - if parsed.scheme == "ivo": + parsed_scheme = parsed.scheme.lower() + if parsed_scheme == "ivo": # Do not validate the netloc or the path values. qs = urllib.parse.parse_qs(parsed.query) if "repo" not in qs or "id" not in qs: @@ -577,8 +578,8 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}") label = qs["repo"][0] id_ = qs["id"][0] - elif parsed.scheme == "butler": - label = parsed.netloc + elif parsed_scheme == "butler": + label = parsed.netloc # Butler label is case sensitive. # Need to strip the leading /. id_ = parsed.path[1:] else: diff --git a/python/lsst_daf_butler.dist-info/METADATA b/python/lsst_daf_butler.dist-info/METADATA new file mode 100644 index 0000000000..ca0bc054af --- /dev/null +++ b/python/lsst_daf_butler.dist-info/METADATA @@ -0,0 +1,3 @@ +Metadata-Version: 1.0 +Name: lsst-daf-butler +Version: g57cedf6216+76f9c43fa5 From 85c354567a23a1fa8562fcf6ba3d46a11b5f8807 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 5 Dec 2024 13:53:19 -0700 Subject: [PATCH 08/10] Ensure that URI parsing test unquotes The %2F becomes a / --- tests/test_simpleButler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 5ad73ceab8..be1153294f 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -913,7 +913,7 @@ def test_dataset_uris(self): for dataset_uri in ( f"ivo://org.rubinobs/usdac/test?repo={config_dir}&id={ref.id}", - f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}/butler.yaml&id={ref.id}", + f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}%2Fbutler.yaml&id={ref.id}", f"butler://{label}/{ref.id}", f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}", ): From 970e68bc201f7b55bf8ce04025521e014c21c0e5 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 5 Dec 2024 13:57:39 -0700 Subject: [PATCH 09/10] Add docstring for bind method --- python/lsst/daf/butler/_labeled_butler_factory.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/lsst/daf/butler/_labeled_butler_factory.py b/python/lsst/daf/butler/_labeled_butler_factory.py index 3941722527..141ddc0eee 100644 --- a/python/lsst/daf/butler/_labeled_butler_factory.py +++ b/python/lsst/daf/butler/_labeled_butler_factory.py @@ -91,6 +91,21 @@ def __init__(self, repositories: Mapping[str, str] | None = None) -> None: self._preload_direct_butler_cache = True def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol: + """Create a callable factory function for generating Butler instances + with out needing to specify access tokans again. + + Parameters + ---------- + access_token : `str` or `None` + An optional access token to use for authentication with the Butler. + + Returns + ------- + bound : `LabeledButlerFactoryProtocol` + A callable that takes a label as input and returns a Butler + instance. + """ + def create(label: str) -> Butler: return self.create_butler(label=label, access_token=access_token) From 2cbb8f05adcca8785d63becc6fdfaa0f5fee9ba7 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 5 Dec 2024 15:23:49 -0700 Subject: [PATCH 10/10] Return dataclasses rather than tuple --- python/lsst/daf/butler/_butler.py | 43 +++++++++++++++++++------------ tests/test_simpleButler.py | 20 +++++++------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 382ea4871e..dcb978589e 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -29,6 +29,7 @@ __all__ = ["Butler"] +import dataclasses import urllib.parse import uuid from abc import abstractmethod @@ -74,6 +75,19 @@ _LOG = getLogger(__name__) +@dataclasses.dataclass +class ParsedButlerDatasetURI: + label: str + dataset_id: uuid.UUID + uri: str + + +@dataclasses.dataclass +class SpecificButlerDataset: + butler: Butler + dataset: DatasetRef | None + + class Butler(LimitedButler): # numpydoc ignore=PR02 """Interface for data butler and factory for Butler instances. @@ -530,7 +544,7 @@ def get_known_repos(cls) -> set[str]: return ButlerRepoIndex.get_known_repos() @classmethod - def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: + def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI: """Extract the butler label and dataset ID from a dataset URI. Parameters @@ -540,11 +554,9 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: Returns ------- - label : `str` + parsed : `ParsedButlerDatasetURI` The label associated with the butler repository from which this - dataset originates. - dataset_id : `DatasetId` - The ID of the dataset. + dataset originates and the ID of the dataset. Notes ----- @@ -594,12 +606,12 @@ def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]: e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}") raise - return label, dataset_id + return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri) @classmethod def get_dataset_from_uri( cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None - ) -> tuple[Butler, DatasetRef | None]: + ) -> SpecificButlerDataset: """Get the dataset associated with the given dataset URI. Parameters @@ -613,23 +625,22 @@ def get_dataset_from_uri( Returns ------- - butler : `Butler` - Butler object associated with this URI. - ref : `DatasetRef` or `None` - The dataset associated with that URI, or `None` if the UUID - is valid but the dataset is not known to this butler. + result : `SpecificButlerDataset` + The butler associated with this URI and the dataset itself. + The dataset can be `None` if the UUID is valid but the dataset + is not known to this butler. """ - label, dataset_id = cls.parse_dataset_uri(uri) + parsed = cls.parse_dataset_uri(uri) butler: Butler | None = None if factory is not None: # If the label is not recognized, it might be a path. try: - butler = factory(label) + butler = factory(parsed.label) except KeyError: pass if butler is None: - butler = cls.from_config(label) - return butler, butler.get_dataset(dataset_id) + butler = cls.from_config(parsed.label) + return SpecificButlerDataset(butler=butler, dataset=butler.get_dataset(parsed.dataset_id)) @abstractmethod def _caching_context(self) -> AbstractContextManager[None]: diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index be1153294f..1ffa7941ee 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -917,24 +917,24 @@ def test_dataset_uris(self): f"butler://{label}/{ref.id}", f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}", ): - new_butler, ref2 = Butler.get_dataset_from_uri(dataset_uri) - self.assertEqual(ref, ref2) + result = Butler.get_dataset_from_uri(dataset_uri) + self.assertEqual(result.dataset, ref) # The returned butler needs to have the datastore mocked. - DatastoreMock.apply(new_butler) - dataset_id, _ = butler.get(ref2) + DatastoreMock.apply(result.butler) + dataset_id, _ = result.butler.get(result.dataset) self.assertEqual(dataset_id, ref.id) - factory_butler, ref2 = Butler.get_dataset_from_uri(dataset_uri, factory=factory) - self.assertEqual(ref, ref2) + factory_result = Butler.get_dataset_from_uri(dataset_uri, factory=factory) + self.assertEqual(factory_result.dataset, ref) # The returned butler needs to have the datastore mocked. - DatastoreMock.apply(factory_butler) - dataset_id, _ = factory_butler.get(ref2) + DatastoreMock.apply(factory_result.butler) + dataset_id, _ = factory_result.butler.get(factory_result.dataset) self.assertEqual(dataset_id, ref.id) # Non existent dataset. missing_id = str(ref.id).replace("2", "3") - _, no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") - self.assertIsNone(no_ref) + result = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}") + self.assertIsNone(result.dataset) # Test some failure modes. for dataset_uri in (