Skip to content

Commit

Permalink
Merge pull request #1113 from lsst/tickets/DM-47325
Browse files Browse the repository at this point in the history
DM-47325: Add API for parsing butler dataset URIs (butler and ivo)
  • Loading branch information
timj authored Dec 6, 2024
2 parents d632886 + 2cbb8f0 commit d66c8bb
Show file tree
Hide file tree
Showing 5 changed files with 220 additions and 2 deletions.
5 changes: 5 additions & 0 deletions doc/changes/DM-47325.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Added two new APIs for handling Butler dataset URIs.
``Butler.parse_dataset_uri`` parses a URI and returns the butler repository label and associated UUID.
``Butler.get_dataset_from_uri`` will parse a URI and attempt to retrieve the ``DatasetRef``.
URIs should take the form of IVOA identifiers as described in `DMTN-302 <https://dmtn-302.lsst.io>`_.
Deprecated ``butler://`` URIs are still supported but should not be used in new systems.
116 changes: 116 additions & 0 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

__all__ = ["Butler"]

import dataclasses
import urllib.parse
import uuid
from abc import abstractmethod
from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager
Expand Down Expand Up @@ -60,6 +63,7 @@
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._labeled_butler_factory import LabeledButlerFactoryProtocol
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs
Expand All @@ -71,6 +75,19 @@
_LOG = getLogger(__name__)


@dataclasses.dataclass
class ParsedButlerDatasetURI:
    """Components extracted from a butler dataset URI.

    Attributes
    ----------
    label : `str`
        Label of the butler repository named by the URI.
    dataset_id : `uuid.UUID`
        Unique identifier of the dataset named by the URI.
    uri : `str`
        The URI string that was parsed.
    """

    # Field order is part of the positional constructor signature.
    label: str
    dataset_id: uuid.UUID
    uri: str


@dataclasses.dataclass
class SpecificButlerDataset:
    """A butler paired with a dataset looked up in that butler.

    Attributes
    ----------
    butler : `Butler`
        The butler that was used for the lookup.
    dataset : `DatasetRef` or `None`
        The dataset found in ``butler``, or `None` if there is none to
        report.
    """

    # Field order is part of the positional constructor signature.
    butler: Butler
    dataset: DatasetRef | None


class Butler(LimitedButler): # numpydoc ignore=PR02
"""Interface for data butler and factory for Butler instances.
Expand Down Expand Up @@ -526,6 +543,105 @@ def get_known_repos(cls) -> set[str]:
"""
return ButlerRepoIndex.get_known_repos()

@classmethod
def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
    """Split a dataset URI into its butler label and dataset ID.

    Parameters
    ----------
    uri : `str`
        The dataset URI to parse.

    Returns
    -------
    parsed : `ParsedButlerDatasetURI`
        The label of the butler repository the dataset comes from, the
        ID of the dataset, and the URI that was given.

    Raises
    ------
    ValueError
        Raised if the scheme is neither ``ivo`` nor ``butler``, if an
        ``ivo`` URI lacks exactly one ``repo`` and one ``id`` query value,
        if the label is empty after stripping whitespace, or if the
        dataset ID string is not a valid UUID.

    Notes
    -----
    Two forms of dataset URI are understood:

    * ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
      DMTN-302).
    * ``butler://butler_label/UUID``. This form is deprecated and can not
      include ``/`` in the label string.

    The ``repo`` value of an ``ivo`` URI can be anything supported by the
    `Butler` constructor, including paths to repositories and alias
    labels. For example::

        ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID

    yields a label of ``/repo/main``.

    No attempt is made to check that the dataset exists in the labeled
    butler.

    Because an IVOID can be issued by any publisher to represent a Butler
    dataset, the netloc and path components of an ``ivo`` URI are not
    validated. The only requirement is that the query component has
    ``id`` and ``repo`` keys.
    """
    components = urllib.parse.urlparse(uri)
    scheme = components.scheme.lower()

    if scheme not in ("ivo", "butler"):
        raise ValueError(f"Unrecognized URI scheme: {uri!r}")

    if scheme == "butler":
        # The label is used as given because butler labels are case
        # sensitive. The path always starts with "/" so drop it.
        raw_label = components.netloc
        raw_id = components.path[1:]
    else:
        # Only the query is inspected; netloc and path are publisher
        # specific and deliberately ignored.
        query = urllib.parse.parse_qs(components.query)
        if not ("repo" in query and "id" in query):
            raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
        repos = query["repo"]
        ids = query["id"]
        if not (len(repos) == 1 and len(ids) == 1):
            raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
        raw_label = repos[0]
        raw_id = ids[0]

    # Surrounding whitespace is never part of a label.
    label = raw_label.strip()
    if not label:
        raise ValueError(f"No butler repository label found in uri {uri!r}")

    try:
        dataset_id = uuid.UUID(hex=raw_id)
    except Exception as err:
        # Re-raise the original exception, annotated with the input.
        err.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {raw_id!r}")
        raise

    return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)

@classmethod
def get_dataset_from_uri(
    cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None
) -> SpecificButlerDataset:
    """Look up the dataset that a dataset URI refers to.

    Parameters
    ----------
    uri : `str`
        The URI associated with a dataset.
    factory : `LabeledButlerFactoryProtocol` or `None`, optional
        Bound factory function that is called with the butler label and
        returns a `Butler`. If it is not provided, or if it raises
        `KeyError` for the label, the label is instead passed directly
        to `Butler.from_config`.

    Returns
    -------
    result : `SpecificButlerDataset`
        The butler associated with this URI and the dataset itself.
        The dataset can be `None` if the UUID is valid but the dataset
        is not known to this butler.
    """
    parsed_uri = cls.parse_dataset_uri(uri)
    repo_label = parsed_uri.label

    repo_butler: Butler | None
    try:
        repo_butler = None if factory is None else factory(repo_label)
    except KeyError:
        # The factory does not recognize the label; it might be a path,
        # so fall through to direct construction below.
        repo_butler = None

    if repo_butler is None:
        repo_butler = cls.from_config(repo_label)

    return SpecificButlerDataset(
        butler=repo_butler,
        dataset=repo_butler.get_dataset(parsed_uri.dataset_id),
    )

@abstractmethod
def _caching_context(self) -> AbstractContextManager[None]:
"""Context manager that enables caching."""
Expand Down
30 changes: 29 additions & 1 deletion python/lsst/daf/butler/_labeled_butler_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("LabeledButlerFactory",)
__all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")

from collections.abc import Callable, Mapping
from typing import Protocol

from lsst.resources import ResourcePathExpression

Expand All @@ -42,6 +43,12 @@
instance."""


class LabeledButlerFactoryProtocol(Protocol):
    """Structural type for a callable that retrieves a butler from a label."""

    def __call__(self, label: str) -> Butler:
        """Return the butler for the given label.

        Parameters
        ----------
        label : `str`
            Label identifying the butler to return.

        Returns
        -------
        butler : `Butler`
            The butler associated with ``label``.
        """
        ...


class LabeledButlerFactory:
"""Factory for efficiently instantiating Butler instances from the
repository index file. This is intended for use from long-lived services
Expand Down Expand Up @@ -83,6 +90,27 @@ def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
# This may be overridden by unit tests.
self._preload_direct_butler_cache = True

def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
    """Return a callable that creates Butler instances without needing
    the access token to be specified again.

    Parameters
    ----------
    access_token : `str` or `None`
        An optional access token to use for authentication with the Butler.

    Returns
    -------
    bound : `LabeledButlerFactoryProtocol`
        A callable that takes a label as input and returns a Butler
        instance.
    """
    # Look the method up once; the closure keeps this factory and the
    # token alive for as long as the returned callable exists.
    make_butler = self.create_butler

    def create(label: str) -> Butler:
        return make_butler(label=label, access_token=access_token)

    return create

def create_butler(self, *, label: str, access_token: str | None) -> Butler:
"""Create a Butler instance.
Expand Down
3 changes: 3 additions & 0 deletions python/lsst_daf_butler.dist-info/METADATA
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Metadata-Version: 1.0
Name: lsst-daf-butler
Version: g57cedf6216+76f9c43fa5
68 changes: 67 additions & 1 deletion tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,14 @@
DatasetId,
DatasetRef,
DatasetType,
LabeledButlerFactory,
StorageClass,
Timespan,
)
from lsst.daf.butler.datastore.file_templates import FileTemplate
from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
from lsst.daf.butler.tests import DatastoreMock
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir

try:
from lsst.daf.butler.tests.server import create_test_server
Expand Down Expand Up @@ -882,10 +883,75 @@ def makeButler(self, writeable: bool = False) -> Butler:
registryConfig = RegistryConfig(config.get("registry"))
_RegistryFactory(registryConfig).create_from_config()

# Write the YAML file so that some tests can recreate butler from it.
config.dumpToUri(os.path.join(self.root, "butler.yaml"))
butler = Butler.from_config(config, writeable=writeable)
DatastoreMock.apply(butler)
return butler

def test_dataset_uris(self):
    """Check that dataset URIs can be parsed and their datasets retrieved."""
    butler = self.makeButler(writeable=True)
    for import_file in ("base.yaml", self.datasetsImportFile):
        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", import_file))

    butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
    ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
    self.assertIsInstance(ref, DatasetRef)

    # The butler root doubles as the path form of the repository.
    repo_root = butler._config["root"]
    label = "test_repo"

    # Publish the repository under a label through a temporary index file
    # so that both labels and paths can be resolved.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
        index_file.write(f"{label}: {repo_root}\n")
        index_file.flush()
        with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
            factory = LabeledButlerFactory().bind(access_token=None)

            resolvable_uris = (
                f"ivo://org.rubinobs/usdac/test?repo={repo_root}&id={ref.id}",
                f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={repo_root}%2Fbutler.yaml&id={ref.id}",
                f"butler://{label}/{ref.id}",
                f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}",
            )
            for dataset_uri in resolvable_uris:
                # First resolve the label directly, then via the factory.
                for bound_factory in (None, factory):
                    result = Butler.get_dataset_from_uri(dataset_uri, factory=bound_factory)
                    self.assertEqual(result.dataset, ref)
                    # The returned butler needs to have the datastore mocked.
                    DatastoreMock.apply(result.butler)
                    dataset_id, _ = result.butler.get(result.dataset)
                    self.assertEqual(dataset_id, ref.id)

            # A well-formed UUID that does not match any dataset.
            missing_id = str(ref.id).replace("2", "3")
            result = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}")
            self.assertIsNone(result.dataset)

    # URIs that must be rejected by the parser.
    malformed_uris = (
        "butler://label/1234",  # Bad UUID.
        "butler://1234",  # No UUID.
        "butler:///1234",  # No label.
        "ivo://rubin/1234",  # No query part and bad UUID and no label.
        "ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0",  # No query part.
        "ivo://org.rubinobs/datasets?repo=dr1&id=1234",  # Bad UUID.
        "ivo://org.rubinobs/butler?release=dr1&id=82d79caa-0823-4300-9874-67b737367ee0",  # No repo key.
        "ivo://org.rubinobs/butler?repo=dr1&repo=dr2&id=82d79caa-0823-4300-9874-67b737367ee0",  # 2 vals.
        "ivo://org.rubinobs/something?repo=%20&id=82d79caa-0823-4300-9874-67b737367ee0",  # no repo.
        "https://something.edu/1234",  # Wrong scheme.
    )
    for dataset_uri in malformed_uris:
        with self.assertRaises(ValueError):
            Butler.parse_dataset_uri(dataset_uri)


class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
"""Run tests against DirectButler implementation using the
Expand Down

0 comments on commit d66c8bb

Please sign in to comment.