From 3d42102151188b33e7d1d5349edcb0e3512804e4 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Mon, 11 Nov 2024 14:34:08 -0800 Subject: [PATCH 1/5] Fix SQL type of the run column in case it is set to NULL. --- .../datasets/byDimensions/_manager.py | 4 +- .../lsst/daf/butler/tests/butler_queries.py | 37 +++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py index ca75ef5a5e..9d28b7d45b 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py @@ -1635,7 +1635,7 @@ def _get_calibs_table(self, table: DynamicTables) -> sqlalchemy.Table: def _create_case_expression_for_collections( collections: Iterable[CollectionRecord], id_column: sqlalchemy.ColumnElement -) -> sqlalchemy.Case | sqlalchemy.Null: +) -> sqlalchemy.ColumnElement: """Return a SQLAlchemy Case expression that converts collection IDs to collection names for the given set of collections. @@ -1661,6 +1661,6 @@ def _create_case_expression_for_collections( # cases, e.g. we start with a list of valid collections but they are # all filtered out by higher-level code on the basis of collection # summaries. - return sqlalchemy.null() + return sqlalchemy.cast(sqlalchemy.null(), sqlalchemy.String) return sqlalchemy.case(mapping, value=id_column) diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py index 4c1367fcf7..08007e4d3a 100644 --- a/python/lsst/daf/butler/tests/butler_queries.py +++ b/python/lsst/daf/butler/tests/butler_queries.py @@ -1334,6 +1334,43 @@ def test_materialization_find_first(self) -> None: _ = list(m_query.datasets("skyMap", collections)) _ = list(m_query.datasets("calexp", collections)) + def test_materialization_no_results(self) -> None: + """Test querying for datasets when materialized table is empty.""" + butler = self.make_butler("ci_hsc-subset.yaml", "ci_hsc-subset-skymap.yaml") + + run = "HSC/runs/ci_hsc/20240806T180642Z" + + # Register a dataset type but do not add any datasets. + butler.registry.registerDatasetType( + DatasetType("nothing", ["visit", "detector"], "int", universe=butler.dimensions) + ) + + collections = [run] + with butler.query() as query: + query = query.join_dimensions( + [ + "instrument", + "physical_filter", + "band", + "visit", + "detector", + "day_obs", + "skymap", + "tract", + ] + ) + query = query.join_dataset_search("calexp", collections) + query = query.join_dataset_search("nothing", collections) + query = query.where({}, "instrument='HSC' AND skymap='discrete/ci_hsc'", bind=None) + no_results = "\n".join(query.explain_no_results()) + self.assertIn("No datasets of type 'nothing'", no_results) + + m_query = query.materialize() + result = m_query.datasets("nothing") + self.assertFalse(result.any()) + no_results = "\n".join(result.explain_no_results()) + self.assertIn("No datasets of type 'nothing'", no_results) + def test_timespan_results(self) -> None: """Test returning dimension records that include timespans.""" butler = self.make_butler("base.yaml", "spatial.yaml") From 488a32da90f6f9ef8d253e24a10a135ec595ad58 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Thu, 21 Nov 2024 09:44:32 -0800 Subject: [PATCH 2/5] Skip some dataset searches joins with materialized queries. 
When building queries on top of materialized tables we can avoid joining dataset searches that are already included in materializations, except when we need something from tags/calibs tables. --- .../daf/butler/direct_query_driver/_driver.py | 23 +++++++++++++------ python/lsst/daf/butler/queries/_query.py | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/direct_query_driver/_driver.py b/python/lsst/daf/butler/direct_query_driver/_driver.py index 1121327360..0862c8be35 100644 --- a/python/lsst/daf/butler/direct_query_driver/_driver.py +++ b/python/lsst/daf/butler/direct_query_driver/_driver.py @@ -799,13 +799,22 @@ def apply_initial_query_joins( select_builder.joins, materialization_key, materialization_dimensions ) ) - # Process dataset joins (not including any union dataset). - for dataset_search in joins_analysis.datasets.values(): - self.join_dataset_search( - select_builder.joins, - dataset_search, - joins_analysis.columns.dataset_fields[dataset_search.name], - ) + # Process dataset joins (not including any union dataset). Datasets + # searches included in materialization can be skipped unless we need + # something from their tables. + materialized_datasets = set() + for m_state in self._materializations.values(): + materialized_datasets.update(m_state.datasets) + for dataset_type_name, dataset_search in joins_analysis.datasets.items(): + if ( + dataset_type_name not in materialized_datasets + or dataset_type_name in select_builder.columns.dataset_fields + ): + self.join_dataset_search( + select_builder.joins, + dataset_search, + joins_analysis.columns.dataset_fields[dataset_search.name], + ) # Join in dimension element tables that we know we need relationships # or columns from. for element in joins_analysis.iter_mandatory(union_dataset_dimensions): diff --git a/python/lsst/daf/butler/queries/_query.py b/python/lsst/daf/butler/queries/_query.py index 75f8d8bfe7..086ad7b3cf 100644 --- a/python/lsst/daf/butler/queries/_query.py +++ b/python/lsst/daf/butler/queries/_query.py @@ -508,7 +508,7 @@ def materialize( "Expand the dimensions or drop this dataset type in the arguments to materialize to " "avoid this error." ) - tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name]) + tree = tree.join_dataset(dataset_type_name, dataset_search) return Query(self._driver, tree) def join_dataset_search( From e9c7ef5e70184efb9834827b6c3401ff22924914 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Thu, 9 Jan 2025 14:49:34 -0800 Subject: [PATCH 3/5] Set cursor_tuple_fraction parameter for Postgres --- .../butler/registry/databases/postgresql.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/lsst/daf/butler/registry/databases/postgresql.py b/python/lsst/daf/butler/registry/databases/postgresql.py index cc60bac5b6..ec99f56e73 100644 --- a/python/lsst/daf/butler/registry/databases/postgresql.py +++ b/python/lsst/daf/butler/registry/databases/postgresql.py @@ -246,20 +246,21 @@ def _transaction( # PostgreSQL actually considers SET TRANSACTION to be a # fundamentally different statement from SET (they have their # own distinct doc pages, at least). - if not (self.isWriteable() or for_temp_tables): + with closing(connection.connection.cursor()) as cursor: # PostgreSQL permits writing to temporary tables inside # read-only transactions, but it doesn't permit creating # them. 
- with closing(connection.connection.cursor()) as cursor: + if not (self.isWriteable() or for_temp_tables): cursor.execute("SET TRANSACTION READ ONLY") - cursor.execute("SET TIME ZONE 0") - else: - with closing(connection.connection.cursor()) as cursor: - # Make timestamps UTC, because we didn't use TIMESTAMPZ - # for the column type. When we can tolerate a schema - # change, we should change that type and remove this - # line. - cursor.execute("SET TIME ZONE 0") + # Make timestamps UTC, because we didn't use TIMESTAMPZ + # for the column type. When we can tolerate a schema + # change, we should change that type and remove this + # line. + cursor.execute("SET TIME ZONE 0") + # Using server-side cursors with complex queries frequently + # generates suboptimal query plan, setting + # cursor_tuple_fraction=1 helps for those cases. + cursor.execute("SET cursor_tuple_fraction = 1") yield is_new, connection @contextmanager From 1eab6b0e5144fe0b4649ef1ec9215b480f23dec9 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Wed, 22 Jan 2025 13:44:31 -0800 Subject: [PATCH 4/5] Add indices to temporary tables. The indices are created on combinations of dimensions. This should help with the performance of follow-up queries in graph builder. --- .../daf/butler/direct_query_driver/_driver.py | 4 ++- .../direct_query_driver/_sql_builders.py | 26 +++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/python/lsst/daf/butler/direct_query_driver/_driver.py b/python/lsst/daf/butler/direct_query_driver/_driver.py index 0862c8be35..faf1586e17 100644 --- a/python/lsst/daf/butler/direct_query_driver/_driver.py +++ b/python/lsst/daf/butler/direct_query_driver/_driver.py @@ -311,7 +311,9 @@ def materialize( # sql_select, _ = plan.finish_select(return_columns=False) table = self._exit_stack.enter_context( - self.db.temporary_table(make_table_spec(plan.final_columns, self.db, plan.postprocessing)) + self.db.temporary_table( + make_table_spec(plan.final_columns, self.db, plan.postprocessing, make_indices=True) + ) ) self.db.insert(table, select=sql_select) if key is None: diff --git a/python/lsst/daf/butler/direct_query_driver/_sql_builders.py b/python/lsst/daf/butler/direct_query_driver/_sql_builders.py index 1a537d5d28..3f0d0dba32 100644 --- a/python/lsst/daf/butler/direct_query_driver/_sql_builders.py +++ b/python/lsst/daf/butler/direct_query_driver/_sql_builders.py @@ -37,6 +37,8 @@ import sqlalchemy from .. import ddl +from ..dimensions import DimensionGroup +from ..dimensions._group import SortedSequenceSet from ..nonempty_mapping import NonemptyMapping from ..queries import tree as qt from ._postprocessing import Postprocessing @@ -638,7 +640,7 @@ def to_select_builder( def make_table_spec( - columns: qt.ColumnSet, db: Database, postprocessing: Postprocessing | None + columns: qt.ColumnSet, db: Database, postprocessing: Postprocessing | None, *, make_indices: bool = False ) -> ddl.TableSpec: """Make a specification that can be used to create a table to store this query's outputs. @@ -652,6 +654,8 @@ def make_table_spec( postprocessing : `Postprocessing` Struct representing post-query processing in Python, which may require additional columns in the query results. + make_indices : `bool`, optional + If `True` add indices for groups of columns. Returns ------- @@ -659,11 +663,13 @@ def make_table_spec( Table specification for this query's result columns (including those from `postprocessing` and `SqlJoinsBuilder.special`). 
""" + indices = _make_table_indices(columns.dimensions) if make_indices else [] results = ddl.TableSpec( [ columns.get_column_spec(logical_table, field).to_sql_spec(name_shrinker=db.name_shrinker) for logical_table, field in columns - ] + ], + indexes=indices, ) if postprocessing: for element in postprocessing.iter_missing(columns): @@ -679,3 +685,19 @@ def make_table_spec( ddl.FieldSpec(name=SqlSelectBuilder.EMPTY_COLUMNS_NAME, dtype=SqlSelectBuilder.EMPTY_COLUMNS_TYPE) ) return results + + +def _make_table_indices(dimensions: DimensionGroup) -> list[ddl.IndexSpec]: + + index_columns: list[SortedSequenceSet] = [] + for dimension in dimensions.required: + minimal_group = dimensions.universe[dimension].minimal_group.required + + for idx in range(len(index_columns)): + if index_columns[idx] <= minimal_group: + index_columns[idx] = minimal_group + break + else: + index_columns.append(minimal_group) + + return [ddl.IndexSpec(*columns) for columns in index_columns] From 5903c60979dc37c0767784ecc7abbc023c326551 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Tue, 28 Jan 2025 14:29:57 -0800 Subject: [PATCH 5/5] Add Query option to suppress DISTINCT in skypix overlaps. This is a non-public API for now, solely for graph builder use. --- .../daf/butler/direct_query_driver/_driver.py | 20 +++++++++++++---- python/lsst/daf/butler/queries/_query.py | 22 ++++++++++++++++--- python/lsst/daf/butler/queries/driver.py | 4 ++++ .../lsst/daf/butler/queries/result_specs.py | 5 +++++ .../daf/butler/registry/dimensions/static.py | 10 ++++++--- .../butler/registry/interfaces/_dimensions.py | 4 ++++ .../daf/butler/remote_butler/_query_driver.py | 2 ++ .../server/handlers/_external_query.py | 1 + .../daf/butler/remote_butler/server_models.py | 1 + .../lsst/daf/butler/tests/butler_queries.py | 20 +++++++++++++++++ tests/test_query_interface.py | 1 + 11 files changed, 80 insertions(+), 10 deletions(-) diff --git a/python/lsst/daf/butler/direct_query_driver/_driver.py b/python/lsst/daf/butler/direct_query_driver/_driver.py index faf1586e17..486e85e8ea 100644 --- a/python/lsst/daf/butler/direct_query_driver/_driver.py +++ b/python/lsst/daf/butler/direct_query_driver/_driver.py @@ -217,6 +217,7 @@ def execute(self, result_spec: ResultSpec, tree: qt.QueryTree) -> Iterator[Resul final_columns=result_spec.get_result_columns(), order_by=result_spec.order_by, find_first_dataset=result_spec.find_first_dataset, + allow_duplicate_overlaps=result_spec.allow_duplicate_overlaps, ) sql_select, sql_columns = builder.finish_select() if result_spec.order_by: @@ -290,12 +291,15 @@ def materialize( tree: qt.QueryTree, dimensions: DimensionGroup, datasets: frozenset[str], + allow_duplicate_overlaps: bool = False, key: qt.MaterializationKey | None = None, ) -> qt.MaterializationKey: # Docstring inherited. if self._exit_stack is None: raise RuntimeError("QueryDriver context must be entered before 'materialize' is called.") - plan = self.build_query(tree, qt.ColumnSet(dimensions)) + plan = self.build_query( + tree, qt.ColumnSet(dimensions), allow_duplicate_overlaps=allow_duplicate_overlaps + ) # Current implementation ignores 'datasets' aside from remembering # them, because figuring out what to put in the temporary table for # them is tricky, especially if calibration collections are involved. @@ -403,7 +407,7 @@ def count( def any(self, tree: qt.QueryTree, *, execute: bool, exact: bool) -> bool: # Docstring inherited. 
- builder = self.build_query(tree, qt.ColumnSet(tree.dimensions)) + builder = self.build_query(tree, qt.ColumnSet(tree.dimensions), allow_duplicate_overlaps=True) if not all(d.collection_records for d in builder.joins_analysis.datasets.values()): return False if not execute: @@ -449,6 +453,7 @@ def build_query( order_by: Iterable[qt.OrderExpression] = (), find_first_dataset: str | qt.AnyDatasetType | None = None, analyze_only: bool = False, + allow_duplicate_overlaps: bool = False, ) -> QueryBuilder: """Convert a query description into a nearly-complete builder object for the SQL version of that query. @@ -472,6 +477,9 @@ def build_query( builder, but do not call methods that build its SQL form. This can be useful for obtaining diagnostic information about the query that would be generated. + allow_duplicate_overlaps : `bool`, optional + If set to `True` then query will be allowed to generate + non-distinct rows for spatial overlaps. Returns ------- @@ -544,7 +552,7 @@ def build_query( # SqlSelectBuilder and Postprocessing with spatial/temporal constraints # potentially transformed by the dimensions manager (but none of the # rest of the analysis reflected in that SqlSelectBuilder). - query_tree_analysis = self._analyze_query_tree(tree) + query_tree_analysis = self._analyze_query_tree(tree, allow_duplicate_overlaps) # The "projection" columns differ from the final columns by not # omitting any dimension keys (this keeps queries for different result # types more similar during construction), including any columns needed @@ -591,7 +599,7 @@ def build_query( builder.apply_find_first(self) return builder - def _analyze_query_tree(self, tree: qt.QueryTree) -> QueryTreeAnalysis: + def _analyze_query_tree(self, tree: qt.QueryTree, allow_duplicate_overlaps: bool) -> QueryTreeAnalysis: """Analyze a `.queries.tree.QueryTree` as the first step in building a SQL query. @@ -605,6 +613,9 @@ def _analyze_query_tree(self, tree: qt.QueryTree) -> QueryTreeAnalysis: tree_analysis : `QueryTreeAnalysis` Struct containing additional information need to build the joins stage of a query. + allow_duplicate_overlaps : `bool`, optional + If set to `True` then query will be allowed to generate + non-distinct rows for spatial overlaps. Notes ----- @@ -634,6 +645,7 @@ def _analyze_query_tree(self, tree: qt.QueryTree) -> QueryTreeAnalysis: tree.predicate, tree.get_joined_dimension_groups(), collection_analysis.calibration_dataset_types, + allow_duplicate_overlaps, ) # Extract the data ID implied by the predicate; we can use the governor # dimensions in that to constrain the collections we search for diff --git a/python/lsst/daf/butler/queries/_query.py b/python/lsst/daf/butler/queries/_query.py index 086ad7b3cf..0471b3efc9 100644 --- a/python/lsst/daf/butler/queries/_query.py +++ b/python/lsst/daf/butler/queries/_query.py @@ -106,6 +106,12 @@ def __init__(self, driver: QueryDriver, tree: QueryTree | None = None): tree = make_identity_query_tree(driver.universe) super().__init__(driver, tree) + # If ``_allow_duplicate_overlaps`` is set to `True` then query will be + # allowed to generate non-distinct rows for spatial overlaps. This is + # not a part of public API for now, to be used by graph builder as + # optimization. + self._allow_duplicate_overlaps: bool = False + @property def constraint_dataset_types(self) -> Set[str]: """The names of all dataset types joined into the query. 
@@ -218,7 +224,11 @@ def data_ids( dimensions = self._driver.universe.conform(dimensions) if not dimensions <= self._tree.dimensions: tree = tree.join_dimensions(dimensions) - result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False) + result_spec = DataCoordinateResultSpec( + dimensions=dimensions, + include_dimension_records=False, + allow_duplicate_overlaps=self._allow_duplicate_overlaps, + ) return DataCoordinateQueryResults(self._driver, tree, result_spec) def datasets( @@ -284,6 +294,7 @@ def datasets( storage_class_name=storage_class_name, include_dimension_records=False, find_first=find_first, + allow_duplicate_overlaps=self._allow_duplicate_overlaps, ) return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec) @@ -308,7 +319,9 @@ def dimension_records(self, element: str) -> DimensionRecordQueryResults: tree = self._tree if element not in tree.dimensions.elements: tree = tree.join_dimensions(self._driver.universe[element].minimal_group) - result_spec = DimensionRecordResultSpec(element=self._driver.universe[element]) + result_spec = DimensionRecordResultSpec( + element=self._driver.universe[element], allow_duplicate_overlaps=self._allow_duplicate_overlaps + ) return DimensionRecordQueryResults(self._driver, tree, result_spec) def general( @@ -445,6 +458,7 @@ def general( dimension_fields=dimension_fields_dict, dataset_fields=dataset_fields_dict, find_first=find_first, + allow_duplicate_overlaps=self._allow_duplicate_overlaps, ) return GeneralQueryResults(self._driver, tree=tree, spec=result_spec) @@ -495,7 +509,9 @@ def materialize( dimensions = self._tree.dimensions else: dimensions = self._driver.universe.conform(dimensions) - key = self._driver.materialize(self._tree, dimensions, datasets) + key = self._driver.materialize( + self._tree, dimensions, datasets, allow_duplicate_overlaps=self._allow_duplicate_overlaps + ) tree = make_identity_query_tree(self._driver.universe).join_materialization( key, dimensions=dimensions ) diff --git a/python/lsst/daf/butler/queries/driver.py b/python/lsst/daf/butler/queries/driver.py index 22703c73c1..a27381a4a1 100644 --- a/python/lsst/daf/butler/queries/driver.py +++ b/python/lsst/daf/butler/queries/driver.py @@ -209,6 +209,7 @@ def materialize( tree: QueryTree, dimensions: DimensionGroup, datasets: frozenset[str], + allow_duplicate_overlaps: bool = False, ) -> MaterializationKey: """Execute a query tree, saving results to temporary storage for use in later queries. @@ -222,6 +223,9 @@ def materialize( datasets : `frozenset` [ `str` ] Names of dataset types whose ID columns may be materialized. It is implementation-defined whether they actually are. + allow_duplicate_overlaps : `bool`, optional + If set to `True` then query will be allowed to generate + non-distinct rows for spatial overlaps. Returns ------- diff --git a/python/lsst/daf/butler/queries/result_specs.py b/python/lsst/daf/butler/queries/result_specs.py index baf131d865..462fc12bf5 100644 --- a/python/lsst/daf/butler/queries/result_specs.py +++ b/python/lsst/daf/butler/queries/result_specs.py @@ -62,6 +62,11 @@ class ResultSpecBase(pydantic.BaseModel, ABC): limit: int | None = None """Maximum number of rows to return, or `None` for no bound.""" + allow_duplicate_overlaps: bool = False + """If set to True the queries are allowed to returnd duplicate rows for + spatial overlaps. + """ + def validate_tree(self, tree: QueryTree) -> None: """Check that this result object is consistent with a query tree. 
diff --git a/python/lsst/daf/butler/registry/dimensions/static.py b/python/lsst/daf/butler/registry/dimensions/static.py index e18c1cb94b..bfd6cbcf2e 100644 --- a/python/lsst/daf/butler/registry/dimensions/static.py +++ b/python/lsst/daf/butler/registry/dimensions/static.py @@ -487,9 +487,10 @@ def process_query_overlaps( predicate: qt.Predicate, join_operands: Iterable[DimensionGroup], calibration_dataset_types: Set[str | qt.AnyDatasetType], + allow_duplicates: bool = False, ) -> tuple[qt.Predicate, SqlSelectBuilder, Postprocessing]: overlaps_visitor = _CommonSkyPixMediatedOverlapsVisitor( - self._db, dimensions, calibration_dataset_types, self._overlap_tables + self._db, dimensions, calibration_dataset_types, self._overlap_tables, allow_duplicates ) new_predicate = overlaps_visitor.run(predicate, join_operands) return new_predicate, overlaps_visitor.builder, overlaps_visitor.postprocessing @@ -1025,6 +1026,7 @@ def __init__( dimensions: DimensionGroup, calibration_dataset_types: Set[str | qt.AnyDatasetType], overlap_tables: Mapping[str, tuple[sqlalchemy.Table, sqlalchemy.Table]], + allow_duplicates: bool, ): super().__init__(dimensions, calibration_dataset_types) self.builder: SqlSelectBuilder = SqlJoinsBuilder(db=db).to_select_builder(qt.ColumnSet(dimensions)) @@ -1032,6 +1034,7 @@ def __init__( self.common_skypix = dimensions.universe.commonSkyPix self.overlap_tables: Mapping[str, tuple[sqlalchemy.Table, sqlalchemy.Table]] = overlap_tables self.common_skypix_overlaps_done: set[DatabaseDimensionElement] = set() + self.allow_duplicates = allow_duplicates def visit_spatial_constraint( self, @@ -1081,7 +1084,8 @@ def visit_spatial_constraint( joins_builder.where(sqlalchemy.or_(*sql_where_or)) self.builder.join( joins_builder.to_select_builder( - qt.ColumnSet(element.minimal_group).drop_implied_dimension_keys(), distinct=True + qt.ColumnSet(element.minimal_group).drop_implied_dimension_keys(), + distinct=not self.allow_duplicates, ).into_joins_builder(postprocessing=None) ) # Short circuit here since the SQL WHERE clause has already @@ -1145,7 +1149,7 @@ def visit_spatial_join( .join(self._make_common_skypix_overlap_joins_builder(b)) .to_select_builder( qt.ColumnSet(a.minimal_group | b.minimal_group).drop_implied_dimension_keys(), - distinct=True, + distinct=not self.allow_duplicates, ) .into_joins_builder(postprocessing=None) ) diff --git a/python/lsst/daf/butler/registry/interfaces/_dimensions.py b/python/lsst/daf/butler/registry/interfaces/_dimensions.py index a6cffd5f2b..27450af8eb 100644 --- a/python/lsst/daf/butler/registry/interfaces/_dimensions.py +++ b/python/lsst/daf/butler/registry/interfaces/_dimensions.py @@ -402,6 +402,7 @@ def process_query_overlaps( predicate: Predicate, join_operands: Iterable[DimensionGroup], calibration_dataset_types: Set[str | AnyDatasetType], + allow_duplicates: bool = False, ) -> tuple[Predicate, SqlSelectBuilder, Postprocessing]: """Process a query's WHERE predicate and dimensions to handle spatial and temporal overlaps. @@ -424,6 +425,9 @@ def process_query_overlaps( `..queries.tree.AnyDatasetType` ] The names of dataset types that have been joined into the query via a search that includes at least one calibration collection. + allow_duplicates : `bool` + If set to `True` then query will be allowed to return non-distinct + rows. 
Returns ------- diff --git a/python/lsst/daf/butler/remote_butler/_query_driver.py b/python/lsst/daf/butler/remote_butler/_query_driver.py index 47cec7c248..0e6aa4d91b 100644 --- a/python/lsst/daf/butler/remote_butler/_query_driver.py +++ b/python/lsst/daf/butler/remote_butler/_query_driver.py @@ -163,6 +163,7 @@ def materialize( tree: QueryTree, dimensions: DimensionGroup, datasets: frozenset[str], + allow_duplicate_overlaps: bool = False, ) -> MaterializationKey: key = uuid4() self._stored_query_inputs.append( @@ -171,6 +172,7 @@ def materialize( tree=SerializedQueryTree(tree.model_copy(deep=True)), dimensions=dimensions.to_simple(), datasets=datasets, + allow_duplicate_overlaps=allow_duplicate_overlaps, ), ) return key diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external_query.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external_query.py index 159994446c..72f03856cd 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external_query.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external_query.py @@ -195,6 +195,7 @@ def _get_query_context(factory: Factory, query: QueryInputs) -> Iterator[_QueryC DimensionGroup.from_simple(input.dimensions, butler.dimensions), frozenset(input.datasets), key=input.key, + allow_duplicate_overlaps=input.allow_duplicate_overlaps, ) elif input.type == "upload": driver.upload_data_coordinates( diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 4f76e605bc..b04c935efc 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -243,6 +243,7 @@ class MaterializedQuery(pydantic.BaseModel): tree: SerializedQueryTree dimensions: SerializedDimensionGroup datasets: list[str] + allow_duplicate_overlaps: bool = False class DataCoordinateUpload(pydantic.BaseModel): diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py index 08007e4d3a..5faaedfae7 100644 --- a/python/lsst/daf/butler/tests/butler_queries.py +++ b/python/lsst/daf/butler/tests/butler_queries.py @@ -690,6 +690,26 @@ def test_dataset_constrained_record_query(self) -> None: doomed=True, ) + def test_duplicate_overlaps(self) -> None: + """Test for query option that enables duplicate rows in queries that + use skypix overalps. + """ + butler = self.make_butler("base.yaml", "spatial.yaml") + butler.registry.defaults = RegistryDefaults(instrument="Cam1", skymap="SkyMap1") + with butler.query() as query: + + data_ids = list(query.data_ids(["visit", "detector", "patch"]).where(visit=1, detector=1)) + self.assertCountEqual( + [(data_id["tract"], data_id["patch"]) for data_id in data_ids], [(0, 0), (0, 2)] + ) + + query._allow_duplicate_overlaps = True + data_ids = list(query.data_ids(["visit", "detector", "patch"]).where(visit=1, detector=1)) + self.assertCountEqual( + [(data_id["tract"], data_id["patch"]) for data_id in data_ids], + [(0, 0), (0, 0), (0, 2), (0, 2)], + ) + def test_spatial_overlaps(self) -> None: """Test queries for dimension records with spatial overlaps. 
diff --git a/tests/test_query_interface.py b/tests/test_query_interface.py index 7ca492fbcb..a9cb4eba30 100644 --- a/tests/test_query_interface.py +++ b/tests/test_query_interface.py @@ -369,6 +369,7 @@ def materialize( tree: qt.QueryTree, dimensions: DimensionGroup, datasets: frozenset[str], + allow_duplicate_overlaps: bool = False, ) -> qd.MaterializationKey: key = uuid.uuid4() self.materializations[key] = (tree, dimensions, datasets)
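A note on the type fix in the first patch: a bare sqlalchemy.null() carries NullType, so when _create_case_expression_for_collections returns it for an empty collection list the resulting "run" column has no SQL type, and downstream consumers (such as the temporary-table creation exercised by the new test_materialization_no_results test) have no column type to work with. The short sketch below is illustrative only, uses plain SQLAlchemy, and is not part of the patch; the variable names are made up for the demonstration.

import sqlalchemy

# A bare NULL literal has NullType: a result column built from it carries no
# usable SQL type when the SELECT is materialized into a temporary table.
untyped_run = sqlalchemy.null().label("run")
print(type(untyped_run.type).__name__)  # NullType

# Casting the NULL to String pins the column to a concrete string type,
# matching the collection names produced by the CASE expression when the
# collection list is non-empty.
typed_run = sqlalchemy.cast(sqlalchemy.null(), sqlalchemy.String).label("run")
print(type(typed_run.type).__name__)  # String

Widening the return annotation to sqlalchemy.ColumnElement fits the same idea: both branches now return a typed column expression, and callers do not need to know which branch produced it.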