From d536789c00ca11b60bed940377918c2ba933a62c Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 11 Oct 2024 17:03:51 -0700 Subject: [PATCH 1/6] Refactor table-column lookup in AnVIL plugin --- src/azul/plugins/repository/tdr_anvil/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index e276fe6b8..7e867dfa0 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -687,11 +687,12 @@ def convert_column(value): else: return [] + _schema_columns = { + table['name']: [column['name'] for column in table['columns']] + for table in anvil_schema['tables'] + } + def _columns(self, entity_type: EntityType) -> set[str]: - table = one( - table for table in anvil_schema['tables'] - if table['name'] == f'anvil_{entity_type}' - ) - entity_columns = {column['name'] for column in table['columns']} - entity_columns.add('datarepo_row_id') - return entity_columns + columns = set(self._schema_columns[f'anvil_{entity_type}']) + columns.add('datarepo_row_id') + return columns From 989d07fc7c03fa5a943cc20369c16868e9734553 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 11 Oct 2024 16:05:51 -0700 Subject: [PATCH 2/6] Fix incorrect canned table names --- .../plugins/repository/tdr_anvil/__init__.py | 5 +++-- ...09d-46a4-b845-7584df39263b.tables.tdr.json | 20 +++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 7e867dfa0..c7811d3d4 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -379,8 +379,9 @@ def _bundle_entity(self, bundle_fqid: TDRAnvilBundleFQID) -> KeyReference: return bundle_entity def _full_table_name(self, source: TDRSourceSpec, table_name: str) -> str: - if not table_name.startswith('INFORMATION_SCHEMA'): - table_name = 'anvil_' + table_name + prefixed = f'anvil_{table_name}' + if prefixed in self._schema_columns: + table_name = prefixed return super()._full_table_name(source, table_name) def _consolidate_by_type(self, entities: Keys) -> MutableKeysByType: diff --git a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json index 3c02b7242..422b461de 100644 --- a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json +++ b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json @@ -1,6 +1,6 @@ { "tables": { - "activity": { + "anvil_activity": { "rows": [ { "activity_id": "", @@ -12,7 +12,7 @@ } ] }, - "alignmentactivity": { + "anvil_alignmentactivity": { "rows": [ { "activity_type": "", @@ -25,7 +25,7 @@ } ] }, - "assayactivity": { + "anvil_assayactivity": { "rows": [ { "activity_type": "", @@ -39,7 +39,7 @@ } ] }, - "biosample": { + "anvil_biosample": { "rows": [ { "anatomical_site": null, @@ -63,7 +63,7 @@ } ] }, - "dataset": { + "anvil_dataset": { "rows": [ { "consent_group": [ @@ -89,7 +89,7 @@ } ] }, - "diagnosis": { + "anvil_diagnosis": { "rows": [ { "datarepo_row_id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", @@ -135,7 +135,7 @@ } ] }, - "donor": { + "anvil_donor": { "rows": [ { "datarepo_row_id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", @@ -153,7 +153,7 @@ } ] }, - "file": { + "anvil_file": { "rows": [ { "data_modality": [], @@ -202,7 +202,7 @@ } ] }, - 
"sequencingactivity": { + "anvil_sequencingactivity": { "rows": [ { "activity_type": "Sequencing", @@ -238,7 +238,7 @@ } ] }, - "variantcallingactivity": { + "anvil_variantcallingactivity": { "rows": [ { "activity_type": "", From 1b40ff4b3d6e4b74f81f9fe4feee1722ae9edc1a Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 16 Oct 2024 15:54:55 -0700 Subject: [PATCH 3/6] Use variable for fixed AnVIL version in test --- test/service/test_response_anvil.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/service/test_response_anvil.py b/test/service/test_response_anvil.py index 2d8e2a0ff..9a176000c 100644 --- a/test/service/test_response_anvil.py +++ b/test/service/test_response_anvil.py @@ -1954,7 +1954,7 @@ def test_entity_indices(self): ], 'file_name': '307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz', 'is_supplementary': False, - 'version': '2022-06-01T00:00:00.000000Z', + 'version': self.version, 'uuid': '15b76f9c-6b46-433f-851d-34e89f1b9ba6', 'size': 213021639, 'name': '307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz', @@ -1965,7 +1965,7 @@ def test_entity_indices(self): f'1e269f04-4347-4188-b060-1dcc69e71d67', 'url': str(self.base_url.set( path='/repository/files/15b76f9c-6b46-433f-851d-34e89f1b9ba6', - args=dict(catalog='test', version='2022-06-01T00:00:00.000000Z') + args=dict(catalog='test', version=self.version) )) } ] @@ -2083,7 +2083,7 @@ def test_entity_indices(self): ], 'file_name': '307500.merged.matefixed.sorted.markeddups.recal.bam', 'is_supplementary': False, - 'version': '2022-06-01T00:00:00.000000Z', + 'version': self.version, 'uuid': '3b17377b-16b1-431c-9967-e5d01fc5923f', 'size': 3306845592, 'name': '307500.merged.matefixed.sorted.markeddups.recal.bam', @@ -2094,7 +2094,7 @@ def test_entity_indices(self): f'8b722e88-8103-49c1-b351-e64fa7c6ab37', 'url': str(self.base_url.set( path='/repository/files/3b17377b-16b1-431c-9967-e5d01fc5923f', - args=dict(catalog='test', version='2022-06-01T00:00:00.000000Z') + args=dict(catalog='test', version=self.version) )) } ] From 41562774572cc6f2617ca0dbaa5e2ca492b8c5d6 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 17 Oct 2024 01:41:13 -0700 Subject: [PATCH 4/6] Refactor enumeration of AnVIL activity types --- .../plugins/metadata/anvil/indexer/transform.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 20107fb91..b30886839 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -402,13 +402,16 @@ def _file(self, file: EntityReference) -> MutableJSON: def _only_dataset(self) -> EntityReference: return one(self._entities_by_type['dataset']) - _activity_polymorphic_types = { - 'activity', - 'alignmentactivity', - 'assayactivity', - 'sequencingactivity', - 'variantcallingactivity' - } + @cached_property + def _activity_polymorphic_types(self) -> AbstractSet[str]: + from azul.plugins.metadata.anvil import ( + anvil_schema, + ) + return { + table['name'].removeprefix('anvil_') + for table in anvil_schema['tables'] + if table['name'].endswith('activity') + } @classmethod def inner_entity_id(cls, entity_type: EntityType, entity: JSON) -> EntityID: From 55d2e2afb9b76624853c91c6861c73862c3c33fd Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 17 Oct 2024 14:42:20 -0700 Subject: [PATCH 5/6] [r] Track table names in AnVIL bundles (#6639) --- 
 src/azul/indexer/document.py                  |   9 +-
 src/azul/plugins/metadata/anvil/bundle.py     |   4 +
 .../metadata/anvil/indexer/transform.py       |  58 +++---
 .../plugins/repository/tdr_anvil/__init__.py  | 192 +++++++++---------
 ...2783-aeb6-afea-e022897f4dcf.tdr.anvil.json |   2 +-
 ...5d80-a242-accb-840921351cd5.tdr.anvil.json |   8 +-
 ...2-e274-affe-aabc-eb3db63ad068.results.json |  58 +++---
 ...e274-affe-aabc-eb3db63ad068.tdr.anvil.json |  46 ++---
 test/indexer/test_anvil.py                    |  12 +-
 test/integration_test.py                      |  18 +-
 test/service/test_manifest.py                 |   2 +-
 test/service/test_response_anvil.py           |   4 +-
 12 files changed, 205 insertions(+), 208 deletions(-)

diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py
index db57cad72..3f2860f09 100644
--- a/src/azul/indexer/document.py
+++ b/src/azul/indexer/document.py
@@ -1528,13 +1528,10 @@ class Replica(Document[ReplicaCoordinates[E]]):

     """
     #: The type of replica, i.e., what sort of metadata document from the
-    #: underlying data repository we are storing a copy of. Conceptually related
-    #: to the entity type, but its value may be different from the entity type.
-    #: For example, AnVIL replicas use the name of the data table that contains
-    #: the entity, e.g. 'anvil_file', instead of just 'file'.
+    #: underlying data repository we are storing a copy of. In practice, this is
+    #: the same as `self.coordinates.entity.entity_type`, but this isn't
+    #: necessarily the case.
     #:
-    #: We can't model replica types as entity types because we want to hold all
-    #: replicas in a single index per catalog to facilitate quick retrieval.
     #: Typically, all replicas of the same type have similar shapes, just like
     #: contributions for entities of the same type. However, mixing replicas of
     #: different types results in an index containing documents of heterogeneous
diff --git a/src/azul/plugins/metadata/anvil/bundle.py b/src/azul/plugins/metadata/anvil/bundle.py
index e28801857..8df46cb8c 100644
--- a/src/azul/plugins/metadata/anvil/bundle.py
+++ b/src/azul/plugins/metadata/anvil/bundle.py
@@ -116,6 +116,10 @@ def to_entity_link(self,


 @attrs.define(kw_only=True)
 class AnvilBundle(Bundle[BUNDLE_FQID], ABC):
+    # The `entity_type` attribute of these keys contains the entities' BigQuery
+    # table name (e.g. `anvil_sequencingactivity`), not the entity type used for
+    # the contributions (e.g. `activities`). The metadata plugin converts from
+    # the former to the latter during transformation.
     entities: dict[EntityReference, MutableJSON] = attrs.field(factory=dict)
     links: set[EntityLink] = attrs.field(factory=set)
diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
index 20107fb91..858899bd5 100644
--- a/src/azul/plugins/metadata/anvil/indexer/transform.py
+++ b/src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -188,20 +188,26 @@ def _transform(self,
         raise NotImplementedError

     def _replicate(self, entity: EntityReference) -> tuple[str, JSON]:
-        return f'anvil_{entity.entity_type}', self.bundle.entities[entity]
+        return entity.entity_type, self.bundle.entities[entity]

-    def _pluralize(self, entity_type: str) -> str:
-        if entity_type == 'diagnosis':
+    def _convert_entity_type(self, entity_type: str) -> str:
+        assert entity_type == 'bundle' or entity_type.startswith('anvil_'), entity_type
+        if entity_type == 'anvil_diagnosis':
+            # Irregular plural form
             return 'diagnoses'
+        elif entity_type.endswith('activity'):
+            # Polymorphic.
Could be `anvil_sequencingactivity`, + # `anvil_assayactivity`, `anvil_activity`, etc + return 'activities' else: - return pluralize(entity_type) + return pluralize(entity_type.removeprefix('anvil_')) def _contains(self, partition: BundlePartition, entity: EntityReference ) -> bool: return ( - self._pluralize(entity.entity_type).endswith(self.entity_type()) + self._convert_entity_type(entity.entity_type) == self.entity_type() and partition.contains(UUID(entity.entity_id)) ) @@ -358,7 +364,7 @@ def _activity(self, activity: EntityReference) -> MutableJSON: field_types = self._activity_types() common_fields = { 'activity_table': activity.entity_type, - 'activity_id': metadata[f'{activity.entity_type}_id'] + 'activity_id': metadata[f'{activity.entity_type.removeprefix("anvil_")}_id'] } # Activities are unique in that they may not contain every field defined # in their field types due to polymorphism, so we need to pad the field @@ -400,7 +406,7 @@ def _file(self, file: EntityReference) -> MutableJSON: uuid=file.entity_id) def _only_dataset(self) -> EntityReference: - return one(self._entities_by_type['dataset']) + return one(self._entities_by_type['anvil_dataset']) @cached_property def _activity_polymorphic_types(self) -> AbstractSet[str]: @@ -408,7 +414,7 @@ def _activity_polymorphic_types(self) -> AbstractSet[str]: anvil_schema, ) return { - table['name'].removeprefix('anvil_') + table['name'] for table in anvil_schema['tables'] if table['name'].endswith('activity') } @@ -465,11 +471,11 @@ def _transform(self, entity: EntityReference) -> Iterable[Contribution]: self._entities_by_type[activity_type] for activity_type in self._activity_polymorphic_types )), - biosamples=self._entities(self._biosample, self._entities_by_type['biosample']), + biosamples=self._entities(self._biosample, self._entities_by_type['anvil_biosample']), datasets=[self._dataset(self._only_dataset())], - diagnoses=self._entities(self._diagnosis, self._entities_by_type['diagnosis']), - donors=self._entities(self._donor, self._entities_by_type['donor']), - files=self._entities(self._file, self._entities_by_type['file']) + diagnoses=self._entities(self._diagnosis, self._entities_by_type['anvil_diagnosis']), + donors=self._entities(self._donor, self._entities_by_type['anvil_donor']), + files=self._entities(self._file, self._entities_by_type['anvil_file']) ) yield self._contribution(contents, entity.entity_id) @@ -517,11 +523,11 @@ def _transform(self, entity: EntityReference) -> Iterable[Contribution]: linked = self._linked_entities(entity) contents = dict( activities=[self._activity(entity)], - biosamples=self._entities(self._biosample, linked['biosample']), + biosamples=self._entities(self._biosample, linked['anvil_biosample']), datasets=[self._dataset(self._only_dataset())], - diagnoses=self._entities(self._diagnosis, linked['diagnosis']), - donors=self._entities(self._donor, linked['donor']), - files=self._entities(self._file, linked['file']) + diagnoses=self._entities(self._diagnosis, linked['anvil_diagnosis']), + donors=self._entities(self._donor, linked['anvil_donor']), + files=self._entities(self._file, linked['anvil_file']) ) yield self._contribution(contents, entity.entity_id) @@ -541,9 +547,9 @@ def _transform(self, entity: EntityReference) -> Iterable[Contribution]: )), biosamples=[self._biosample(entity)], datasets=[self._dataset(self._only_dataset())], - diagnoses=self._entities(self._diagnosis, linked['diagnosis']), - donors=self._entities(self._donor, linked['donor']), - files=self._entities(self._file, 
linked['file']), + diagnoses=self._entities(self._diagnosis, linked['anvil_diagnosis']), + donors=self._entities(self._donor, linked['anvil_donor']), + files=self._entities(self._file, linked['anvil_file']), ) yield self._contribution(contents, entity.entity_id) @@ -589,11 +595,11 @@ def _transform(self, entity: EntityReference) -> Iterable[Contribution]: linked[activity_type] for activity_type in self._activity_polymorphic_types )), - biosamples=self._entities(self._biosample, linked['biosample']), + biosamples=self._entities(self._biosample, linked['anvil_biosample']), datasets=[self._dataset(self._only_dataset())], - diagnoses=self._entities(self._diagnosis, linked['diagnosis']), + diagnoses=self._entities(self._diagnosis, linked['anvil_diagnosis']), donors=[self._donor(entity)], - files=self._entities(self._file, linked['file']), + files=self._entities(self._file, linked['anvil_file']), ) yield self._contribution(contents, entity.entity_id) @@ -613,10 +619,10 @@ def _transform(self, linked[activity_type] for activity_type in self._activity_polymorphic_types )), - biosamples=self._entities(self._biosample, linked['biosample']), + biosamples=self._entities(self._biosample, linked['anvil_biosample']), datasets=[self._dataset(self._only_dataset())], - diagnoses=self._entities(self._diagnosis, linked['diagnosis']), - donors=self._entities(self._donor, linked['donor']), + diagnoses=self._entities(self._diagnosis, linked['anvil_diagnosis']), + donors=self._entities(self._donor, linked['anvil_donor']), files=[self._file(entity)], ) yield self._contribution(contents, entity.entity_id) @@ -630,5 +636,5 @@ def _transform(self, # redundant and impractically large. Therefore, we leave the # hub IDs field empty for datasets and rely on the tenet # that every file is an implicit hub of its parent dataset. - file_hub=None if linked_entity.entity_type == 'dataset' else entity.entity_id, + file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id, ) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index c7811d3d4..530850366 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -72,7 +72,7 @@ KeyLinks = set[KeyLink] -class BundleEntityType(Enum): +class BundleType(Enum): """ AnVIL snapshots have no inherent notion of a "bundle". When indexing these snapshots, we dynamically construct bundles by selecting individual entities @@ -110,22 +110,22 @@ class BundleEntityType(Enum): dataset fields during aggregation. This bundle contains only a single dataset entity with only the `description` field populated. 
""" - primary: EntityType = 'biosample' - supplementary: EntityType = 'file' - duos: EntityType = 'dataset' + primary = 'anvil_biosample' + supplementary = 'anvil_file' + duos = 'anvil_dataset' class TDRAnvilBundleFQIDJSON(SourcedBundleFQIDJSON): - entity_type: str + table_name: str @attrs.frozen(kw_only=True) class TDRAnvilBundleFQID(TDRBundleFQID): - entity_type: BundleEntityType = attrs.field(converter=BundleEntityType) + table_name: BundleType = attrs.field(converter=BundleType) def to_json(self) -> TDRAnvilBundleFQIDJSON: return dict(super().to_json(), - entity_type=self.entity_type.value) + table_name=self.table_name.value) class TDRAnvilBundle(AnvilBundle[TDRAnvilBundleFQID], TDRBundle): @@ -142,7 +142,7 @@ def add_entity(self, assert entity not in self.entities, entity metadata = dict(row, version=version) - if entity.entity_type == 'file': + if entity.entity_type == 'anvil_file': drs_uri = row['file_ref'] # Validate URI syntax DRSURI.parse(drs_uri) @@ -172,10 +172,10 @@ def _version(self): def _count_subgraphs(self, source: TDRSourceSpec) -> int: rows = self._run_sql(f''' SELECT COUNT(*) AS count - FROM {backtick(self._full_table_name(source, BundleEntityType.primary.value))} + FROM {backtick(self._full_table_name(source, BundleType.primary.value))} UNION ALL SELECT COUNT(*) AS count - FROM {backtick(self._full_table_name(source, BundleEntityType.supplementary.value))} + FROM {backtick(self._full_table_name(source, BundleType.supplementary.value))} WHERE is_supplementary ''') return sum(row['count'] for row in rows) @@ -185,15 +185,15 @@ def _list_bundles(self, prefix: str ) -> list[TDRAnvilBundleFQID]: spec = source.spec - primary = BundleEntityType.primary.value - supplementary = BundleEntityType.supplementary.value - duos = BundleEntityType.duos.value + primary = BundleType.primary.value + supplementary = BundleType.supplementary.value + duos = BundleType.duos.value rows = list(self._run_sql(f''' - SELECT datarepo_row_id, {primary!r} AS entity_type + SELECT datarepo_row_id, {primary!r} AS table_name FROM {backtick(self._full_table_name(spec, primary))} WHERE STARTS_WITH(datarepo_row_id, '{prefix}') UNION ALL - SELECT datarepo_row_id, {supplementary!r} AS entity_type + SELECT datarepo_row_id, {supplementary!r} AS table_name FROM {backtick(self._full_table_name(spec, supplementary))} AS supp WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{prefix}') ''' + ( @@ -201,7 +201,7 @@ def _list_bundles(self, if config.duos_service_url is None else f''' UNION ALL - SELECT datarepo_row_id, {duos!r} AS entity_type + SELECT datarepo_row_id, {duos!r} AS table_name FROM {backtick(self._full_table_name(spec, duos))} ''' ))) @@ -218,7 +218,7 @@ def _list_bundles(self, # single dataset. This verification is performed independently and # concurrently for every partition, but only one partition actually # emits the bundle. - if row['entity_type'] == duos: + if row['table_name'] == duos: require(0 == duos_count) duos_count += 1 # Ensure that one partition will always contain the DUOS bundle @@ -229,43 +229,43 @@ def _list_bundles(self, source=source, uuid=bundle_uuid, version=self._version, - entity_type=BundleEntityType(row['entity_type']) + table_name=BundleType(row['table_name']) )) return bundles def resolve_bundle(self, fqid: SourcedBundleFQIDJSON) -> TDRAnvilBundleFQID: - if 'entity_type' not in fqid: - # Resolution of bundles without entity type is expensive, so we only - # support it during canning. 
- assert not config.is_in_lambda, ('Bundle FQID lacks entity type', fqid) + if 'table_name' not in fqid: + # Resolution of bundles without the table name is expensive, so we + # only support it during canning. + assert not config.is_in_lambda, ('Bundle FQID lacks table name', fqid) source = self.source_from_json(fqid['source']) entity_id = uuids.change_version(fqid['uuid'], self.bundle_uuid_version, self.datarepo_row_uuid_version) rows = self._run_sql(' UNION ALL '.join(( f''' - SELECT {entity_type.value!r} AS entity_type - FROM {backtick(self._full_table_name(source.spec, entity_type.value))} + SELECT {bundle_type.value!r} AS table_name + FROM {backtick(self._full_table_name(source.spec, bundle_type.value))} WHERE datarepo_row_id = {entity_id!r} ''' - for entity_type in BundleEntityType + for bundle_type in BundleType ))) fqid = {**fqid, **one(rows)} return super().resolve_bundle(fqid) def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: - if bundle_fqid.entity_type is BundleEntityType.primary: + if bundle_fqid.table_name is BundleType.primary: log.info('Bundle %r is a primary bundle', bundle_fqid.uuid) return self._primary_bundle(bundle_fqid) - elif bundle_fqid.entity_type is BundleEntityType.supplementary: + elif bundle_fqid.table_name is BundleType.supplementary: log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid) return self._supplementary_bundle(bundle_fqid) - elif bundle_fqid.entity_type is BundleEntityType.duos: + elif bundle_fqid.table_name is BundleType.duos: assert config.duos_service_url is not None, bundle_fqid log.info('Bundle %r is a DUOS bundle', bundle_fqid.uuid) return self._duos_bundle(bundle_fqid) else: - assert False, bundle_fqid.entity_type + assert False, bundle_fqid.table_name def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: source = bundle_fqid.source @@ -296,15 +296,15 @@ def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: result = TDRAnvilBundle(fqid=bundle_fqid) entities_by_key: dict[KeyReference, EntityReference] = {} for entity_type, typed_keys in sorted(keys_by_type.items()): - pk_column = entity_type + '_id' + pk_column = entity_type.removeprefix('anvil_') + '_id' rows = self._retrieve_entities(source.spec, entity_type, typed_keys) - if entity_type == 'donor': + if entity_type == 'anvil_donor': # We expect that the foreign key `part_of_dataset_id` is # redundant for biosamples and donors. To simplify our queries, # we do not follow the latter during the graph traversal. # Here, we validate our expectation. Note that the key is an # array for biosamples, but not for donors. 
- dataset_id: Key = one(keys_by_type['dataset']) + dataset_id: Key = one(keys_by_type['anvil_dataset']) for row in rows: donor_dataset_id = row['part_of_dataset_id'] require(donor_dataset_id == dataset_id, donor_dataset_id, dataset_id) @@ -321,24 +321,24 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund self.bundle_uuid_version, self.datarepo_row_uuid_version) source = bundle_fqid.source.spec - bundle_entity_type = bundle_fqid.entity_type.value + table_name = bundle_fqid.table_name.value result = TDRAnvilBundle(fqid=bundle_fqid) - columns = self._columns(bundle_entity_type) + columns = self._columns(table_name) bundle_entity = dict(one(self._run_sql(f''' SELECT {', '.join(sorted(columns))} - FROM {backtick(self._full_table_name(source, bundle_entity_type))} + FROM {backtick(self._full_table_name(source, table_name))} WHERE datarepo_row_id = '{entity_id}' '''))) - linked_entity_type = 'dataset' - columns = self._columns(linked_entity_type) + dataset_table = 'anvil_dataset' + columns = self._columns(dataset_table) linked_entity = dict(one(self._run_sql(f''' SELECT {', '.join(sorted(columns))} - FROM {backtick(self._full_table_name(source, linked_entity_type))} + FROM {backtick(self._full_table_name(source, dataset_table))} '''))) link_args = {} for entity_type, row, arg in [ - (bundle_entity_type, bundle_entity, 'outputs'), - (linked_entity_type, linked_entity, 'inputs') + ('anvil_file', bundle_entity, 'outputs'), + (dataset_table, linked_entity, 'inputs') ]: entity_ref = EntityReference(entity_type=entity_type, entity_id=row['datarepo_row_id']) result.add_entity(entity_ref, self._version, row) @@ -352,7 +352,7 @@ def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: entity_id = change_version(bundle_fqid.uuid, self.bundle_uuid_version, self.datarepo_row_uuid_version) - entity = EntityReference(entity_type=bundle_fqid.entity_type.value, + entity = EntityReference(entity_type='anvil_dataset', entity_id=entity_id) bundle = TDRAnvilBundle(fqid=bundle_fqid) bundle.add_entity(entity=entity, @@ -366,27 +366,21 @@ def _bundle_entity(self, bundle_fqid: TDRAnvilBundleFQID) -> KeyReference: entity_id = uuids.change_version(bundle_uuid, self.bundle_uuid_version, self.datarepo_row_uuid_version) - entity_type = bundle_fqid.entity_type.value - pk_column = entity_type + '_id' + table_name = bundle_fqid.table_name.value + pk_column = table_name.removeprefix('anvil_') + '_id' bundle_entity = one(self._run_sql(f''' SELECT {pk_column} - FROM {backtick(self._full_table_name(source.spec, entity_type))} + FROM {backtick(self._full_table_name(source.spec, table_name))} WHERE datarepo_row_id = '{entity_id}' '''))[pk_column] - bundle_entity = KeyReference(key=bundle_entity, entity_type=entity_type) + bundle_entity = KeyReference(key=bundle_entity, entity_type=table_name) log.info('Bundle UUID %r resolved to primary key %r in table %r', - bundle_uuid, bundle_entity.key, entity_type) + bundle_uuid, bundle_entity.key, table_name) return bundle_entity - def _full_table_name(self, source: TDRSourceSpec, table_name: str) -> str: - prefixed = f'anvil_{table_name}' - if prefixed in self._schema_columns: - table_name = prefixed - return super()._full_table_name(source, table_name) - def _consolidate_by_type(self, entities: Keys) -> MutableKeysByType: result = { - table['name'].removeprefix('anvil_'): set() + table['name']: set() for table in anvil_schema['tables'] } for e in entities: @@ -398,8 +392,8 @@ def _follow_upstream(self, entities: KeysByType ) -> KeyLinks: 
return set.union( - self._upstream_from_files(source, entities['file']), - self._upstream_from_biosamples(source, entities['biosample']), + self._upstream_from_files(source, entities['anvil_file']), + self._upstream_from_biosamples(source, entities['anvil_biosample']), # The direction of the edges linking donors to diagnoses is # contentious. Currently, we model diagnoses as being upstream from # donors. This is counterintuitive, but has two important practical @@ -426,7 +420,7 @@ def _follow_upstream(self, # entities that are upstream from donors are datasets, which do not # perform a traversal and are treated as being linked to every # entity in the bundle regardless of the edges in the graph. - self._diagnoses_from_donors(source, entities['donor']) + self._diagnoses_from_donors(source, entities['anvil_donor']) ) def _follow_downstream(self, @@ -434,8 +428,8 @@ def _follow_downstream(self, entities: KeysByType ) -> KeyLinks: return set.union( - self._downstream_from_biosamples(source, entities['biosample']), - self._downstream_from_files(source, entities['file']) + self._downstream_from_biosamples(source, entities['anvil_biosample']), + self._downstream_from_files(source, entities['anvil_file']) ) def _upstream_from_biosamples(self, @@ -445,19 +439,19 @@ def _upstream_from_biosamples(self, if biosample_ids: rows = self._run_sql(f''' SELECT b.biosample_id, b.donor_id, b.part_of_dataset_id - FROM {backtick(self._full_table_name(source, 'biosample'))} AS b + FROM {backtick(self._full_table_name(source, 'anvil_biosample'))} AS b WHERE b.biosample_id IN ({', '.join(map(repr, biosample_ids))}) ''') result: KeyLinks = set() for row in rows: - downstream_ref = KeyReference(entity_type='biosample', + downstream_ref = KeyReference(entity_type='anvil_biosample', key=row['biosample_id']) result.add(KeyLink(outputs={downstream_ref}, - inputs={KeyReference(entity_type='dataset', + inputs={KeyReference(entity_type='anvil_dataset', key=one(row['part_of_dataset_id']))})) for donor_id in row['donor_id']: result.add(KeyLink(outputs={downstream_ref}, - inputs={KeyReference(entity_type='donor', + inputs={KeyReference(entity_type='anvil_donor', key=donor_id)})) return result else: @@ -470,53 +464,53 @@ def _upstream_from_files(self, if file_ids: rows = self._run_sql(f''' WITH file AS ( - SELECT f.file_id FROM {backtick(self._full_table_name(source, 'file'))} AS f + SELECT f.file_id FROM {backtick(self._full_table_name(source, 'anvil_file'))} AS f WHERE f.file_id IN ({', '.join(map(repr, file_ids))}) ) SELECT f.file_id AS generated_file_id, - 'alignmentactivity' AS activity_table, + 'anvil_alignmentactivity' AS activity_table, ama.alignmentactivity_id AS activity_id, ama.used_file_id AS uses_file_id, [] AS uses_biosample_id, FROM file AS f - JOIN {backtick(self._full_table_name(source, 'alignmentactivity'))} AS ama + JOIN {backtick(self._full_table_name(source, 'anvil_alignmentactivity'))} AS ama ON f.file_id IN UNNEST(ama.generated_file_id) UNION ALL SELECT f.file_id, - 'assayactivity', + 'anvil_assayactivity', aya.assayactivity_id, [], aya.used_biosample_id, FROM file AS f - JOIN {backtick(self._full_table_name(source, 'assayactivity'))} AS aya + JOIN {backtick(self._full_table_name(source, 'anvil_assayactivity'))} AS aya ON f.file_id IN UNNEST(aya.generated_file_id) UNION ALL SELECT f.file_id, - 'sequencingactivity', + 'anvil_sequencingactivity', sqa.sequencingactivity_id, [], sqa.used_biosample_id, FROM file AS f - JOIN {backtick(self._full_table_name(source, 'sequencingactivity'))} AS sqa + JOIN 
{backtick(self._full_table_name(source, 'anvil_sequencingactivity'))} AS sqa ON f.file_id IN UNNEST(sqa.generated_file_id) UNION ALL SELECT f.file_id, - 'variantcallingactivity', + 'anvil_variantcallingactivity', vca.variantcallingactivity_id, vca.used_file_id, [] FROM file AS f - JOIN {backtick(self._full_table_name(source, 'variantcallingactivity'))} AS vca + JOIN {backtick(self._full_table_name(source, 'anvil_variantcallingactivity'))} AS vca ON f.file_id IN UNNEST(vca.generated_file_id) UNION ALL SELECT f.file_id, - 'activity', + 'anvil_activity', a.activity_id, a.used_file_id, a.used_biosample_id, FROM file AS f - JOIN {backtick(self._full_table_name(source, 'activity'))} AS a + JOIN {backtick(self._full_table_name(source, 'anvil_activity'))} AS a ON f.file_id IN UNNEST(a.generated_file_id) ''') return { @@ -525,11 +519,11 @@ def _upstream_from_files(self, # The generated link is not a complete representation of the # upstream activity because it does not include generated files # that are not ancestors of the downstream file - outputs={KeyReference(entity_type='file', key=row['generated_file_id'])}, + outputs={KeyReference(entity_type='anvil_file', key=row['generated_file_id'])}, inputs={ KeyReference(entity_type=entity_type, key=key) - for entity_type, column in [('file', 'uses_file_id'), - ('biosample', 'uses_biosample_id')] + for entity_type, column in [('anvil_file', 'uses_file_id'), + ('anvil_biosample', 'uses_biosample_id')] for key in row[column] } ) @@ -545,12 +539,12 @@ def _diagnoses_from_donors(self, if donor_ids: rows = self._run_sql(f''' SELECT dgn.donor_id, dgn.diagnosis_id - FROM {backtick(self._full_table_name(source, 'diagnosis'))} as dgn + FROM {backtick(self._full_table_name(source, 'anvil_diagnosis'))} as dgn WHERE dgn.donor_id IN ({', '.join(map(repr, donor_ids))}) ''') return { - KeyLink(inputs={KeyReference(key=row['diagnosis_id'], entity_type='diagnosis')}, - outputs={KeyReference(key=row['donor_id'], entity_type='donor')}, + KeyLink(inputs={KeyReference(key=row['diagnosis_id'], entity_type='anvil_diagnosis')}, + outputs={KeyReference(key=row['donor_id'], entity_type='anvil_donor')}, activity=None) for row in rows } @@ -566,24 +560,24 @@ def _downstream_from_biosamples(self, WITH activities AS ( SELECT sqa.sequencingactivity_id as activity_id, - 'sequencingactivity' as activity_table, + 'anvil_sequencingactivity' as activity_table, sqa.used_biosample_id, sqa.generated_file_id - FROM {backtick(self._full_table_name(source, 'sequencingactivity'))} AS sqa + FROM {backtick(self._full_table_name(source, 'anvil_sequencingactivity'))} AS sqa UNION ALL SELECT aya.assayactivity_id, - 'assayactivity', + 'anvil_assayactivity', aya.used_biosample_id, aya.generated_file_id, - FROM {backtick(self._full_table_name(source, 'assayactivity'))} AS aya + FROM {backtick(self._full_table_name(source, 'anvil_assayactivity'))} AS aya UNION ALL SELECT a.activity_id, - 'activity', + 'anvil_activity', a.used_biosample_id, a.generated_file_id, - FROM {backtick(self._full_table_name(source, 'activity'))} AS a + FROM {backtick(self._full_table_name(source, 'anvil_activity'))} AS a ) SELECT biosample_id, @@ -594,9 +588,9 @@ def _downstream_from_biosamples(self, WHERE biosample_id IN ({', '.join(map(repr, biosample_ids))}) ''') return { - KeyLink(inputs={KeyReference(key=row['biosample_id'], entity_type='biosample')}, + KeyLink(inputs={KeyReference(key=row['biosample_id'], entity_type='anvil_biosample')}, outputs={ - KeyReference(key=output_id, entity_type='file') + 
KeyReference(key=output_id, entity_type='anvil_file') for output_id in row['generated_file_id'] }, activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) @@ -614,22 +608,22 @@ def _downstream_from_files(self, WITH activities AS ( SELECT ala.alignmentactivity_id AS activity_id, - 'alignmentactivity' AS activity_table, + 'anvil_alignmentactivity' AS activity_table, ala.used_file_id, ala.generated_file_id - FROM {backtick(self._full_table_name(source, 'alignmentactivity'))} AS ala + FROM {backtick(self._full_table_name(source, 'anvil_alignmentactivity'))} AS ala UNION ALL SELECT vca.variantcallingactivity_id, - 'variantcallingactivity', + 'anvil_variantcallingactivity', vca.used_file_id, vca.generated_file_id - FROM {backtick(self._full_table_name(source, 'variantcallingactivity'))} AS vca + FROM {backtick(self._full_table_name(source, 'anvil_variantcallingactivity'))} AS vca UNION ALL SELECT a.activity_id, - 'activity', + 'anvil_activity', a.used_file_id, a.generated_file_id - FROM {backtick(self._full_table_name(source, 'activity'))} AS a + FROM {backtick(self._full_table_name(source, 'anvil_activity'))} AS a ) SELECT used_file_id, @@ -640,9 +634,9 @@ def _downstream_from_files(self, WHERE used_file_id IN ({', '.join(map(repr, file_ids))}) ''') return { - KeyLink(inputs={KeyReference(key=row['used_file_id'], entity_type='file')}, + KeyLink(inputs={KeyReference(key=row['used_file_id'], entity_type='anvil_file')}, outputs={ - KeyReference(key=file_id, entity_type='file') + KeyReference(key=file_id, entity_type='anvil_file') for file_id in row['generated_file_id'] }, activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) @@ -657,9 +651,9 @@ def _retrieve_entities(self, keys: AbstractSet[Key], ) -> MutableJSONs: if keys: - table_name = self._full_table_name(source, entity_type) columns = self._columns(entity_type) - pk_column = entity_type + '_id' + table_name = self._full_table_name(source, entity_type) + pk_column = entity_type.removeprefix('anvil_') + '_id' assert pk_column in columns, entity_type log.debug('Retrieving %i entities of type %r ...', len(keys), entity_type) rows = self._run_sql(f''' @@ -693,7 +687,7 @@ def convert_column(value): for table in anvil_schema['tables'] } - def _columns(self, entity_type: EntityType) -> set[str]: - columns = set(self._schema_columns[f'anvil_{entity_type}']) + def _columns(self, table_name: str) -> set[str]: + columns = set(self._schema_columns[table_name]) columns.add('datarepo_row_id') return columns diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json index 89a1b7b8e..7da9d5f3c 100644 --- a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json +++ b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json @@ -1,6 +1,6 @@ { "entities": { - "dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "description": "Study description from DUOS", "version": "2022-06-01T00:00:00.000000Z" } diff --git a/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json b/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json index df83b4bbd..e689fbe3f 100644 --- a/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json +++ b/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json @@ -1,6 +1,6 @@ { "entities": { - "dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { + 
"anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "consent_group": [ "DS-BDIS" ], @@ -23,7 +23,7 @@ "title": "ANVIL_CMG_UWASH_DS_BDIS", "version": "2022-06-01T00:00:00.000000Z" }, - "file/6b0f6c0f-5d80-4242-accb-840921351cd5": { + "anvil_file/6b0f6c0f-5d80-4242-accb-840921351cd5": { "data_modality": [], "datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5", "file_format": ".txt", @@ -46,11 +46,11 @@ "links": [ { "inputs": [ - "dataset/2370f948-2783-4eb6-afea-e022897f4dcf" + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf" ], "activity": null, "outputs": [ - "file/6b0f6c0f-5d80-4242-accb-840921351cd5" + "anvil_file/6b0f6c0f-5d80-4242-accb-840921351cd5" ] } ] diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 015c0978b..5d3fd8243 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -14,7 +14,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -256,7 +256,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -431,7 +431,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "sequencingactivity_1509ef40-d1ba-440d-b298-16b7c173dcd4_deaef50719859a3e58b9696a0ca061cf9750e7bb", + "_id": "anvil_sequencingactivity_1509ef40-d1ba-440d-b298-16b7c173dcd4_deaef50719859a3e58b9696a0ca061cf9750e7bb", "_score": 1.0, "_source": { "entity_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", @@ -478,7 +478,7 @@ "18b3be87-e26b-4376-0d8d-c1e370e90e07" ], "activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -690,7 +690,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -865,7 +865,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "file_15b76f9c-6b46-433f-851d-34e89f1b9ba6_aec7ac65991cc6ea974a7a86fc30ec96c081cfcf", + "_id": "anvil_file_15b76f9c-6b46-433f-851d-34e89f1b9ba6_aec7ac65991cc6ea974a7a86fc30ec96c081cfcf", "_score": 1.0, "_source": { "entity_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", @@ -897,7 +897,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "diagnosis_15d85d30-ad4a-4f50-87a8-a27f59dd1b5f_fc4d805825bb627c93588467d22caac1b76f4a17", + "_id": "anvil_diagnosis_15d85d30-ad4a-4f50-87a8-a27f59dd1b5f_fc4d805825bb627c93588467d22caac1b76f4a17", "_score": 1.0, "_source": { "entity_id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", @@ -953,7 +953,7 @@ "a60c5138-3749-f7cb-8714-52d389ad5231" ], "activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -1252,7 +1252,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" 
@@ -1270,7 +1270,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -1472,7 +1472,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "dataset_2370f948-2783-4eb6-afea-e022897f4dcf_f10ae38d485330bf8fc7a95f7eb06626b984ad6a", + "_id": "anvil_dataset_2370f948-2783-4eb6-afea-e022897f4dcf_f10ae38d485330bf8fc7a95f7eb06626b984ad6a", "_score": 1.0, "_source": { "entity_id": "2370f948-2783-4eb6-afea-e022897f4dcf", @@ -1523,7 +1523,7 @@ "a60c5138-3749-f7cb-8714-52d389ad5231" ], "activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -1723,7 +1723,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "file_3b17377b-16b1-431c-9967-e5d01fc5923f_669b7664f4b6865a0604285f923995acaf84b4ab", + "_id": "anvil_file_3b17377b-16b1-431c-9967-e5d01fc5923f_669b7664f4b6865a0604285f923995acaf84b4ab", "_score": 1.0, "_source": { "entity_id": "3b17377b-16b1-431c-9967-e5d01fc5923f", @@ -1767,7 +1767,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -1954,7 +1954,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -2184,7 +2184,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "sequencingactivity_816e364e-1193-4e5b-a91a-14e4b009157c_a9ad806d7904ac133ff713446e13e525dc37e263", + "_id": "anvil_sequencingactivity_816e364e-1193-4e5b-a91a-14e4b009157c_a9ad806d7904ac133ff713446e13e525dc37e263", "_score": 1.0, "_source": { "entity_id": "816e364e-1193-4e5b-a91a-14e4b009157c", @@ -2226,7 +2226,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -2421,7 +2421,7 @@ "a60c5138-3749-f7cb-8714-52d389ad5231" ], "activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -2712,7 +2712,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -2730,7 +2730,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -2932,7 +2932,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "biosample_826dea02-e274-4ffe-aabc-eb3db63ad068_52f78e49c9a6e761dfd87e0d5e719d4e0099b80c", + "_id": "anvil_biosample_826dea02-e274-4ffe-aabc-eb3db63ad068_52f78e49c9a6e761dfd87e0d5e719d4e0099b80c", "_score": 1.0, "_source": { "entity_id": "826dea02-e274-4ffe-aabc-eb3db63ad068", @@ -2987,7 +2987,7 @@ "a60c5138-3749-f7cb-8714-52d389ad5231" ], 
"activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -3292,7 +3292,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -3310,7 +3310,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -3512,7 +3512,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "diagnosis_939a4bd3-86ed-4a8a-81f4-fbe0ee673461_f0733af0a2ffd8001c329588ac51db1d72ad33b7", + "_id": "anvil_diagnosis_939a4bd3-86ed-4a8a-81f4-fbe0ee673461_f0733af0a2ffd8001c329588ac51db1d72ad33b7", "_score": 1.0, "_source": { "entity_id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", @@ -3568,7 +3568,7 @@ "a60c5138-3749-f7cb-8714-52d389ad5231" ], "activity_table": [ - "sequencingactivity" + "anvil_sequencingactivity" ], "activity_type": [ "Sequencing" @@ -3865,7 +3865,7 @@ "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], "activity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -3883,7 +3883,7 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" ], "activity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", - "activity_table": "sequencingactivity", + "activity_table": "anvil_sequencingactivity", "activity_type": "Sequencing", "assay_type": [ "~null" @@ -4085,7 +4085,7 @@ { "_index": "azul_v2_nadove4_test_replica", "_type": "_doc", - "_id": "donor_bfd991f2-2797-4083-972a-da7c6d7f1b2e_4ac20193963c27e42be696a6006a071f6cee4472", + "_id": "anvil_donor_bfd991f2-2797-4083-972a-da7c6d7f1b2e_4ac20193963c27e42be696a6006a071f6cee4472", "_score": 1.0, "_source": { "entity_id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json index 96d468b36..9c461ee99 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json @@ -1,6 +1,6 @@ { "entities": { - "biosample/826dea02-e274-4ffe-aabc-eb3db63ad068": { + "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068": { "anatomical_site": null, "apriori_cell_type": [], "biosample_id": "f9d40cf6-37b8-22f3-ce35-0dc614d2452b", @@ -21,7 +21,7 @@ ], "version": "2022-06-01T00:00:00.000000Z" }, - "dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "consent_group": [ "DS-BDIS" ], @@ -44,7 +44,7 @@ "title": "ANVIL_CMG_UWASH_DS_BDIS", "version": "2022-06-01T00:00:00.000000Z" }, - "diagnosis/15d85d30-ad4a-4f50-87a8-a27f59dd1b5f": { + "anvil_diagnosis/15d85d30-ad4a-4f50-87a8-a27f59dd1b5f": { "datarepo_row_id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", "diagnosis_age_lower_bound": null, "diagnosis_age_unit": null, @@ -66,7 +66,7 @@ ], "version": "2022-06-01T00:00:00.000000Z" }, - "diagnosis/939a4bd3-86ed-4a8a-81f4-fbe0ee673461": { + "anvil_diagnosis/939a4bd3-86ed-4a8a-81f4-fbe0ee673461": { "datarepo_row_id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", "diagnosis_age_lower_bound": null, "diagnosis_age_unit": null, @@ -88,7 
+88,7 @@ ], "version": "2022-06-01T00:00:00.000000Z" }, - "donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e": { + "anvil_donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e": { "datarepo_row_id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", "donor_id": "1e2bd7e5-f45e-a391-daea-7c060be76acd", "genetic_ancestry": [], @@ -103,7 +103,7 @@ ], "version": "2022-06-01T00:00:00.000000Z" }, - "file/15b76f9c-6b46-433f-851d-34e89f1b9ba6": { + "anvil_file/15b76f9c-6b46-433f-851d-34e89f1b9ba6": { "data_modality": [], "datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", "file_format": ".vcf.gz", @@ -122,7 +122,7 @@ "sha256": "", "crc32": "" }, - "file/3b17377b-16b1-431c-9967-e5d01fc5923f": { + "anvil_file/3b17377b-16b1-431c-9967-e5d01fc5923f": { "data_modality": [], "datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f", "file_format": ".bam", @@ -141,7 +141,7 @@ "sha256": "", "crc32": "" }, - "sequencingactivity/1509ef40-d1ba-440d-b298-16b7c173dcd4": { + "anvil_sequencingactivity/1509ef40-d1ba-440d-b298-16b7c173dcd4": { "activity_type": "Sequencing", "assay_type": [], "data_modality": [], @@ -158,7 +158,7 @@ ], "version": "2022-06-01T00:00:00.000000Z" }, - "sequencingactivity/816e364e-1193-4e5b-a91a-14e4b009157c": { + "anvil_sequencingactivity/816e364e-1193-4e5b-a91a-14e4b009157c": { "activity_type": "Sequencing", "assay_type": [], "data_modality": [], @@ -179,56 +179,56 @@ "links": [ { "inputs": [ - "biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" + "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" ], - "activity": "sequencingactivity/816e364e-1193-4e5b-a91a-14e4b009157c", + "activity": "anvil_sequencingactivity/816e364e-1193-4e5b-a91a-14e4b009157c", "outputs": [ - "file/3b17377b-16b1-431c-9967-e5d01fc5923f" + "anvil_file/3b17377b-16b1-431c-9967-e5d01fc5923f" ] }, { "inputs": [ - "biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" + "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" ], - "activity": "sequencingactivity/1509ef40-d1ba-440d-b298-16b7c173dcd4", + "activity": "anvil_sequencingactivity/1509ef40-d1ba-440d-b298-16b7c173dcd4", "outputs": [ - "file/15b76f9c-6b46-433f-851d-34e89f1b9ba6" + "anvil_file/15b76f9c-6b46-433f-851d-34e89f1b9ba6" ] }, { "inputs": [ - "dataset/2370f948-2783-4eb6-afea-e022897f4dcf" + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf" ], "activity": null, "outputs": [ - "biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" + "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" ] }, { "inputs": [ - "diagnosis/15d85d30-ad4a-4f50-87a8-a27f59dd1b5f" + "anvil_diagnosis/15d85d30-ad4a-4f50-87a8-a27f59dd1b5f" ], "activity": null, "outputs": [ - "donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" + "anvil_donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" ] }, { "inputs": [ - "diagnosis/939a4bd3-86ed-4a8a-81f4-fbe0ee673461" + "anvil_diagnosis/939a4bd3-86ed-4a8a-81f4-fbe0ee673461" ], "activity": null, "outputs": [ - "donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" + "anvil_donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" ] }, { "inputs": [ - "donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" + "anvil_donor/bfd991f2-2797-4083-972a-da7c6d7f1b2e" ], "activity": null, "outputs": [ - "biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" + "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" ] } ] diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 3ad0d3b71..263f833d1 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -42,7 +42,7 @@ tdr_anvil, ) from azul.plugins.repository.tdr_anvil import ( - BundleEntityType, + BundleType, TDRAnvilBundle, TDRAnvilBundleFQID, ) @@ -105,13 
+105,13 @@ def bundle_fqid(cls, *, uuid, version=None, - entity_type=BundleEntityType.primary + table_name=BundleType.primary ) -> TDRAnvilBundleFQID: assert version is None, 'All AnVIL bundles should use the same version' return TDRAnvilBundleFQID(source=cls.source, uuid=uuid, version=cls.version, - entity_type=entity_type) + table_name=table_name) @classmethod def primary_bundle(cls) -> TDRAnvilBundleFQID: @@ -120,12 +120,12 @@ def primary_bundle(cls) -> TDRAnvilBundleFQID: @classmethod def supplementary_bundle(cls) -> TDRAnvilBundleFQID: return cls.bundle_fqid(uuid='6b0f6c0f-5d80-a242-accb-840921351cd5', - entity_type=BundleEntityType.supplementary) + table_name=BundleType.supplementary) @classmethod def duos_bundle(cls) -> TDRAnvilBundleFQID: return cls.bundle_fqid(uuid='2370f948-2783-aeb6-afea-e022897f4dcf', - entity_type=BundleEntityType.duos) + table_name=BundleType.duos) class TestAnvilIndexer(AnvilIndexerTestCase, @@ -199,7 +199,7 @@ def tearDown(self): self.index_service.delete_indices(self.catalog) def test_dataset_description(self): - dataset_ref = EntityReference(entity_type='dataset', + dataset_ref = EntityReference(entity_type='anvil_dataset', entity_id='2370f948-2783-4eb6-afea-e022897f4dcf') bundles = [self.primary_bundle(), self.duos_bundle()] for bundle_fqid in bundles: diff --git a/test/integration_test.py b/test/integration_test.py index a1c6b9941..641173336 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -152,7 +152,7 @@ Link, ) from azul.plugins.repository.tdr_anvil import ( - BundleEntityType, + BundleType, TDRAnvilBundleFQID, TDRAnvilBundleFQIDJSON, ) @@ -166,9 +166,6 @@ ManifestFormat, ManifestGenerator, ) -from azul.strings import ( - pluralize, -) from azul.terra import ( ServiceAccountCredentialsProvider, TDRClient, @@ -350,7 +347,7 @@ def _list_managed_access_bundles(self, if not ( # DUOS bundles are too sparse to fulfill the managed access tests config.is_anvil_enabled(catalog) - and cast(TDRAnvilBundleFQID, bundle_fqid).entity_type is BundleEntityType.duos + and cast(TDRAnvilBundleFQID, bundle_fqid).table_name is BundleType.duos ) ) bundle_fqid = self.random.choice(bundle_fqids) @@ -503,8 +500,7 @@ def _test_other_endpoints(self): if config.is_hca_enabled(catalog): bundle_index, project_index = 'bundles', 'projects' elif config.is_anvil_enabled(catalog): - bundle_index = pluralize(BundleEntityType.primary.value) - project_index = 'datasets' + bundle_index, project_index = 'biosamples', 'datasets' else: assert False, catalog service_paths = { @@ -1284,17 +1280,17 @@ def _get_indexed_bundles(self, # and 0 or more other entities. Biosamples only occur in primary # bundles. if len(hit['biosamples']) > 0: - entity_type = BundleEntityType.primary + table_name = BundleType.primary # Supplementary bundles contain only 1 file and 1 dataset. elif len(hit['files']) > 0: - entity_type = BundleEntityType.supplementary + table_name = BundleType.supplementary # DUOS bundles contain only 1 dataset. 
elif len(hit['datasets']) > 0: - entity_type = BundleEntityType.duos + table_name = BundleType.duos else: assert False, hit bundle_fqid = cast(TDRAnvilBundleFQIDJSON, bundle_fqid) - bundle_fqid['entity_type'] = entity_type.value + bundle_fqid['table_name'] = table_name.value bundle_fqid = self.repository_plugin(catalog).resolve_bundle(bundle_fqid) indexed_fqids.add(bundle_fqid) return indexed_fqids diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index da3bf4b3a..f57b88d01 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -2090,7 +2090,7 @@ def test_verbatim_jsonl_manifest(self): expected = { # Consolidate entities with the same replica (i.e. datasets) json_hash(entity).digest(): { - 'type': 'anvil_' + entity_ref.entity_type, + 'type': entity_ref.entity_type, 'value': entity, } for bundle in self.bundles() diff --git a/test/service/test_response_anvil.py b/test/service/test_response_anvil.py index 9a176000c..2d6fb44ab 100644 --- a/test/service/test_response_anvil.py +++ b/test/service/test_response_anvil.py @@ -65,7 +65,7 @@ def test_entity_indices(self): 'sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051' ], 'activity_id': '18b3be87-e26b-4376-0d8d-c1e370e90e07', - 'activity_table': 'sequencingactivity', + 'activity_table': 'anvil_sequencingactivity', 'activity_type': 'Sequencing', 'assay_type': [ None @@ -186,7 +186,7 @@ def test_entity_indices(self): 'sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b' ], 'activity_id': 'a60c5138-3749-f7cb-8714-52d389ad5231', - 'activity_table': 'sequencingactivity', + 'activity_table': 'anvil_sequencingactivity', 'activity_type': 'Sequencing', 'assay_type': [ None From ef0637e5ea323d4e558d370cdd6bab8eeb6712a6 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 17 Oct 2024 15:33:54 -0700 Subject: [PATCH 6/6] Do not filter AnVIL bundles for obsolete versions --- src/azul/azulclient.py | 10 ++++++---- test/integration_test.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/azul/azulclient.py b/src/azul/azulclient.py index 2c1f7d877..67a90a1f6 100644 --- a/src/azul/azulclient.py +++ b/src/azul/azulclient.py @@ -269,10 +269,12 @@ def remote_reindex_partition(self, message: JSON) -> None: validate_uuid_prefix(prefix) source = self.repository_plugin(catalog).source_from_json(source) bundle_fqids = self.list_bundles(catalog, source, prefix) - bundle_fqids = self.filter_obsolete_bundle_versions(bundle_fqids) - log.info('After filtering obsolete versions, ' - '%i bundles remain in prefix %r of source %r in catalog %r', - len(bundle_fqids), prefix, str(source.spec), catalog) + # All AnVIL bundles and entities use the same version + if not config.is_anvil_enabled(catalog): + bundle_fqids = self.filter_obsolete_bundle_versions(bundle_fqids) + log.info('After filtering obsolete versions, ' + '%i bundles remain in prefix %r of source %r in catalog %r', + len(bundle_fqids), prefix, str(source.spec), catalog) messages = ( self.bundle_message(catalog, bundle_fqid) for bundle_fqid in bundle_fqids diff --git a/test/integration_test.py b/test/integration_test.py index 641173336..3d3b635ab 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1300,10 +1300,12 @@ def _assert_catalog_complete(self, bundle_fqids: Set[SourcedBundleFQID] ) -> None: with self.subTest('catalog_complete', catalog=catalog): - expected_fqids = set(self.azul_client.filter_obsolete_bundle_versions(bundle_fqids)) - obsolete_fqids = bundle_fqids - expected_fqids - if obsolete_fqids: - 
                log.debug('Ignoring obsolete bundle versions %r', obsolete_fqids)
+            expected_fqids = bundle_fqids
+            if not config.is_anvil_enabled(catalog):
+                expected_fqids = set(self.azul_client.filter_obsolete_bundle_versions(expected_fqids))
+                obsolete_fqids = bundle_fqids - expected_fqids
+                if obsolete_fqids:
+                    log.debug('Ignoring obsolete bundle versions %r', obsolete_fqids)
             num_bundles = len(expected_fqids)
             timeout = 600
             log.debug('Expecting bundles %s ', sorted(expected_fqids))
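
The last patch relies on the fact that every AnVIL bundle and entity in a snapshot carries the same fixed version (the canned test data above uses 2022-06-01T00:00:00.000000Z throughout), so de-duplicating a bundle listing by version can never discard anything for an AnVIL catalog. The sketch below only illustrates that idea; it is not the actual AzulClient.filter_obsolete_bundle_versions implementation, whose body is not shown in this patch series, and the BundleFQID tuple is a stand-in for Azul's real FQID classes.

    from typing import Iterable, NamedTuple


    class BundleFQID(NamedTuple):
        uuid: str
        version: str  # ISO-8601 timestamp, so lexical order matches chronological order


    def latest_versions(fqids: Iterable[BundleFQID]) -> set[BundleFQID]:
        # Keep only the most recent version of each bundle UUID
        latest: dict[str, BundleFQID] = {}
        for fqid in fqids:
            current = latest.get(fqid.uuid)
            if current is None or fqid.version > current.version:
                latest[fqid.uuid] = fqid
        return set(latest.values())


    # HCA-style listing: two versions of one bundle, the older one is dropped
    hca = [
        BundleFQID('aaaa', '2022-06-01T00:00:00.000000Z'),
        BundleFQID('aaaa', '2023-01-01T00:00:00.000000Z'),
    ]
    assert len(latest_versions(hca)) == 1

    # AnVIL-style listing: all bundles share one fixed version, nothing is dropped
    anvil = [
        BundleFQID('bbbb', '2022-06-01T00:00:00.000000Z'),
        BundleFQID('cccc', '2022-06-01T00:00:00.000000Z'),
    ]
    assert latest_versions(anvil) == set(anvil)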