From 26a0d0a4ffe5bf004507c9d1598a5f08b30ecdf0 Mon Sep 17 00:00:00 2001 From: Xuan <65048031+crazy-2020@users.noreply.github.com> Date: Thu, 17 Dec 2020 21:25:28 -0800 Subject: [PATCH] fix: Move 'grouped_tables' into `_retrieve_tables` (#430) Signed-off-by: xuans --- databuilder/extractor/base_bigquery_extractor.py | 1 - databuilder/extractor/bigquery_metadata_extractor.py | 12 +++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/databuilder/extractor/base_bigquery_extractor.py b/databuilder/extractor/base_bigquery_extractor.py index 511f2563d..37deeb49e 100644 --- a/databuilder/extractor/base_bigquery_extractor.py +++ b/databuilder/extractor/base_bigquery_extractor.py @@ -33,7 +33,6 @@ class BaseBigQueryExtractor(Extractor): DEFAULT_PAGE_SIZE = 300 NUM_RETRIES = 3 DATE_LENGTH = 8 - SHARDED_TABLE_KEY_FORMAT = '{dataset_id}/{table_id}' def init(self, conf: ConfigTree) -> None: # should use key_path, or cred_key if the former doesn't exist diff --git a/databuilder/extractor/bigquery_metadata_extractor.py b/databuilder/extractor/bigquery_metadata_extractor.py index 258370111..8c2a54e7b 100644 --- a/databuilder/extractor/bigquery_metadata_extractor.py +++ b/databuilder/extractor/bigquery_metadata_extractor.py @@ -28,10 +28,11 @@ class BigQueryMetadataExtractor(BaseBigQueryExtractor): def init(self, conf: ConfigTree) -> None: BaseBigQueryExtractor.init(self, conf) - self.grouped_tables: Set[str] = set([]) self.iter = iter(self._iterate_over_tables()) def _retrieve_tables(self, dataset: DatasetRef) -> Any: + grouped_tables: Set[str] = set([]) + for page in self._page_table_list_results(dataset): if 'tables' not in page: continue @@ -47,16 +48,13 @@ def _retrieve_tables(self, dataset: DatasetRef) -> Any: # If the last eight characters are digits, we assume the table is of a table date range type # and then we only need one schema definition table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH] - table_id = table_prefix - sharded_table_key = BigQueryMetadataExtractor.SHARDED_TABLE_KEY_FORMAT.format( - dataset_id=tableRef['datasetId'], - table_id=table_id) - if sharded_table_key in self.grouped_tables: + if table_prefix in grouped_tables: # If one table in the date range is processed, then ignore other ones # (it adds too much metadata) continue - self.grouped_tables.add(sharded_table_key) + table_id = table_prefix + grouped_tables.add(table_prefix) table = self.bigquery_service.tables().get( projectId=tableRef['projectId'],