# Copyright Contributors to the Amundsen project.
# SPDX-License-Identifier: Apache-2.0

import re
from typing import Any, Dict, List, Optional

from databuilder.models.neo4j_csv_serde import Neo4jCsvSerializable, NODE_KEY, \
    NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
    RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE


class Badge:
    """A badge name together with the category it is recorded under."""

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self) -> str:
        return 'Badge({!r}, {!r})'.format(self.name, self.category)


class BadgeMetadata(Neo4jCsvSerializable):
    """
    Badge model.

    Produces one Badge node record per badge and one HAS_BADGE / BADGE_FOR
    relation record linking the entity identified by ``start_label`` and
    ``start_key`` to each badge.
    """
    BADGE_NODE_LABEL = 'Badge'
    BADGE_KEY_FORMAT = '{badge}'
    BADGE_CATEGORY = 'category'

    # Relation between entity and badge
    BADGE_RELATION_TYPE = 'HAS_BADGE'
    INVERSE_BADGE_RELATION_TYPE = 'BADGE_FOR'

    # Accepted start labels mapped to the pattern their key must begin with.
    # Compiled once at class creation instead of on every instantiation.
    # The dot between the first two key segments is escaped so that it only
    # matches a literal '.', not an arbitrary character.
    _KEY_PATTERN_BY_LABEL = {
        'Table': re.compile(
            r'[a-z]+://[a-zA-Z0-9_.-]+\.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
        'Dashboard': re.compile(
            r'[a-z]+_dashboard://[a-zA-Z0-9_.-]+\.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
        'Column': re.compile(
            r'[a-z]+://[a-zA-Z0-9_.-]+\.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
    }

    def __init__(self,
                 db_name: str,
                 schema: str,
                 start_label: str,  # Table, Dashboard, Column
                 start_key: str,
                 badges: List[Badge],
                 cluster: str = 'gold',  # TODO: confirm 'gold' is the right default for badges
                 ):
        """
        :param db_name: database name; stored lower-cased as ``self.db``
        :param schema: schema name; stored lower-cased
        :param start_label: label of the entity carrying the badges; must be
            one of 'Table', 'Dashboard' or 'Column'
        :param start_key: key of that entity; must match the key pattern
            registered for ``start_label``
        :param badges: badges to attach; falsy entries are skipped when the
            records are built
        :param cluster: cluster name; stored lower-cased
        :raises Exception: if ``start_label`` is not supported, or if
            ``start_key`` does not match the pattern for that label
        """
        # Tolerate None so building the records below does not fail on iteration.
        self.badges = badges if badges is not None else []

        self.db = db_name.lower()
        self.schema = schema.lower()
        self.cluster = cluster.lower()

        key_pattern = self._KEY_PATTERN_BY_LABEL.get(start_label)
        if key_pattern is None:
            raise Exception('{} is not a valid start_label for a Badge relation'.format(start_label))
        # re.match anchors at the start of the key only; trailing characters
        # after the matched prefix are accepted.
        if not key_pattern.match(start_key):
            raise Exception('{} does not match the key pattern for a {}'.format(start_key, start_label))
        self.start_label = start_label
        self.start_key = start_key

        # Records are built eagerly; create_next_* then drain these iterators.
        self._node_iter = iter(self.create_nodes())
        self._relation_iter = iter(self.create_relation())

    def create_next_node(self) -> Optional[Dict[str, Any]]:
        """Return the next node record, or None once all have been returned."""
        try:
            return next(self._node_iter)
        except StopIteration:
            return None

    def create_next_relation(self) -> Optional[Dict[str, Any]]:
        """Return the next relation record, or None once all have been returned."""
        try:
            return next(self._relation_iter)
        except StopIteration:
            return None

    @staticmethod
    def get_badge_key(name: str) -> str:
        """Return the node key for a badge name, or '' when the name is empty."""
        if not name:
            return ''
        return BadgeMetadata.BADGE_KEY_FORMAT.format(badge=name)

    def get_metadata_model_key(self) -> str:
        """Return the key of the entity the badges are attached to."""
        return self.start_key

    def create_nodes(self) -> List[Dict[str, Any]]:
        """
        Create a list of Neo4j node records, one per non-falsy badge.
        :return: list of node dicts carrying key, label and category
        """
        return [
            {
                NODE_KEY: self.get_badge_key(badge.name),
                NODE_LABEL: self.BADGE_NODE_LABEL,
                self.BADGE_CATEGORY: badge.category,
            }
            for badge in self.badges
            if badge
        ]

    def create_relation(self) -> List[Dict[str, Any]]:
        """
        Create a list of Neo4j relation records from the start entity to each
        non-falsy badge. Falsy entries are skipped exactly as in create_nodes,
        so every relation points at a badge node that is also emitted.
        :return: list of relation dicts
        """
        return [
            {
                RELATION_START_LABEL: self.start_label,
                RELATION_END_LABEL: self.BADGE_NODE_LABEL,
                RELATION_START_KEY: self.start_key,
                RELATION_END_KEY: self.get_badge_key(badge.name),
                RELATION_TYPE: self.BADGE_RELATION_TYPE,
                RELATION_REVERSE_TYPE: self.INVERSE_BADGE_RELATION_TYPE,
            }
            for badge in self.badges
            if badge
        ]
create_next_relation(self) -> Optional[Dict[str, Any]]: - # We don't emit any relations for Badge ingestion - try: - return next(self._relations) - except StopIteration: - return None - - class TagMetadata(Neo4jCsvSerializable): TAG_NODE_LABEL = 'Tag' TAG_KEY_FORMAT = '{tag}' @@ -92,7 +44,7 @@ def get_tag_key(name: str) -> str: @staticmethod def create_tag_node(name: str, - tag_type: str =DEFAULT_TYPE + tag_type: str = DEFAULT_TYPE ) -> Dict[str, str]: return {NODE_LABEL: TagMetadata.TAG_NODE_LABEL, NODE_KEY: TagMetadata.get_tag_key(name), @@ -199,10 +151,6 @@ class ColumnMetadata: COLUMN_DESCRIPTION = 'description' COLUMN_DESCRIPTION_FORMAT = '{db}://{cluster}.{schema}/{tbl}/{col}/{description_id}' - # Relation between column and badge - COL_BADGE_RELATION_TYPE = 'HAS_BADGE' - BADGE_COL_RELATION_TYPE = 'BADGE_FOR' - def __init__(self, name: str, description: Union[str, None], @@ -222,7 +170,10 @@ def __init__(self, text=description) self.type = col_type self.sort_order = sort_order - self.badges = badges + if badges: + self.badges = [Badge(badge, 'column') for badge in badges] + else: + self.badges = [] def __repr__(self) -> str: return 'ColumnMetadata({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.name, @@ -427,8 +378,15 @@ def _create_next_node(self) -> Iterator[Any]: # noqa: C901 yield col.description.get_node_dict(node_key) if col.badges: - for badge in col.badges: - yield BadgeMetadata.create_badge_node(badge) + badge_metadata = BadgeMetadata(db_name=self._get_database_key(), + schema=self._get_schema_key(), + start_label=ColumnMetadata.COLUMN_NODE_LABEL, + start_key=self._get_col_key(col), + badges=col.badges, + cluster=self._get_cluster_key()) + badge_nodes = badge_metadata.create_nodes() + for node in badge_nodes: + yield node # Database, cluster, schema others = [NodeTuple(key=self._get_database_key(), @@ -498,17 +456,16 @@ def _create_next_relation(self) -> Iterator[Any]: yield col.description.get_relation(ColumnMetadata.COLUMN_NODE_LABEL, 
self._get_col_key(col), self._get_col_description_key(col, col.description)) - if col.badges: - for badge in col.badges: - yield { - RELATION_START_LABEL: ColumnMetadata.COLUMN_NODE_LABEL, - RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL, - RELATION_START_KEY: self._get_col_key(col), - RELATION_END_KEY: BadgeMetadata.get_badge_key(badge), - RELATION_TYPE: ColumnMetadata.COL_BADGE_RELATION_TYPE, - RELATION_REVERSE_TYPE: ColumnMetadata.BADGE_COL_RELATION_TYPE, - } + badge_metadata = BadgeMetadata(db_name=self._get_database_key(), + schema=self._get_schema_key(), + start_label=ColumnMetadata.COLUMN_NODE_LABEL, + start_key=self._get_col_key(col), + badges=col.badges, + cluster=self._get_cluster_key()) + badge_relations = badge_metadata.create_relation() + for relation in badge_relations: + yield relation others = [ RelTuple(start_label=TableMetadata.DATABASE_NODE_LABEL, diff --git a/databuilder/models/table_owner.py b/databuilder/models/table_owner.py index a18e9736d..3bd78f061 100644 --- a/databuilder/models/table_owner.py +++ b/databuilder/models/table_owner.py @@ -22,7 +22,7 @@ def __init__(self, schema: str, table_name: str, owners: Union[List, str], - cluster: str ='gold', + cluster: str = 'gold', ) -> None: self.db = db_name.lower() self.schema = schema.lower() diff --git a/setup.py b/setup.py index 98afe2a84..846c5295d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import setup, find_packages -__version__ = '3.3.2' +__version__ = '3.4.0' requirements = [ diff --git a/tests/unit/extractor/test_hive_table_metadata_extractor.py b/tests/unit/extractor/test_hive_table_metadata_extractor.py index f3c81a458..adca9bfb2 100644 --- a/tests/unit/extractor/test_hive_table_metadata_extractor.py +++ b/tests/unit/extractor/test_hive_table_metadata_extractor.py @@ -96,6 +96,7 @@ def test_extraction_with_single_result(self) -> None: 4), ColumnMetadata('ds', None, 'varchar', 5)], is_view=False) + self.assertEqual(expected.__repr__(), actual.__repr__()) 
# Copyright Contributors to the Amundsen project.
# SPDX-License-Identifier: Apache-2.0

import unittest
from typing import Any, Dict

from databuilder.models.badge import Badge, BadgeMetadata
from databuilder.models.neo4j_csv_serde import NODE_KEY, NODE_LABEL, \
    RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
    RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE

db = 'hive'
SCHEMA = 'BASE'
TABLE = 'TEST'
CLUSTER = 'DEFAULT'
badge1 = Badge('badge1', 'column')
badge2 = Badge('badge2', 'column')


class TestBadge(unittest.TestCase):
    """Unit tests for BadgeMetadata node/relation generation and input validation."""

    def setUp(self) -> None:
        super().setUp()
        self.badge_metadata = BadgeMetadata(db_name='hive',
                                            schema=SCHEMA,
                                            start_label='Column',
                                            start_key='hive://default.base/test/ds',
                                            cluster=CLUSTER,
                                            badges=[badge1, badge2])

    def _expected_node(self, badge: Badge) -> Dict[str, Any]:
        # Node record expected for a single badge.
        return {
            NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
            NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            BadgeMetadata.BADGE_CATEGORY: badge.category,
        }

    def _expected_relation(self, badge: Badge) -> Dict[str, Any]:
        # Relation record expected between the fixture's start entity and a badge.
        return {
            RELATION_START_LABEL: self.badge_metadata.start_label,
            RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            RELATION_START_KEY: self.badge_metadata.start_key,
            RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
            RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
        }

    def test_get_badge_key(self) -> None:
        badge_key = self.badge_metadata.get_badge_key(badge1.name)
        self.assertEqual(badge_key, badge1.name)

    def test_create_nodes(self) -> None:
        nodes = self.badge_metadata.create_nodes()
        self.assertEqual(len(nodes), 2)

        self.assertIn(self._expected_node(badge1), nodes)
        self.assertIn(self._expected_node(badge2), nodes)

    def test_bad_key_entity_match(self) -> None:
        # A table-shaped key must be rejected when the label says Column.
        column_label = 'Column'
        table_key = 'hive://default.base/test'

        self.assertRaises(Exception,
                          BadgeMetadata,
                          db_name='hive',
                          schema=SCHEMA,
                          start_label=column_label,
                          start_key=table_key,
                          cluster=CLUSTER,
                          badges=[badge1, badge2])

    def test_bad_entity_label(self) -> None:
        # 'User' is not among the labels BadgeMetadata accepts.
        user_label = 'User'
        table_key = 'hive://default.base/test'
        self.assertRaises(Exception,
                          BadgeMetadata,
                          db_name='hive',
                          schema=SCHEMA,
                          start_label=user_label,
                          start_key=table_key,
                          cluster=CLUSTER,
                          badges=[badge1, badge2])

    def test_create_relation(self) -> None:
        relations = self.badge_metadata.create_relation()
        self.assertEqual(len(relations), 2)

        self.assertIn(self._expected_relation(badge1), relations)
        self.assertIn(self._expected_relation(badge2), relations)