From 351ff814c253444b0125fe32f2a30fb26911d666 Mon Sep 17 00:00:00 2001 From: "xnuinside@gmail.com" Date: Fri, 6 May 2022 16:48:35 +0300 Subject: [PATCH] validly parse comment statement after schema & table --- .flake8 | 2 +- CHANGELOG.txt | 11 ++++ README.md | 1 + pyproject.toml | 2 +- simple_ddl_parser/ddl_parser.py | 1 + simple_ddl_parser/dialects/snowflake.py | 18 ++++++ simple_ddl_parser/dialects/sql.py | 8 +++ simple_ddl_parser/output/common.py | 17 +++--- simple_ddl_parser/output/dialects.py | 8 +-- simple_ddl_parser/parser.py | 13 ++-- tests/non_statement_tests/test_common.py | 35 +++++++++++ tests/test_simple_ddl_parser.py | 1 + tests/test_snowflake.py | 78 ++++++++++++++++++++++++ tests/test_spark_sql.py | 1 + 14 files changed, 176 insertions(+), 20 deletions(-) diff --git a/.flake8 b/.flake8 index c0660ce..7de7178 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] -exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py +exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,tests/,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py max-complexity = 10 max-line-length = 120 ignore = W503, E999 \ No newline at end of file diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 64ca8d3..fc3a1cd 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,14 @@ +**v0.26.2** + +Fixes: +1. Fixed a huge bug for incorrect parsing lines with 'USE' & 'GO' strings inside. +2. Fixed parsing for CREATE SCHEMA for Snowflake & Oracle DDLs + +Improvements: +1. Added COMMENT statement for CREATE TABLE ddl (for SNOWFLAKE dialect support) +2. Added COMMENT statement for CREATE SCHEMA ddl (for SNOWFLAKE dialect support) + + **v0.26.1** Fixes: diff --git a/README.md b/README.md index 51ce00e..97d7f84 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO - CREATE ..
CLONE statements for table, database and schema - CREATE TABLE .. CLUSTER BY .. - CONSTRAINT .. [NOT] ENFORCED +- COMMENT = in CREATE TABLE & CREATE SCHEMA statements ### BigQuery diff --git a/pyproject.toml b/pyproject.toml index 5f47a78..8270451 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "simple-ddl-parser" -version = "0.26.1" +version = "0.26.2" description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl." authors = ["Iuliia Volkova "] license = "MIT" diff --git a/simple_ddl_parser/ddl_parser.py b/simple_ddl_parser/ddl_parser.py index 9d044ba..d47f3a5 100755 --- a/simple_ddl_parser/ddl_parser.py +++ b/simple_ddl_parser/ddl_parser.py @@ -203,6 +203,7 @@ def set_lexx_tags(self, t: LexToken): def set_last_token(self, t: LexToken): self.lexer.last_token = t.type + return t def p_id(self, p): diff --git a/simple_ddl_parser/dialects/snowflake.py b/simple_ddl_parser/dialects/snowflake.py index d928a21..64b8543 100644 --- a/simple_ddl_parser/dialects/snowflake.py +++ b/simple_ddl_parser/dialects/snowflake.py @@ -14,3 +14,21 @@ def p_expression_cluster_by(self, p): p[0] = p[1] p_list = remove_par(list(p)) p[0]["cluster_by"] = p_list[-1] + + def p_table_comment(self, p): + """expr : expr option_comment + """ + p[0] = p[1] + if p[2]: + p[0].update(p[2]) + + def p_option_comment(self, p): + """option_comment : ID STRING + | ID DQ_STRING + | COMMENT ID STRING + | COMMENT ID DQ_STRING + """ + p_list = remove_par(list(p)) + print(p_list) + if "comment" in p[1].lower(): + p[0] = {"comment": p_list[-1]} diff --git a/simple_ddl_parser/dialects/sql.py b/simple_ddl_parser/dialects/sql.py index 1c663e7..88399df 100644 --- a/simple_ddl_parser/dialects/sql.py +++ b/simple_ddl_parser/dialects/sql.py @@ -416,6 
+416,7 @@ def set_auth_property_in_schema(self, p: List, p_list: List) -> None: def p_c_schema(self, p: List) -> None: """c_schema : CREATE SCHEMA | CREATE ID SCHEMA""" + if len(p) == 4: p[0] = {"remote": True} @@ -424,6 +425,8 @@ def p_create_schema(self, p: List) -> None: | c_schema id id id | c_schema id | c_schema id DOT id + | c_schema id option_comment + | c_schema id DOT id option_comment | c_schema IF NOT EXISTS id | c_schema IF NOT EXISTS id DOT id | create_schema id id id @@ -431,9 +434,14 @@ def p_create_schema(self, p: List) -> None: | create_schema options """ p_list = list(p) + p[0] = {} auth_index = None + if "comment" in p_list[-1]: + p[0].update(p_list[-1]) + del p_list[-1] + self.add_if_not_exists(p[0], p_list) if isinstance(p_list[1], dict): p[0] = p_list[1] diff --git a/simple_ddl_parser/output/common.py b/simple_ddl_parser/output/common.py index f2e800d..f3dbc5b 100644 --- a/simple_ddl_parser/output/common.py +++ b/simple_ddl_parser/output/common.py @@ -145,19 +145,22 @@ def process_alter_and_index_result( def process_entities(tables_dict: Dict, table: Dict, output_mode: str) -> Dict: """process tables, types, sequence and etc. 
data""" - table_data = init_table_data() - table_data = d.populate_dialects_table_data(output_mode, table_data) - not_table = False + is_it_table = True + if table.get("table_name"): + table_data = init_table_data() + table_data = d.populate_dialects_table_data(output_mode, table_data) table_data.update(table) table_data = set_unique_columns(table_data) else: table_data = table - not_table = True - if not not_table: - table_data = process_not_table_item(table_data, tables_dict) + is_it_table = False + + if is_it_table: + table_data = process_is_it_table_item(table_data, tables_dict) table_data = normalize_ref_columns_in_final_output(table_data) + d.dialects_clean_up(output_mode, table_data) return table_data @@ -183,7 +186,7 @@ def result_format( return final_result -def process_not_table_item(table_data: Dict, tables_dict: Dict) -> Dict: +def process_is_it_table_item(table_data: Dict, tables_dict: Dict) -> Dict: if table_data.get("table_name"): tables_dict[(table_data["table_name"], table_data["schema"])] = table_data else: diff --git a/simple_ddl_parser/output/dialects.py b/simple_ddl_parser/output/dialects.py index e75bb36..8915911 100644 --- a/simple_ddl_parser/output/dialects.py +++ b/simple_ddl_parser/output/dialects.py @@ -13,7 +13,6 @@ "fields_terminated_by", "collection_items_terminated_by", "map_keys_terminated_by", - "comment", ] @@ -145,16 +144,17 @@ def dialects_clean_up(output_mode: str, table_data: Dict) -> Dict: key_cleaning(table_data, output_mode) update_mappers_for_table_properties = {"bigquery": update_bigquery_output} update_table_prop = update_mappers_for_table_properties.get(output_mode) - if update_table_prop: table_data = update_table_prop(table_data) if output_mode == "oracle": - for column in table_data["columns"]: + for column in table_data.get("columns", []): column = add_additional_oracle_keys_in_column(column) elif output_mode == "snowflake": - for column in table_data["columns"]: + # can be no columns if it is a create database or 
create schema + for column in table_data.get("columns", []): column = add_additional_snowflake_keys_in_column(column) + elif output_mode == "redshift": table_data = process_redshift_dialect(table_data) return table_data diff --git a/simple_ddl_parser/parser.py b/simple_ddl_parser/parser.py index 297e9e4..961c7ef 100755 --- a/simple_ddl_parser/parser.py +++ b/simple_ddl_parser/parser.py @@ -163,16 +163,16 @@ def check_new_statement_start(self, line: str) -> bool: return self.new_statement def check_line_on_skip_words(self) -> bool: - skip_line_words = ["USE", "GO"] + skip_regex = r"^(GO|USE)\b" self.skip = False - for word in skip_line_words: - if self.line.startswith(word): - self.skip = True - break + + if re.match(skip_regex, self.line.upper()): + self.skip = True return self.skip def add_line_to_statement(self) -> str: + if ( self.line and not self.skip @@ -206,7 +206,6 @@ def process_line( self.pre_process_line() self.line = self.line.strip().replace("\n", "").replace("\t", "") - self.skip = self.check_line_on_skip_words() self.parse_set_statement() @@ -214,7 +213,6 @@ def process_line( self.check_new_statement_start(self.line) final_line = self.line.endswith(";") and not self.set_was_in_line - self.add_line_to_statement() if final_line or self.new_statement: @@ -237,6 +235,7 @@ def process_statement(self) -> None: self.statement = None def parse_statement(self) -> None: + _parse_result = yacc.parse(self.statement) if _parse_result: self.tables.append(_parse_result) diff --git a/tests/non_statement_tests/test_common.py b/tests/non_statement_tests/test_common.py index 487367b..6b4c48d 100644 --- a/tests/non_statement_tests/test_common.py +++ b/tests/non_statement_tests/test_common.py @@ -211,3 +211,38 @@ def test_flag_normalize_names_mixed_usage(): 'ddl_properties': [] } assert expected == result + + +def test_parsing_go_and_use_correctly(): + ddl=""" + create TABLE ASIN.EXCLUSION ( + USER_COMMENT VARCHAR(100), + ); + """ + result = DDLParser(ddl, 
normalize_names=True).run(output_mode="hql") + expected = [{'alter': {}, + 'checks': [], + 'collection_items_terminated_by': None, + 'columns': [{'check': None, + 'default': None, + 'name': 'USER_COMMENT', + 'nullable': True, + 'references': None, + 'size': 100, + 'type': 'VARCHAR', + 'unique': False}], + 'comment': None, + 'external': False, + 'fields_terminated_by': None, + 'index': [], + 'lines_terminated_by': None, + 'location': None, + 'map_keys_terminated_by': None, + 'partitioned_by': [], + 'primary_key': [], + 'row_format': None, + 'schema': 'ASIN', + 'stored_as': None, + 'table_name': 'EXCLUSION', + 'tablespace': None}] + assert expected == result diff --git a/tests/test_simple_ddl_parser.py b/tests/test_simple_ddl_parser.py index 93160c4..f07a48a 100644 --- a/tests/test_simple_ddl_parser.py +++ b/tests/test_simple_ddl_parser.py @@ -1234,6 +1234,7 @@ def test_comments_in_columns(): "tablespace": None, "schema": None, "table_name": "test_table", + "comment": "'This is test table'", } ], "types": [], diff --git a/tests/test_snowflake.py b/tests/test_snowflake.py index b13d561..8c4c922 100644 --- a/tests/test_snowflake.py +++ b/tests/test_snowflake.py @@ -184,3 +184,81 @@ def test_enforced(): "types": [], } assert expected == result + + +def test_table_comment_parsed_validly(): + + ddl=""" + create TABLE ASIN.EXCLUSION ( + USER_COMMENT VARCHAR(100), + PROCESS_SQN NUMBER(10,0) NOT NULL, + constraint PK_EXCLUSION primary key (ASIN) + ) COMMENT ='ASINs to be excluded from the ASIN List File' + ; + """ + result_one = DDLParser(ddl,normalize_names=True).run(output_mode="snowflake") + + + ddl=""" + create TABLE ASIN.EXCLUSION ( + USER_COMMENT VARCHAR(100), + PROCESS_SQN NUMBER(10,0) NOT NULL, + constraint PK_EXCLUSION primary key (ASIN) + ) COMMENT='ASINs to be excluded from the ASIN List File' + ; + """ + result_two = DDLParser(ddl,normalize_names=True).run(output_mode="snowflake") + + expected = [{'alter': {}, + 'checks': [], + 'clone': None, + 'columns': 
[{'check': None, + 'default': None, + 'name': 'USER_COMMENT', + 'nullable': True, + 'references': None, + 'size': 100, + 'type': 'VARCHAR', + 'unique': False}, + {'check': None, + 'default': None, + 'name': 'PROCESS_SQN', + 'nullable': False, + 'references': None, + 'size': (10, 0), + 'type': 'NUMBER', + 'unique': False}], + 'constraints': {'primary_keys': [{'columns': ['ASIN'], + 'constraint_name': 'PK_EXCLUSION'}]}, + 'comment': "'ASINs to be excluded from the ASIN List File'", + 'index': [], + 'partitioned_by': [], + 'primary_key': ['ASIN'], + 'primary_key_enforced': None, + 'schema': 'ASIN', + 'table_name': 'EXCLUSION', + 'tablespace': None}] + + assert expected == result_one == result_two + + +def test_schema_parsed_normally(): + + ddl=""" + create schema my_schema; + """ + result = DDLParser(ddl,normalize_names=True).run(output_mode="snowflake") + + expected = [{'schema_name': 'my_schema'}] + + assert result == expected + + +def test_comment_on_create_schema(): + + ddl=""" + create schema my_schema comment='this is comment1'; + """ + result = DDLParser(ddl,normalize_names=True).run(output_mode="snowflake") + expected = [{'comment': "'this is comment1'", 'schema_name': 'my_schema'}] + assert result == expected diff --git a/tests/test_spark_sql.py b/tests/test_spark_sql.py index bed9945..f66a58b 100644 --- a/tests/test_spark_sql.py +++ b/tests/test_spark_sql.py @@ -57,6 +57,7 @@ def test_spark_sql_using(): "tablespace": None, "tblproperties": {"'foo'": "'bar'"}, "using": "CSV", + 'comment': "'this is a comment'", } ], "types": [],