From d77fc90e23520baa865685ce38456700bcefd82a Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 14 Sep 2023 18:31:58 -0400 Subject: [PATCH 01/50] Added new section for DQ monitors to DQDL spec - Monitors are used to track additional metrics. - Added new class called DQMonitor, added a field to DQRuleset and added tests. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 8 +- .../ml/dataquality/dqdl/model/DQMonitor.java | 35 ++++++++ .../ml/dataquality/dqdl/model/DQRuleset.java | 18 ++++ .../dqdl/parser/DQDLParserListener.java | 53 ++++++++++- .../dataquality/dqdl/model/DQMonitorTest.java | 88 +++++++++++++++++++ .../dataquality/dqdl/model/DQRulesetTest.java | 33 +++++++ .../dqdl/parser/InvalidDQRulesetTest.java | 22 +++++ 7 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java create mode 100644 tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index e5cf0d4..6884e18 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -5,6 +5,7 @@ import CommonLexerRules; metadataSectionStart: 'Metadata'; dataSourcesSectionStart: 'DataSources'; rulesSectionStart: 'Rules'; +monitorsSectionStart: 'Monitors'; // Expressions dateNow: 'now()'; @@ -68,6 +69,7 @@ durationBasedCondition: | IN durationExpressionArray; ruleType: IDENTIFIER; +monitorType: IDENTIFIER; parameter: (QUOTED_STRING | INT | DIGIT); condition: @@ -79,6 +81,7 @@ condition: withThresholdCondition: 'with' 'threshold' numberBasedCondition; dqRule: ruleType parameter* condition? 
withThresholdCondition?; +dqMonitor: monitorType parameter*; topLevelRule: dqRule @@ -87,12 +90,15 @@ topLevelRule: // Rules Definition dqRules: topLevelRule (COMMA topLevelRule)*; +dqMonitors: dqMonitor (COMMA dqMonitor)*; // Top Level Document rules: rulesSectionStart EQUAL_TO LBRAC dqRules RBRAC | rulesSectionStart EQUAL_TO LBRAC RBRAC; // empty array +monitors: monitorsSectionStart EQUAL_TO LBRAC dqMonitors RBRAC; + // This dictionary does not support nested dictionaries. Just strings and arrays. dictionary: LCURL pair (COMMA pair)* RCURL; pair: QUOTED_STRING COLON pairValue; @@ -102,4 +108,4 @@ array: LBRAC QUOTED_STRING (COMMA QUOTED_STRING)* RBRAC; metadata: metadataSectionStart EQUAL_TO dictionary; dataSources: dataSourcesSectionStart EQUAL_TO dictionary; -document: metadata? dataSources? rules; +document: metadata? dataSources? rules monitors?; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java new file mode 100644 index 0000000..7c877a9 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java @@ -0,0 +1,35 @@ +/* + * DQMonitor.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.util.Map; + +@AllArgsConstructor +@Getter +public class DQMonitor { + private final String ruleType; + private final Map parameters; + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(ruleType); + + if (parameters != null) { + parameters.values().forEach(p -> sb.append(" ").append("\"").append(p).append("\"")); + } + + return sb.toString(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java index d836b1e..6fb8b0b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java @@ -30,14 +30,20 @@ public class DQRuleset { private final String primarySourceName; private final List additionalDataSourcesNames; private final List rules; + private final List monitors; private static final String LINE_SEP = System.lineSeparator(); public DQRuleset(final List rules) { + this(rules, new ArrayList<>()); + } + + public DQRuleset(final List rules, final List monitors) { this.metadata = new HashMap<>(); this.primarySourceName = null; this.additionalDataSourcesNames = new ArrayList<>(); this.rules = rules; + this.monitors = monitors; } @Override @@ -81,6 +87,14 @@ public String toString() { .collect(Collectors.joining("," + LINE_SEP)) + LINE_SEP + "]"; + String monitorsStr = ""; + if (!monitors.isEmpty()) { + monitorsStr = "Monitors = [" + LINE_SEP + + monitors.stream() + .map(i -> " " + i) + .collect(Collectors.joining("," + LINE_SEP)) + + LINE_SEP + "]"; + } StringBuilder sb = new StringBuilder(); if (!metadataStr.isEmpty()) { @@ -93,6 +107,10 @@ public String toString() { sb.append(rulesStr); + if (!monitorsStr.isEmpty()) { + sb.append(LINE_SEP).append(LINE_SEP).append(monitorsStr); + } + return sb.toString(); } } diff 
--git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 657df11..261741d 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -10,6 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.parser; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQMonitor; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; @@ -50,6 +51,7 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private String primarySource; private List additionalSources; private final List dqRules = new ArrayList<>(); + private final List dqMonitors = new ArrayList<>(); private static final String METADATA_VERSION_KEY = "Version"; private static final Set ALLOWED_METADATA_KEYS; @@ -73,7 +75,7 @@ public DQDLParserListener(DQDLErrorListener errorListener) { public Either, DQRuleset> getParsedRuleset() { if (errorMessages.isEmpty() && errorListener.getErrorMessages().isEmpty()) { - return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules)); + return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqMonitors)); } else { List allErrorMessages = new ArrayList<>(); allErrorMessages.addAll(errorMessages); @@ -176,6 +178,23 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu } } + @Override + public void enterDqMonitors(DataQualityDefinitionLanguageParser.DqMonitorsContext dqMonitorsContext) { + if (!errorMessages.isEmpty()) { + return; + } + + for (DataQualityDefinitionLanguageParser.DqMonitorContext dmc: dqMonitorsContext.dqMonitor()) { + Either dqMonitorEither = getDQMonitor(dmc); + if 
(dqMonitorEither.isLeft()) { + errorMessages.add(dqMonitorEither.getLeft()); + return; + } else { + dqMonitors.add(dqMonitorEither.getRight()); + } + } + } + private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); @@ -254,6 +273,38 @@ private Either getDQRule( ); } + private Either getDQMonitor( + DataQualityDefinitionLanguageParser.DqMonitorContext dqMonitorContext) { + String monitorType = dqMonitorContext.monitorType().getText(); + List parameters = dqMonitorContext.parameter().stream() + .map(p -> p.getText().replaceAll("\"", "")) + .collect(Collectors.toList()); + + // We just use the DQ Rule names to validate which monitor names to allow. + // This might change closer to re:Invent, but keeping it simple for now. + Optional optionalDQMonitorType = DQRuleType.getRuleType(monitorType, parameters.size()); + + if (!optionalDQMonitorType.isPresent()) { + return Either.fromLeft(String.format("Monitor Type: %s is not valid", monitorType)); + } + + DQRuleType dqRuleType = optionalDQMonitorType.get(); + + if (dqRuleType.getReturnType().equals("BOOLEAN")) { + return Either.fromLeft(String.format("Monitor Type: %s is not supported", monitorType)); + } + + Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); + + if (errorMessage.isPresent()) { + return Either.fromLeft(String.format(errorMessage.get() + ": %s", monitorType)); + } + + Map parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + + return Either.fromRight(new DQMonitor(monitorType, parameterMap)); + } + private Either parseCondition( DQRuleType ruleType, String returnType, diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java new file mode 100644 index 0000000..37d733c --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java @@ 
-0,0 +1,88 @@ +/* + * DQMonitorTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import com.amazonaws.glue.ml.dataquality.dqdl.exception.InvalidDataQualityRulesetException; +import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class DQMonitorTest { + DQDLParser parser = new DQDLParser(); + + @Test + void test_singleMonitor() { + String column = "colA"; + String ruleset = String.format("Rules = [ IsComplete \"%s\" ] Monitors = [ Completeness \"%s\" ]", column, column); + + try { + DQRuleset dqRuleset = parser.parse(ruleset); + DQMonitor dqMonitor = dqRuleset.getMonitors().get(0); + assertEquals("Completeness", dqMonitor.getRuleType()); + assertEquals(1, dqMonitor.getParameters().size()); + assertTrue(dqMonitor.getParameters().containsValue(column)); + } catch (InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + + @ParameterizedTest + @MethodSource("provideRawMonitors") + void test_monitorParsingAndGeneratingWithParser(String monitor) { + try { + DQRuleset dqRuleset = parser.parse(String.format("Rules = [ IsComplete \"colA\" ] Monitors = [ %s ]", monitor)); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals(1, dqRuleset.getMonitors().size()); + + DQMonitor dqMonitor = dqRuleset.getMonitors().get(0); + String dqMonitorAsString = dqMonitor.toString(); + assertEquals(monitor, dqMonitorAsString); + } catch 
(InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + + private static Stream provideRawMonitors() { + return Stream.of( + Arguments.of("RowCount"), + Arguments.of("RowCountMatch \"reference\""), + Arguments.of("Completeness \"col_1\""), + Arguments.of("ColumnCount"), + Arguments.of("ColumnCorrelation \"col_1\" \"col_2\""), + Arguments.of("Uniqueness \"col_1\""), + Arguments.of("Sum \"col_A-B.C\""), + Arguments.of("Mean \"col_A-B.CD\""), + Arguments.of("StandardDeviation \"col_A-B.CD\""), + Arguments.of("Entropy \"col_A-B.CD\""), + Arguments.of("DistinctValuesCount \"col_A-B.CD\""), + Arguments.of("UniqueValueRatio \"col_A-B.CD\""), + Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\""), + Arguments.of("ReferentialIntegrity \"col-A,col-B\" \"reference.{col-A1,col-A2}\""), + Arguments.of("DatasetMatch \"reference\" \"ID1,ID2\""), + Arguments.of("DatasetMatch \"reference\" \"ID1,ID2\" \"colA,colB,colC\""), + Arguments.of("DatasetMatch \"reference\" \"ID1->ID11,ID2->ID22\" \"colA->colAA\""), + Arguments.of("SchemaMatch \"ref-1\""), + Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(colB)\""), + Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\""), + Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\""), + Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\""), + Arguments.of("CustomSql \"select count(*) from primary\"") + ); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 5468a96..8124ad1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -181,6 +181,39 @@ void test_isPrimaryCheckWithMetadataAndSourcesAndNoPrimarySourceToString() { assertEquals(dqdlFormatted, dqRuleset.toString()); } + @Test + void test_isPrimaryCheckWithMetadataAndSourcesAndMonitors() { + String 
dqdl = "Metadata = { \"Version\": \"1.0\" }" + LINE_SEP + + "DataSources = { \"Primary\": \"orders-table\", \"AdditionalDataSources\": [ \"ref-table\" ] } " + LINE_SEP + + "Rules = [ IsPrimaryKey \"colA\" ] " + LINE_SEP + + "Monitors = [ Completeness \"colA\" ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals("orders-table", dqRuleset.getPrimarySourceName()); + assertEquals(1, dqRuleset.getAdditionalDataSourcesNames().size()); + assertEquals("ref-table", dqRuleset.getAdditionalDataSourcesNames().get(0)); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("IsPrimaryKey", dqRuleset.getRules().get(0).getRuleType()); + assertEquals(1, dqRuleset.getMonitors().size()); + assertEquals("Completeness", dqRuleset.getMonitors().get(0).getRuleType()); + + String dqdlFormatted = + "Metadata = {" + LINE_SEP + + " \"Version\": \"1.0\"" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "DataSources = {" + LINE_SEP + + " \"Primary\": \"orders-table\"," + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "Rules = [" + LINE_SEP + + " IsPrimaryKey \"colA\"" + LINE_SEP + + "]" + LINE_SEP + LINE_SEP + + "Monitors = [" + LINE_SEP + + " Completeness \"colA\"" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + @Disabled void test_jobStatusRuleWithEqualityCheck() { String dqdl = "Rules = [ JobStatus = \"SUCCEEDED\" ]"; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 0603b40..b5662b1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -73,6 +73,17 @@ private static Stream provideInvalidRulesets() { ); } + private static Stream provideInvalidRulesetsWithMonitors() { + return Stream.of( + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ 
]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ IsComplete \"colA\" ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", Foo ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\" > 1.0 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") + ); + } + @ParameterizedTest @MethodSource("provideInvalidRulesets") void test_invalidRulesetParsing(String ruleset) { @@ -83,4 +94,15 @@ void test_invalidRulesetParsing(String ruleset) { System.out.println(e.getMessage()); } } + + @ParameterizedTest + @MethodSource("provideInvalidRulesetsWithMonitors") + void test_invalidRulesetWithMonitorsParsing(String ruleset) { + try { + parser.parse(ruleset); + fail("Ruleset validation exception was expected"); + } catch (InvalidDataQualityRulesetException e) { + System.out.println(e.getMessage()); + } + } } From 92228b623947af96bf9b938c3e9b808cae4e23aa Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 19 Sep 2023 15:19:20 -0400 Subject: [PATCH 02/50] Added support for function call and binary operand expressions in number based condition - The function call is resolved via an Evaluator class, which was added. - Each operand has an updated evaluate function. 
--- .../dqdl/DataQualityDefinitionLanguage.g4 | 18 ++- .../condition/number/AtomicNumberOperand.java | 37 ++++++ .../number/BinaryExpressionOperand.java | 65 ++++++++++ .../condition/number/FunctionCallOperand.java | 57 +++++++++ .../number/NumberBasedCondition.java | 29 +++-- .../condition/number/NumericOperand.java | 31 +++++ .../condition/number/OperandEvaluator.java | 20 +++ .../dqdl/parser/DQDLParserListener.java | 116 +++++++++++++++--- .../ml/dataquality/dqdl/model/DQRuleTest.java | 8 +- .../dqdl/model/condition/ConditionTest.java | 4 +- .../condition/number/NumericOperandTest.java | 110 +++++++++++++++++ 11 files changed, 463 insertions(+), 32 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java create mode 100644 tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 6884e18..265378c 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -20,7 +20,7 @@ dateExpression: | dateNow | LPAREN dateNow dateExpressionOp durationExpression RPAREN; -number: +atomicNumber: DIGIT | NEGATIVE DIGIT | INT @@ -28,6 +28,22 @@ number: | DECIMAL | NEGATIVE DECIMAL; +functionParameters: + number + | number (COMMA number)*; + +functionCall: + IDENTIFIER LPAREN RPAREN + | IDENTIFIER LPAREN functionParameters RPAREN; + +numberOp: '+' | '-' | '/' | '*'; + 
+number: + number numberOp number + | functionCall + | LPAREN number RPAREN + | atomicNumber; + quotedString: QUOTED_STRING; matchesRegexCondition: 'matches' quotedString; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java new file mode 100644 index 0000000..a433ed0 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java @@ -0,0 +1,37 @@ +/* + * AtomicNumberOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; + +/* + * Atomic number operands are decimal numbers like 1.0, 3.14 etc that can be used in number based conditions. + * They are used for defining static thresholds on rules. + */ +public class AtomicNumberOperand extends NumericOperand { + public AtomicNumberOperand(final String operand) { + super(operand); + } + + @Override + public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { + return Double.parseDouble(getOperand()); + } + + @Override + public String toString() { + if (this.isParenthesized()) { + return String.format("(%s)", getOperand()); + } else { + return getOperand(); + } + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java new file mode 100644 index 0000000..7a5a470 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java @@ -0,0 +1,65 @@ +/* + * BinaryExpressionOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; + +/* + * A BinaryExpressionOperand is a numerical expression that consists of two operands and an operator. + * The operands can themselves be binary expression operands or atomic number operands or function call operands. + * The operator can be one of: +, -, /, * + * The purpose of this operand is for combining with a dynamic function call operand to create dynamic rule thresholds. + */ +public class BinaryExpressionOperand extends NumericOperand { + private final String operator; + private final NumericOperand operand1; + private final NumericOperand operand2; + + public BinaryExpressionOperand(final String operand, + final String operator, + final NumericOperand operand1, + final NumericOperand operand2, + final boolean isParenthesized) { + super(operand, isParenthesized); + this.operator = operator; + this.operand1 = operand1; + this.operand2 = operand2; + } + + public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { + Double operand1Evaluated = operand1.evaluate(dqRule, evaluator); + Double operand2Evaluated = operand2.evaluate(dqRule, evaluator); + + switch (operator) { + case "+": + return operand1Evaluated + operand2Evaluated; + case "-": + return operand1Evaluated - operand2Evaluated; + case "/": + return operand1Evaluated / operand2Evaluated; + case "*": + return operand1Evaluated * operand2Evaluated; + default: + throw new IllegalArgumentException("Bad operator"); + } + } + + @Override + public String toString() { + String formatted = String.format("%s %s %s", + this.operand1.toString(), this.operator, this.operand2.toString()); + if (this.isParenthesized()) { + return String.format("(%s)", formatted); + } else { + return formatted; + } + } +} diff --git 
a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java new file mode 100644 index 0000000..dae99f9 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java @@ -0,0 +1,57 @@ +/* + * FunctionCallOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import lombok.Getter; + +import java.util.List; +import java.util.stream.Collectors; + +/* + * A Function Call operand is a special operand that takes operands as parameters and returns a number. + * The parameters can themselves be function call operands, or atomic number operands or binary expression operands. + * Each function must be implemented by an instance of "OperandEvaluator", provided at the time of evaluation. + * Through the use of function call operands, we introduce the concept of dynamic rules in DQDL. 
+ */ +@Getter +public class FunctionCallOperand extends NumericOperand { + private final String functionName; + private final List operands; + + public FunctionCallOperand(final String operand, + final String functionName, + final List operands) { + super(operand); + this.functionName = functionName; + this.operands = operands; + } + + @Override + public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { + return evaluator.evaluate( + dqRule, + this.functionName, + this.operands.stream().map(o -> o.evaluate(dqRule, evaluator)).collect(Collectors.toList()) + ); + } + + @Override + public String toString() { + String params = this.operands.stream().map(NumericOperand::toString).collect(Collectors.joining(",")); + String formatted = String.format("%s(%s)", this.functionName, params); + if (this.isParenthesized()) { + return String.format("(%s)", formatted); + } else { + return formatted; + } + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index 7ec0ad9..a3eb9b1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -10,6 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; @@ -22,20 +23,22 @@ @EqualsAndHashCode(callSuper = true) public class NumberBasedCondition extends Condition { private final NumberBasedConditionOperator operator; - private final List operands; + private final List operands; public NumberBasedCondition(final String conditionAsString, final NumberBasedConditionOperator operator, - 
final List operands) { + final List operands) { super(conditionAsString); this.operator = operator; this.operands = operands; } - public Boolean evaluate(Double metric) { + public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { if (operands == null) return false; - List operandsAsDouble = operands.stream().map(Double::parseDouble).collect(Collectors.toList()); + List operandsAsDouble = operands.stream() + .map(operand -> operand.evaluate(dqRule, evaluator)) + .collect(Collectors.toList()); switch (operator) { case BETWEEN: @@ -69,19 +72,23 @@ public String getFormattedCondition() { switch (operator) { case BETWEEN: - return String.format("between %s and %s", operands.get(0), operands.get(1)); + return String.format("between %s and %s", operands.get(0).toString(), operands.get(1).toString()); case GREATER_THAN: - return String.format("> %s", operands.get(0)); + return String.format("> %s", operands.get(0).toString()); case GREATER_THAN_EQUAL_TO: - return String.format(">= %s", operands.get(0)); + return String.format(">= %s", operands.get(0).toString()); case LESS_THAN: - return String.format("< %s", operands.get(0)); + return String.format("< %s", operands.get(0).toString()); case LESS_THAN_EQUAL_TO: - return String.format("<= %s", operands.get(0)); + return String.format("<= %s", operands.get(0).toString()); case EQUALS: - return String.format("= %s", operands.get(0)); + return String.format("= %s", operands.get(0).toString()); case IN: - return String.format("in [%s]", String.join(",", operands)); + return String.format("in [%s]", + operands.stream() + .map(NumericOperand::toString) + .collect(Collectors.joining(",")) + ); default: break; } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java new file mode 100644 index 0000000..2d3f3f2 --- /dev/null +++ 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java @@ -0,0 +1,31 @@ +/* + * NumericOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.io.Serializable; + +@AllArgsConstructor +@Getter +public abstract class NumericOperand implements Serializable { + private final String operand; + private final boolean isParenthesized; + + public NumericOperand(final String operand) { + this.operand = operand; + isParenthesized = false; + } + + public abstract Double evaluate(DQRule dqRule, OperandEvaluator evaluator); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java new file mode 100644 index 0000000..9ed19c8 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java @@ -0,0 +1,20 @@ +/* + * OperandEvaluator.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; + +import java.io.Serializable; +import java.util.List; + +public abstract class OperandEvaluator implements Serializable { + public abstract Double evaluate(DQRule rule, String functionName, List operands); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 261741d..1000aa1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -21,8 +21,12 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.AtomicNumberOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.BinaryExpressionOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.FunctionCallOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; @@ -30,7 +34,6 @@ import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; import 
com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; -import org.antlr.v4.runtime.RuleContext; import java.util.ArrayList; import java.util.Arrays; @@ -401,34 +404,115 @@ private Optional parseNumberBasedCondition( Condition condition = null; if (ctx.BETWEEN() != null && ctx.number().size() == 2) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.BETWEEN, - Arrays.asList(ctx.number(0).getText(), ctx.number(1).getText())); + Optional operand1 = parseNumericOperand(ctx.number(0), false); + Optional operand2 = parseNumericOperand(ctx.number(1), false); + + if (operand1.isPresent() && operand2.isPresent()) { + condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.BETWEEN, + Arrays.asList(operand1.get(), operand2.get()) + ); + } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO, + Collections.singletonList(operand.get())); + } } else if (ctx.GREATER_THAN() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.GREATER_THAN, + Collections.singletonList(operand.get())); + } } else if (ctx.LESS_THAN() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), 
false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.LESS_THAN, + Collections.singletonList(operand.get())); + } } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO, + Collections.singletonList(operand.get())); + } } else if (ctx.EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.EQUALS, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.EQUALS, + Collections.singletonList(operand.get())); + } } else if (ctx.IN() != null && ctx.numberArray() != null && ctx.numberArray().number().size() > 0) { - List numbers = ctx.numberArray().number().stream() - .map(RuleContext::getText) + List> numbers = ctx.numberArray().number() + .stream() + .map(op -> parseNumericOperand(op, false)) .collect(Collectors.toList()); - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.IN, numbers); + if (numbers.stream().allMatch(Optional::isPresent)) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.IN, + numbers.stream().map(Optional::get).collect(Collectors.toList())); + } } return Optional.ofNullable(condition); } + private Optional parseNumericOperand( + DataQualityDefinitionLanguageParser.NumberContext numberContext, boolean isParenthesized + ) { + if (numberContext.numberOp() != null) { + Optional operand1 = parseNumericOperand(numberContext.number(0), false); + 
Optional operand2 = parseNumericOperand(numberContext.number(1), false); + if (operand1.isPresent() && operand2.isPresent()) { + return Optional.of( + new BinaryExpressionOperand( + numberContext.getText(), + numberContext.numberOp().getText(), + operand1.get(), operand2.get(), + isParenthesized + ) + ); + } else { + return Optional.empty(); + } + } else if (numberContext.functionCall() != null) { + DataQualityDefinitionLanguageParser.FunctionCallContext fcc = numberContext.functionCall(); + String functionName = fcc.IDENTIFIER().getText(); + List functionParameters = new ArrayList<>(); + + if (fcc.functionParameters() != null) { + List> parameters = fcc.functionParameters().number() + .stream() + .map(op -> parseNumericOperand(op, false)) + .collect(Collectors.toList()); + + if (parameters.stream().allMatch(Optional::isPresent)) { + functionParameters = parameters.stream().map(Optional::get).collect(Collectors.toList()); + return Optional.of( + new FunctionCallOperand(fcc.getText(), functionName, functionParameters) + ); + } + } else { + // No parameter function + return Optional.of( + new FunctionCallOperand(fcc.getText(), functionName, functionParameters) + ); + } + } else if (numberContext.LPAREN() != null) { + return parseNumericOperand(numberContext.number(0), true); + } else if (numberContext.atomicNumber() != null) { + return Optional.of(new AtomicNumberOperand(numberContext.getText())); + } + + return Optional.empty(); + } + private Optional parseStringBasedCondition( DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx ) { diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 98aeed5..c6833e8 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -75,12 +75,14 @@ private static Stream provideRawRules() { Arguments.of("IsPrimaryKey \"colA\" \"colB\" 
\"colC\""), Arguments.of("RowCount = 100"), Arguments.of("RowCount = -100"), + Arguments.of("RowCount between (0.9 * average(last(10))) and 1.1 * average(last(10))"), Arguments.of("RowCountMatch \"reference\" = 1.0"), Arguments.of("RowCountMatch \"reference\" >= 0.95"), Arguments.of("RowCountMatch \"reference\" between 0.8 and 0.98"), Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"), Arguments.of("IsComplete \"col_1\""), Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"), + Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), Arguments.of("ColumnDataType \"col_1\" = \"String\""), Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"), @@ -88,6 +90,8 @@ private static Stream provideRawRules() { Arguments.of("ColumnNamesMatchPattern \"aws_.*_[a-zA-Z0-9]+\""), Arguments.of("ColumnExists \"load_dt\""), Arguments.of("ColumnCount >= 100"), + Arguments.of("ColumnCount = avg(std(last(10)))"), + Arguments.of("ColumnCount = avg(std(last(percentile(1,2,3))))"), Arguments.of("ColumnCount > -100.123456"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between -0.44444 and 0.888888"), @@ -251,12 +255,12 @@ void test_serializationDeserializationWithNumericExpression() assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); - assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4)); + assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4, dqRule, null)); byte[] serialized = serialize(dqRule); DQRule deserialized = deserialize(serialized, DQRule.class); assertEquals(dqRule.toString(), deserialized.toString()); assertEquals(NumberBasedCondition.class, deserialized.getCondition().getClass()); - 
assertFalse(((NumberBasedCondition) deserialized.getCondition()).evaluate(0.9)); + assertFalse(((NumberBasedCondition) deserialized.getCondition()).evaluate(0.9, dqRule, null)); } @Test diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java index 0c51b4e..1ffd830 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java @@ -117,7 +117,7 @@ void test_ruleParsingAndVerifyingNumberBasedCondition(String rule, Double metric NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); assertTrue(dqRule.toString().contains(condition.getFormattedCondition())); - assertEquals(shouldRulePass, condition.evaluate(metric)); + assertEquals(shouldRulePass, condition.evaluate(metric, dqRule, null)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } @@ -136,7 +136,7 @@ void test_ruleParsingAndVerifyingNumberBasedThresholdCondition(String rule, Doub NumberBasedCondition thresholdCondition = (NumberBasedCondition) dqRule.getThresholdCondition(); assertTrue(dqRule.toString().contains(thresholdCondition.getFormattedCondition())); - assertEquals(shouldRulePass, thresholdCondition.evaluate(metric)); + assertEquals(shouldRulePass, thresholdCondition.evaluate(metric, dqRule, null)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java new file mode 100644 index 0000000..08a1b71 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java @@ -0,0 +1,110 @@ +/* + * NumericOperandTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.exception.InvalidDataQualityRulesetException; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; +import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class NumericOperandTest { + private static final String MULTIPLY_FUNCTION_NAME = "multiply"; + private static final String AVG_FUNCTION_NAME = "avg"; + + private final DQDLParser parser = new DQDLParser(); + private final OperandEvaluator evaluator = new OperandEvaluator() { + @Override + public Double evaluate(DQRule rule, String functionName, List operands) { + if (MULTIPLY_FUNCTION_NAME.equals(functionName)) { + return operands.stream().reduce(1.0, (a, b) -> a * b); + } else if (AVG_FUNCTION_NAME.equals(functionName)) { + return operands.stream().reduce(0.0, (a, b) -> a + b) / operands.size(); + } else { + throw new RuntimeException("Function not supported"); + } + } + }; + + @Test + public void test_functionCallWorksWithAtomicNumberOperands() throws InvalidDataQualityRulesetException { + String rule = "RowCount = multiply(1,2,3)"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + + 
assertTrue(condition.evaluate(6.0, dqRule, evaluator)); + assertFalse(condition.evaluate(3.0, dqRule, evaluator)); + } + + @Test + public void test_functionCallWorksWithNestedFunctionCallOperands() throws InvalidDataQualityRulesetException { + String rule = "RowCount = multiply(avg(2,4), avg(10,20))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + assertTrue(condition.evaluate(45.0, dqRule, evaluator)); + assertFalse(condition.evaluate(40.0, dqRule, evaluator)); + } + + @Test + public void test_functionCallWorksInBinaryExpression() throws InvalidDataQualityRulesetException { + String rule = "RowCount = 2.0 * multiply(avg(2,4), avg(10,20))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + assertTrue(condition.evaluate(90.0, dqRule, evaluator)); + assertFalse(condition.evaluate(45.0, dqRule, evaluator)); + } + + @Test + public void test_functionCallWorksInSimpleBinaryExpression() throws InvalidDataQualityRulesetException { + String rule = "RowCount = 8.0 * (5.0 - (1.0 + (4.0 / 2.0)))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + 
assertTrue(condition.evaluate(16.0, dqRule, evaluator)); + assertFalse(condition.evaluate(8.0, dqRule, evaluator)); + } +} From 977356d48ee4ba87f681b8225321bc65734e3f3b Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 5 Oct 2023 19:06:48 -0400 Subject: [PATCH 03/50] Added tests for Anomaly detection rule - Tests for invalid rules/valid rules were added. --- configuration/rules/rules-config.json | 15 ++++++++++++++- .../ml/dataquality/dqdl/model/DQRuleTest.java | 9 ++++++--- .../dqdl/parser/InvalidDQRulesetTest.java | 3 ++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index bb4dea8..1bf2ded 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -345,6 +345,19 @@ } ], "return_type": "NUMBER" + }, + { + "rule_type_name": "DetectAnomalies", + "description": "Checks if the current value of the metric is anomalous with respect to the historical values", + "parameters": [ + { + "type": "String", + "name": "MetricParameter", + "description": "The parameters required to evaluate the metric. 
The first parameter must be the metric name.", + "is_var_arg": true + } + ], + "return_type": "BOOLEAN" } ] -} \ No newline at end of file +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index c6833e8..cd38d8c 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -150,9 +150,12 @@ private static Stream provideRawRules() { Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.1"), Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\" between 0.8 and 0.9"), Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\" >= 0.95"), - Arguments.of( "CustomSql \"select count(*) from primary\" > 0"), - Arguments.of( "CustomSql \"select col-A from primary\""), - Arguments.of( "CustomSql \"select col-A from primary\" with threshold > 0.5") + Arguments.of("CustomSql \"select count(*) from primary\" > 0"), + Arguments.of("CustomSql \"select col-A from primary\""), + Arguments.of("CustomSql \"select col-A from primary\" with threshold > 0.5"), + Arguments.of("DetectAnomalies \"RowCount\""), + Arguments.of("DetectAnomalies \"Completeness\" \"colA\""), + Arguments.of("DetectAnomalies \"ColumnCorrelation\" \"colA\" \"colB\"") ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index b5662b1..b242e3a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -69,7 +69,8 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), - 
Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]") + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), + Arguments.of("Rules = [ DetectAnomalies ]") ); } From 04e97bc80cdd5e060abdacca18e140c052cab4f1 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Thu, 12 Oct 2023 17:03:45 -0400 Subject: [PATCH 04/50] New monitor type, AllStatistics --- configuration/rules/rules-config.json | 13 +++++++++++++ .../glue/ml/dataquality/dqdl/model/DQRuleType.java | 6 +++++- .../dataquality/dqdl/parser/DQDLParserListener.java | 4 ++++ .../ml/dataquality/dqdl/model/DQMonitorTest.java | 3 ++- .../dqdl/parser/InvalidDQRulesetTest.java | 4 +++- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 1bf2ded..f53000c 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -358,6 +358,19 @@ } ], "return_type": "BOOLEAN" + }, + { + "rule_type_name": "AllStatistics", + "is_monitor_only": true, + "description": "Monitor Only. 
Produces a collection of statistics.", + "parameters": [ + { + "type": "String", + "name": "TargetColumn", + "description": "Name of the column to analyze" + } + ], + "return_type": "NUMBER" } ] } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 59f33ea..4158248 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -33,17 +33,21 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; + private final boolean isMonitorOnly; public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "description") String description, @JsonProperty(value = "parameters") List parameters, @JsonProperty(value = "return_type") String returnType, - @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported) { + // boolean defaults to false if not present + @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, + @JsonProperty(value = "is_monitor_only") boolean isMonitorOnly) { this.ruleTypeName = ruleTypeName; this.description = description; this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; + this.isMonitorOnly = isMonitorOnly; if (parameters.isEmpty()) { return; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 1000aa1..2dbc9ac 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -213,6 +213,10 @@ private Either getDQRule( DQRuleType dqRuleType = optionalDQRuleType.get(); + if (dqRuleType.isMonitorOnly()) { + return 
Either.fromLeft(String.format("Type %s is not supported in rules section", ruleType)); + } + Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); if (errorMessage.isPresent()) { diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java index 37d733c..a31c997 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java @@ -82,7 +82,8 @@ private static Stream provideRawMonitors() { Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\""), Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\""), Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\""), - Arguments.of("CustomSql \"select count(*) from primary\"") + Arguments.of("CustomSql \"select count(*) from primary\""), + Arguments.of("AllStatistics \"id\"") ); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index b242e3a..0ed3164 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -70,7 +70,9 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), - Arguments.of("Rules = [ DetectAnomalies ]") + Arguments.of("Rules = [ DetectAnomalies ]"), + Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), + Arguments.of("Rules = [ AllStatistics \"id\" ]") ); } From 4ecbdd8aaa633f7f96c41c568e78d5f9fe95384c Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Mon, 16 Oct 2023 18:37:35 -0400 Subject: [PATCH 05/50] Renamed Monitors to 
Analyzers, as per feedback from Product. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 12 ++--- configuration/rules/rules-config.json | 4 +- .../model/{DQMonitor.java => DQAnalyzer.java} | 4 +- .../ml/dataquality/dqdl/model/DQRuleType.java | 6 +-- .../ml/dataquality/dqdl/model/DQRuleset.java | 18 ++++---- .../dqdl/parser/DQDLParserListener.java | 46 +++++++++---------- ...DQMonitorTest.java => DQAnalyzerTest.java} | 32 ++++++------- .../dataquality/dqdl/model/DQRulesetTest.java | 10 ++-- .../dqdl/parser/InvalidDQRulesetTest.java | 18 ++++---- 9 files changed, 75 insertions(+), 75 deletions(-) rename src/com/amazonaws/glue/ml/dataquality/dqdl/model/{DQMonitor.java => DQAnalyzer.java} (94%) rename tst/com/amazonaws/glue/ml/dataquality/dqdl/model/{DQMonitorTest.java => DQAnalyzerTest.java} (76%) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 265378c..d84f927 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -5,7 +5,7 @@ import CommonLexerRules; metadataSectionStart: 'Metadata'; dataSourcesSectionStart: 'DataSources'; rulesSectionStart: 'Rules'; -monitorsSectionStart: 'Monitors'; +analyzersSectionStart: 'Analyzers'; // Expressions dateNow: 'now()'; @@ -85,7 +85,7 @@ durationBasedCondition: | IN durationExpressionArray; ruleType: IDENTIFIER; -monitorType: IDENTIFIER; +analyzerType: IDENTIFIER; parameter: (QUOTED_STRING | INT | DIGIT); condition: @@ -97,7 +97,7 @@ condition: withThresholdCondition: 'with' 'threshold' numberBasedCondition; dqRule: ruleType parameter* condition? 
withThresholdCondition?; -dqMonitor: monitorType parameter*; +dqAnalyzer: analyzerType parameter*; topLevelRule: dqRule @@ -106,14 +106,14 @@ topLevelRule: // Rules Definition dqRules: topLevelRule (COMMA topLevelRule)*; -dqMonitors: dqMonitor (COMMA dqMonitor)*; +dqAnalyzers: dqAnalyzer (COMMA dqAnalyzer)*; // Top Level Document rules: rulesSectionStart EQUAL_TO LBRAC dqRules RBRAC | rulesSectionStart EQUAL_TO LBRAC RBRAC; // empty array -monitors: monitorsSectionStart EQUAL_TO LBRAC dqMonitors RBRAC; +analyzers: analyzersSectionStart EQUAL_TO LBRAC dqAnalyzers RBRAC; // This dictionary does not support nested dictionaries. Just strings and arrays. dictionary: LCURL pair (COMMA pair)* RCURL; @@ -124,4 +124,4 @@ array: LBRAC QUOTED_STRING (COMMA QUOTED_STRING)* RBRAC; metadata: metadataSectionStart EQUAL_TO dictionary; dataSources: dataSourcesSectionStart EQUAL_TO dictionary; -document: metadata? dataSources? rules monitors?; +document: metadata? dataSources? rules analyzers?; diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index f53000c..15b3877 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -361,8 +361,8 @@ }, { "rule_type_name": "AllStatistics", - "is_monitor_only": true, - "description": "Monitor Only. Produces a collection of statistics.", + "is_analyzer_only": true, + "description": "Analyzer Only. 
Produces a collection of statistics.", "parameters": [ { "type": "String", diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java similarity index 94% rename from src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java rename to src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java index 7c877a9..d85ad69 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitor.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java @@ -1,5 +1,5 @@ /* - * DQMonitor.java + * DQAnalyzer.java * * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. * @@ -17,7 +17,7 @@ @AllArgsConstructor @Getter -public class DQMonitor { +public class DQAnalyzer { private final String ruleType; private final Map parameters; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 4158248..8018d86 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -33,7 +33,7 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; - private final boolean isMonitorOnly; + private final boolean isAnalyzerOnly; public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "description") String description, @@ -41,13 +41,13 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, - @JsonProperty(value = "is_monitor_only") boolean isMonitorOnly) { + @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly) { this.ruleTypeName = ruleTypeName; 
this.description = description; this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; - this.isMonitorOnly = isMonitorOnly; + this.isAnalyzerOnly = isAnalyzerOnly; if (parameters.isEmpty()) { return; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java index 6fb8b0b..d27613a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java @@ -30,7 +30,7 @@ public class DQRuleset { private final String primarySourceName; private final List additionalDataSourcesNames; private final List rules; - private final List monitors; + private final List analyzers; private static final String LINE_SEP = System.lineSeparator(); @@ -38,12 +38,12 @@ public DQRuleset(final List rules) { this(rules, new ArrayList<>()); } - public DQRuleset(final List rules, final List monitors) { + public DQRuleset(final List rules, final List analyzers) { this.metadata = new HashMap<>(); this.primarySourceName = null; this.additionalDataSourcesNames = new ArrayList<>(); this.rules = rules; - this.monitors = monitors; + this.analyzers = analyzers; } @Override @@ -87,10 +87,10 @@ public String toString() { .collect(Collectors.joining("," + LINE_SEP)) + LINE_SEP + "]"; - String monitorsStr = ""; - if (!monitors.isEmpty()) { - monitorsStr = "Monitors = [" + LINE_SEP + - monitors.stream() + String analyzersStr = ""; + if (!analyzers.isEmpty()) { + analyzersStr = "Analyzers = [" + LINE_SEP + + analyzers.stream() .map(i -> " " + i) .collect(Collectors.joining("," + LINE_SEP)) + LINE_SEP + "]"; @@ -107,8 +107,8 @@ public String toString() { sb.append(rulesStr); - if (!monitorsStr.isEmpty()) { - sb.append(LINE_SEP).append(LINE_SEP).append(monitorsStr); + if (!analyzersStr.isEmpty()) { + sb.append(LINE_SEP).append(LINE_SEP).append(analyzersStr); } return sb.toString(); diff --git 
a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 2dbc9ac..bf54279 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -10,7 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.parser; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQMonitor; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQAnalyzer; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; @@ -54,7 +54,7 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private String primarySource; private List additionalSources; private final List dqRules = new ArrayList<>(); - private final List dqMonitors = new ArrayList<>(); + private final List dqAnalyzers = new ArrayList<>(); private static final String METADATA_VERSION_KEY = "Version"; private static final Set ALLOWED_METADATA_KEYS; @@ -78,7 +78,7 @@ public DQDLParserListener(DQDLErrorListener errorListener) { public Either, DQRuleset> getParsedRuleset() { if (errorMessages.isEmpty() && errorListener.getErrorMessages().isEmpty()) { - return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqMonitors)); + return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqAnalyzers)); } else { List allErrorMessages = new ArrayList<>(); allErrorMessages.addAll(errorMessages); @@ -182,18 +182,18 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu } @Override - public void enterDqMonitors(DataQualityDefinitionLanguageParser.DqMonitorsContext dqMonitorsContext) { + public void enterDqAnalyzers(DataQualityDefinitionLanguageParser.DqAnalyzersContext 
dqAnalyzersContext) { if (!errorMessages.isEmpty()) { return; } - for (DataQualityDefinitionLanguageParser.DqMonitorContext dmc: dqMonitorsContext.dqMonitor()) { - Either dqMonitorEither = getDQMonitor(dmc); - if (dqMonitorEither.isLeft()) { - errorMessages.add(dqMonitorEither.getLeft()); + for (DataQualityDefinitionLanguageParser.DqAnalyzerContext dac: dqAnalyzersContext.dqAnalyzer()) { + Either dqAnalyzerEither = getDQAnalyzer(dac); + if (dqAnalyzerEither.isLeft()) { + errorMessages.add(dqAnalyzerEither.getLeft()); return; } else { - dqMonitors.add(dqMonitorEither.getRight()); + dqAnalyzers.add(dqAnalyzerEither.getRight()); } } } @@ -213,8 +213,8 @@ private Either getDQRule( DQRuleType dqRuleType = optionalDQRuleType.get(); - if (dqRuleType.isMonitorOnly()) { - return Either.fromLeft(String.format("Type %s is not supported in rules section", ruleType)); + if (dqRuleType.isAnalyzerOnly()) { + return Either.fromLeft(String.format("Analyzer Type: %s is not supported in rules section", ruleType)); } Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); @@ -280,36 +280,36 @@ private Either getDQRule( ); } - private Either getDQMonitor( - DataQualityDefinitionLanguageParser.DqMonitorContext dqMonitorContext) { - String monitorType = dqMonitorContext.monitorType().getText(); - List parameters = dqMonitorContext.parameter().stream() + private Either getDQAnalyzer( + DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { + String analyzerType = dqAnalyzerContext.analyzerType().getText(); + List parameters = dqAnalyzerContext.parameter().stream() .map(p -> p.getText().replaceAll("\"", "")) .collect(Collectors.toList()); - // We just use the DQ Rule names to valid what monitor names to allow. + // We just use the DQ Rule names to validate what analyzer names to allow. // This might change closer to re:Invent, but keeping it simple for now. 
- Optional optionalDQMonitorType = DQRuleType.getRuleType(monitorType, parameters.size()); + Optional optionalDQAnalyzerType = DQRuleType.getRuleType(analyzerType, parameters.size()); - if (!optionalDQMonitorType.isPresent()) { - return Either.fromLeft(String.format("Monitor Type: %s is not valid", monitorType)); + if (!optionalDQAnalyzerType.isPresent()) { + return Either.fromLeft(String.format("Analyzer Type: %s is not valid", analyzerType)); } - DQRuleType dqRuleType = optionalDQMonitorType.get(); + DQRuleType dqRuleType = optionalDQAnalyzerType.get(); if (dqRuleType.getReturnType().equals("BOOLEAN")) { - return Either.fromLeft(String.format("Monitor Type: %s is not supported", monitorType)); + return Either.fromLeft(String.format("Analyzer Type: %s is not supported", analyzerType)); } Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); if (errorMessage.isPresent()) { - return Either.fromLeft(String.format(errorMessage.get() + ": %s", monitorType)); + return Either.fromLeft(String.format(errorMessage.get() + ": %s", analyzerType)); } Map parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); - return Either.fromRight(new DQMonitor(monitorType, parameterMap)); + return Either.fromRight(new DQAnalyzer(analyzerType, parameterMap)); } private Either parseCondition( diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java similarity index 76% rename from tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java rename to tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java index a31c997..174144a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQMonitorTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java @@ -1,5 +1,5 @@ /* - * DQMonitorTest.java + * DQAnalyzerTest.java * * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. * @@ -23,42 +23,42 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -public class DQMonitorTest { +public class DQAnalyzerTest { DQDLParser parser = new DQDLParser(); @Test - void test_singleMonitor() { + void test_singleAnalyzer() { String column = "colA"; - String ruleset = String.format("Rules = [ IsComplete \"%s\" ] Monitors = [ Completeness \"%s\" ]", column, column); + String ruleset = String.format("Rules = [ IsComplete \"%s\" ] Analyzers = [ Completeness \"%s\" ]", column, column); try { DQRuleset dqRuleset = parser.parse(ruleset); - DQMonitor dqMonitor = dqRuleset.getMonitors().get(0); - assertEquals("Completeness", dqMonitor.getRuleType()); - assertEquals(1, dqMonitor.getParameters().size()); - assertTrue(dqMonitor.getParameters().containsValue(column)); + DQAnalyzer dqAnalyzer = dqRuleset.getAnalyzers().get(0); + assertEquals("Completeness", dqAnalyzer.getRuleType()); + assertEquals(1, dqAnalyzer.getParameters().size()); + assertTrue(dqAnalyzer.getParameters().containsValue(column)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } } @ParameterizedTest - @MethodSource("provideRawMonitors") - void test_monitorParsingAndGeneratingWithParser(String monitor) { + @MethodSource("provideRawAnalyzers") + void test_analyzerParsingAndGeneratingWithParser(String analyzer) { try { - DQRuleset dqRuleset = parser.parse(String.format("Rules = [ IsComplete \"colA\" ] Monitors = [ %s ]", monitor)); + DQRuleset dqRuleset = parser.parse(String.format("Rules = [ IsComplete \"colA\" ] Analyzers = [ %s ]", analyzer)); assertEquals(1, dqRuleset.getRules().size()); - assertEquals(1, dqRuleset.getMonitors().size()); + assertEquals(1, dqRuleset.getAnalyzers().size()); - DQMonitor dqMonitor = dqRuleset.getMonitors().get(0); - String dqMonitorAsString = dqMonitor.toString(); - assertEquals(monitor, dqMonitorAsString); + DQAnalyzer dqAnalyzer = 
dqRuleset.getAnalyzers().get(0); + String dqAnalyzerAsString = dqAnalyzer.toString(); + assertEquals(analyzer, dqAnalyzerAsString); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } } - private static Stream provideRawMonitors() { + private static Stream provideRawAnalyzers() { return Stream.of( Arguments.of("RowCount"), Arguments.of("RowCountMatch \"reference\""), diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 8124ad1..0380b1a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -182,11 +182,11 @@ void test_isPrimaryCheckWithMetadataAndSourcesAndNoPrimarySourceToString() { } @Test - void test_isPrimaryCheckWithMetadataAndSourcesAndMonitors() { + void test_isPrimaryCheckWithMetadataAndSourcesAndAnalyzers() { String dqdl = "Metadata = { \"Version\": \"1.0\" }" + LINE_SEP + "DataSources = { \"Primary\": \"orders-table\", \"AdditionalDataSources\": [ \"ref-table\" ] } " + LINE_SEP + "Rules = [ IsPrimaryKey \"colA\" ] " + LINE_SEP + - "Monitors = [ Completeness \"colA\" ]"; + "Analyzers = [ Completeness \"colA\" ]"; DQRuleset dqRuleset = parseDQDL(dqdl); assertEquals("orders-table", dqRuleset.getPrimarySourceName()); @@ -194,8 +194,8 @@ void test_isPrimaryCheckWithMetadataAndSourcesAndMonitors() { assertEquals("ref-table", dqRuleset.getAdditionalDataSourcesNames().get(0)); assertEquals(1, dqRuleset.getRules().size()); assertEquals("IsPrimaryKey", dqRuleset.getRules().get(0).getRuleType()); - assertEquals(1, dqRuleset.getMonitors().size()); - assertEquals("Completeness", dqRuleset.getMonitors().get(0).getRuleType()); + assertEquals(1, dqRuleset.getAnalyzers().size()); + assertEquals("Completeness", dqRuleset.getAnalyzers().get(0).getRuleType()); String dqdlFormatted = "Metadata = {" + LINE_SEP + @@ -208,7 +208,7 @@ void 
test_isPrimaryCheckWithMetadataAndSourcesAndMonitors() { "Rules = [" + LINE_SEP + " IsPrimaryKey \"colA\"" + LINE_SEP + "]" + LINE_SEP + LINE_SEP + - "Monitors = [" + LINE_SEP + + "Analyzers = [" + LINE_SEP + " Completeness \"colA\"" + LINE_SEP + "]"; assertEquals(dqdlFormatted, dqRuleset.toString()); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 0ed3164..9288034 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -76,14 +76,14 @@ private static Stream provideInvalidRulesets() { ); } - private static Stream provideInvalidRulesetsWithMonitors() { + private static Stream provideInvalidRulesetsWithAnalyzers() { return Stream.of( - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ IsComplete \"colA\" ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", Foo ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\" > 1.0 ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Monitors = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ IsComplete \"colA\" ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Foo ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\" > 1.0 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") ); 
} @@ -99,8 +99,8 @@ void test_invalidRulesetParsing(String ruleset) { } @ParameterizedTest - @MethodSource("provideInvalidRulesetsWithMonitors") - void test_invalidRulesetWithMonitorsParsing(String ruleset) { + @MethodSource("provideInvalidRulesetsWithAnalyzers") + void test_invalidRulesetWithAnalyzersParsing(String ruleset) { try { parser.parse(ruleset); fail("Ruleset validation exception was expected"); From 398288ee26492f4c3f88ae9dd52b63a3936c3c52 Mon Sep 17 00:00:00 2001 From: Tyler McDaniel Date: Thu, 19 Oct 2023 10:19:32 -0400 Subject: [PATCH 06/50] Move responsibility for numeric operand resolution out of package. --- .../condition/number/AtomicNumberOperand.java | 7 --- .../number/BinaryExpressionOperand.java | 21 +------ .../condition/number/FunctionCallOperand.java | 10 ---- .../number/NumberBasedCondition.java | 3 +- .../condition/number/NumericOperand.java | 3 - .../condition/number/OperandEvaluator.java | 8 ++- .../ml/dataquality/dqdl/model/DQRuleTest.java | 5 +- .../dqdl/model/condition/ConditionTest.java | 5 +- .../condition/number/NumericOperandTest.java | 58 ++++++++++++++----- 9 files changed, 57 insertions(+), 63 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java index a433ed0..9945b3b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java @@ -10,8 +10,6 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; - /* * Atomic number operands are decimal numbers like 1.0, 3.14 etc that can be used in number based conditions. * They are used for defining static thresholds on rules. 
@@ -21,11 +19,6 @@ public AtomicNumberOperand(final String operand) { super(operand); } - @Override - public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { - return Double.parseDouble(getOperand()); - } - @Override public String toString() { if (this.isParenthesized()) { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java index 7a5a470..12d1ed4 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java @@ -10,7 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import lombok.Getter; /* * A BinaryExpressionOperand is a numerical expression that consists of two operands and an operator. @@ -18,6 +18,7 @@ * The operator can be one of: +, -, /, * * The purpose of this operand is for combining with a dynamic function call operand to create dynamic rule thresholds. 
*/ +@Getter public class BinaryExpressionOperand extends NumericOperand { private final String operator; private final NumericOperand operand1; @@ -34,24 +35,6 @@ public BinaryExpressionOperand(final String operand, this.operand2 = operand2; } - public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { - Double operand1Evaluated = operand1.evaluate(dqRule, evaluator); - Double operand2Evaluated = operand2.evaluate(dqRule, evaluator); - - switch (operator) { - case "+": - return operand1Evaluated + operand2Evaluated; - case "-": - return operand1Evaluated - operand2Evaluated; - case "/": - return operand1Evaluated / operand2Evaluated; - case "*": - return operand1Evaluated * operand2Evaluated; - default: - throw new IllegalArgumentException("Bad operator"); - } - } - @Override public String toString() { String formatted = String.format("%s %s %s", diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java index dae99f9..5dccf68 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java @@ -10,7 +10,6 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import lombok.Getter; import java.util.List; @@ -35,15 +34,6 @@ public FunctionCallOperand(final String operand, this.operands = operands; } - @Override - public Double evaluate(DQRule dqRule, OperandEvaluator evaluator) { - return evaluator.evaluate( - dqRule, - this.functionName, - this.operands.stream().map(o -> o.evaluate(dqRule, evaluator)).collect(Collectors.toList()) - ); - } - @Override public String toString() { String params = this.operands.stream().map(NumericOperand::toString).collect(Collectors.joining(",")); diff --git 
a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index a3eb9b1..d403f1c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -37,8 +37,7 @@ public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator if (operands == null) return false; List operandsAsDouble = operands.stream() - .map(operand -> operand.evaluate(dqRule, evaluator)) - .collect(Collectors.toList()); + .map(operand -> evaluator.evaluate(dqRule, operand)).collect(Collectors.toList()); switch (operator) { case BETWEEN: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java index 2d3f3f2..6fb74ba 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java @@ -10,7 +10,6 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import lombok.AllArgsConstructor; import lombok.Getter; @@ -26,6 +25,4 @@ public NumericOperand(final String operand) { this.operand = operand; isParenthesized = false; } - - public abstract Double evaluate(DQRule dqRule, OperandEvaluator evaluator); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java index 9ed19c8..be2449a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java +++ 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java @@ -13,8 +13,12 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import java.io.Serializable; -import java.util.List; +/** + * Class encapsulates implementation logic for resolving NumericOperand to a number (double). + */ public abstract class OperandEvaluator implements Serializable { - public abstract Double evaluate(DQRule rule, String functionName, List operands); + + // resolve operand to number + public abstract Double evaluate(DQRule rule, NumericOperand operand); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index cd38d8c..e7ef21b 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -36,6 +36,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -258,12 +259,12 @@ void test_serializationDeserializationWithNumericExpression() assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); - assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4, dqRule, null)); + assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4, dqRule, testEvaluator)); byte[] serialized = serialize(dqRule); DQRule deserialized = deserialize(serialized, DQRule.class); assertEquals(dqRule.toString(), deserialized.toString()); assertEquals(NumberBasedCondition.class, deserialized.getCondition().getClass()); - assertFalse(((NumberBasedCondition) 
deserialized.getCondition()).evaluate(0.9, dqRule, null)); + assertFalse(((NumberBasedCondition) deserialized.getCondition()).evaluate(0.9, dqRule, testEvaluator)); } @Test diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java index 1ffd830..56a4c3e 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java @@ -23,6 +23,7 @@ import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -117,7 +118,7 @@ void test_ruleParsingAndVerifyingNumberBasedCondition(String rule, Double metric NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); assertTrue(dqRule.toString().contains(condition.getFormattedCondition())); - assertEquals(shouldRulePass, condition.evaluate(metric, dqRule, null)); + assertEquals(shouldRulePass, condition.evaluate(metric, dqRule, testEvaluator)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } @@ -136,7 +137,7 @@ void test_ruleParsingAndVerifyingNumberBasedThresholdCondition(String rule, Doub NumberBasedCondition thresholdCondition = (NumberBasedCondition) dqRule.getThresholdCondition(); assertTrue(dqRule.toString().contains(thresholdCondition.getFormattedCondition())); - assertEquals(shouldRulePass, thresholdCondition.evaluate(metric, dqRule, null)); + assertEquals(shouldRulePass, thresholdCondition.evaluate(metric, dqRule, testEvaluator)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } diff --git 
a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java index 08a1b71..846b9da 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java @@ -28,19 +28,45 @@ public class NumericOperandTest { private static final String AVG_FUNCTION_NAME = "avg"; private final DQDLParser parser = new DQDLParser(); - private final OperandEvaluator evaluator = new OperandEvaluator() { + public static final OperandEvaluator testEvaluator = new OperandEvaluator() { @Override - public Double evaluate(DQRule rule, String functionName, List operands) { - if (MULTIPLY_FUNCTION_NAME.equals(functionName)) { - return operands.stream().reduce(1.0, (a, b) -> a * b); - } else if (AVG_FUNCTION_NAME.equals(functionName)) { - return operands.stream().reduce(0.0, (a, b) -> a + b) / operands.size(); + public Double evaluate(DQRule rule, NumericOperand operand) { + if (operand instanceof AtomicNumberOperand) { + return Double.parseDouble(operand.getOperand()); + } + if (operand instanceof FunctionCallOperand) { + FunctionCallOperand op = (FunctionCallOperand) operand; + if (MULTIPLY_FUNCTION_NAME.equals(op.getFunctionName())) { + return op.getOperands().stream() + .map(ops -> testEvaluator.evaluate(rule, ops)).reduce(1.0, (a, b) -> a * b); + } else if (AVG_FUNCTION_NAME.equals(op.getFunctionName())) { + return op.getOperands().stream() + .map(ops -> testEvaluator.evaluate(rule, ops)).reduce(0.0, (a, b) -> a + b / op.getOperands().size()); + } else { + throw new RuntimeException("Function not supported"); + } + } + if (operand instanceof BinaryExpressionOperand) { + BinaryExpressionOperand op = (BinaryExpressionOperand) operand; + Double op1 = testEvaluator.evaluate(rule, op.getOperand1()); + Double op2 = testEvaluator.evaluate(rule, 
op.getOperand2()); + switch (op.getOperator()) { + case "+": + return op1 + op2; + case "-": + return op1 - op2; + case "/": + return op1 / op2; + case "*": + return op1 * op2; + default: + throw new IllegalArgumentException("Bad operator"); + } } else { - throw new RuntimeException("Function not supported"); + throw new RuntimeException("Type not supported"); } } }; - @Test public void test_functionCallWorksWithAtomicNumberOperands() throws InvalidDataQualityRulesetException { String rule = "RowCount = multiply(1,2,3)"; @@ -53,8 +79,8 @@ public void test_functionCallWorksWithAtomicNumberOperands() throws InvalidDataQ assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); - assertTrue(condition.evaluate(6.0, dqRule, evaluator)); - assertFalse(condition.evaluate(3.0, dqRule, evaluator)); + assertTrue(condition.evaluate(6.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(3.0, dqRule, testEvaluator)); } @Test @@ -70,8 +96,8 @@ public void test_functionCallWorksWithNestedFunctionCallOperands() throws Invali assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); - assertTrue(condition.evaluate(45.0, dqRule, evaluator)); - assertFalse(condition.evaluate(40.0, dqRule, evaluator)); + assertTrue(condition.evaluate(45.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(40.0, dqRule, testEvaluator)); } @Test @@ -87,8 +113,8 @@ public void test_functionCallWorksInBinaryExpression() throws InvalidDataQuality assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); - assertTrue(condition.evaluate(90.0, dqRule, evaluator)); - assertFalse(condition.evaluate(45.0, dqRule, evaluator)); + assertTrue(condition.evaluate(90.0, dqRule, testEvaluator)); + 
assertFalse(condition.evaluate(45.0, dqRule, testEvaluator)); } @Test @@ -104,7 +130,7 @@ public void test_functionCallWorksInSimpleBinaryExpression() throws InvalidDataQ assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); - assertTrue(condition.evaluate(16.0, dqRule, evaluator)); - assertFalse(condition.evaluate(8.0, dqRule, evaluator)); + assertTrue(condition.evaluate(16.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(8.0, dqRule, testEvaluator)); } } From 10cdea33dfd50f1684130e92f9e18d2e59f1d9b1 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Mon, 23 Oct 2023 14:42:08 -0400 Subject: [PATCH 07/50] Add new interface for getting rule type and parameters from both analyzers and rules --- .../ml/dataquality/dqdl/model/DQAnalyzer.java | 2 +- .../ml/dataquality/dqdl/model/DQRule.java | 2 +- .../dqdl/model/HasRuleTypeAndParameters.java | 20 +++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java index d85ad69..66b8e84 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java @@ -17,7 +17,7 @@ @AllArgsConstructor @Getter -public class DQAnalyzer { +public class DQAnalyzer implements HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 57e35ae..6963786 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -25,7 +25,7 @@ @AllArgsConstructor @Getter 
@EqualsAndHashCode -public class DQRule implements Serializable { +public class DQRule implements Serializable, HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; private final Condition condition; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java new file mode 100644 index 0000000..36d6bfc --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java @@ -0,0 +1,20 @@ +/* + * HasRuleTypeAndParameters.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import java.util.Map; + +public interface HasRuleTypeAndParameters { + + String getRuleType(); + + Map getParameters(); +} From 5c32c6c6fb1a57cba02fc01f32efbc8b1f94a5f2 Mon Sep 17 00:00:00 2001 From: Jesus Max Hernandez Date: Wed, 25 Oct 2023 12:35:12 -0400 Subject: [PATCH 08/50] Add scope to DQRuleType --- configuration/rules/rules-config.json | 87 ++++++++++++------- .../ml/dataquality/dqdl/model/DQRuleType.java | 6 +- .../dqdl/model/DeserializationTest.java | 26 ++++++ 3 files changed, 89 insertions(+), 30 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 15b3877..ea82ccb 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -4,7 +4,8 @@ "rule_type_name": "RowCount", "description": "Check the number of rows in the dataset", "parameters": [], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "RowCountMatch", @@ -16,13 +17,15 @@ "description": "Alias of reference dataset" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "ColumnCount", "description": 
"Checks the number of columns in the dataset", "parameters": [], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "Completeness", @@ -34,7 +37,8 @@ "description": "Name of column to check completeness of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "IsComplete", @@ -46,7 +50,8 @@ "description": "Name of column to check completeness of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "ColumnDataType", @@ -59,7 +64,8 @@ } ], "return_type": "STRING", - "is_threshold_supported": true + "is_threshold_supported": true, + "scope": "column" }, { "rule_type_name": "ColumnNamesMatchPattern", @@ -71,7 +77,8 @@ "description": "Pattern to match against the names of the columns" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "table" }, { "rule_type_name": "ColumnExists", @@ -83,7 +90,8 @@ "description": "Name of column to check existence of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "ColumnCorrelation", @@ -100,7 +108,8 @@ "description": "Name of second column" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "Uniqueness", @@ -112,7 +121,8 @@ "description": "Name of column to check uniqueness of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "IsUnique", @@ -124,7 +134,8 @@ "description": "Name of column to check uniqueness of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "Mean", @@ -136,7 +147,8 @@ "description": "Name of column to check mean of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "Sum", @@ -148,7 +160,8 @@ "description": "Name of column to check sum of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + 
"scope": "column" }, { "rule_type_name": "StandardDeviation", @@ -160,7 +173,8 @@ "description": "Name of column to check standard deviation of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "Entropy", @@ -172,7 +186,8 @@ "description": "Name of column to check entropy of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "DistinctValuesCount", @@ -184,7 +199,8 @@ "description": "Name of column to check distinct values count of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "UniqueValueRatio", @@ -196,7 +212,8 @@ "description": "Name of column to check unique value ratio of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "ColumnLength", @@ -208,7 +225,8 @@ "description": "Name of column to check the length of the values of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "IsPrimaryKey", @@ -221,7 +239,8 @@ "is_var_arg": true } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "ColumnValues", @@ -234,7 +253,8 @@ } ], "return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY", - "is_threshold_supported": true + "is_threshold_supported": true, + "scope": "column" }, { "rule_type_name": "DataFreshness", @@ -246,7 +266,8 @@ "description": "Name of column to check the freshness of" } ], - "return_type": "DURATION_ARRAY" + "return_type": "DURATION_ARRAY", + "scope": "column" }, { "rule_type_name": "CustomSql", @@ -259,7 +280,8 @@ } ], "return_type": "NUMBER|BOOLEAN", - "is_threshold_supported": true + "is_threshold_supported": true, + "scope": "table" }, { "rule_type_name": "ReferentialIntegrity", @@ -276,7 +298,8 @@ "description": "Alias of reference dataset and comma separated names of columns from reference dataset. 
The alias and the names should be separated by a period. The names should be enclosed in curly brackets." } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "DatasetMatch", @@ -293,7 +316,8 @@ "description": "Mappings of key columns used for joining the two datasets" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "DatasetMatch", @@ -315,7 +339,8 @@ "description": "Mappings of columns used for matching" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "SchemaMatch", @@ -327,7 +352,8 @@ "description": "Alias of reference dataset" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "AggregateMatch", @@ -344,7 +370,8 @@ "description": "The second aggregate expression" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "DetectAnomalies", @@ -357,7 +384,8 @@ "is_var_arg": true } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "AllStatistics", @@ -370,7 +398,8 @@ "description": "Name of the column to analyze" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" } ] } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 8018d86..e6ef704 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -35,19 +35,23 @@ public class DQRuleType { private final boolean isThresholdSupported; private final boolean isAnalyzerOnly; + private final String scope; + public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "description") String description, @JsonProperty(value = "parameters") List parameters, @JsonProperty(value = 
"return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, - @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly) { + @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, + @JsonProperty(value = "scope") String scope) { this.ruleTypeName = ruleTypeName; this.description = description; this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; this.isAnalyzerOnly = isAnalyzerOnly; + this.scope = scope; if (parameters.isEmpty()) { return; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java index d92079e..081a29a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java @@ -179,4 +179,30 @@ public void test_parseDQRuleTypeWithMultipleParametersAndIncorrectVarArgParamete assertEquals(IllegalArgumentException.class, thrown.getCause().getClass()); assertTrue(thrown.getMessage().contains("Property isVarArg can only be set to true on last element in parameters list")); } + + @Test + public void test_parseDQRuleTypeScope() throws JsonProcessingException { + String ruleTypeName = "ColumnCount"; + String ruleTypeDesc = "This rule checks the column count"; + String returnType = "NUMBER"; + String scope = "table"; + + String json = String.format( + "{" + + "\"rule_type_name\":\"%s\"," + + "\"description\":\"%s\"," + + "\"parameters\": [ ]," + + "\"return_type\": \"%s\"," + + "\"scope\": \"%s\"" + + "}", + ruleTypeName, ruleTypeDesc, returnType, scope); + + DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); + + assertEquals(ruleTypeName, ruleType.getRuleTypeName()); + assertEquals(ruleTypeDesc, ruleType.getDescription()); + assertEquals(returnType, 
ruleType.getReturnType()); + assertEquals(scope, ruleType.getScope()); + assertTrue(ruleType.getParameters().isEmpty()); + } } From 062cdb4983b2442348ffd52af6306c6a9dc96190 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Mon, 30 Oct 2023 12:48:02 -0400 Subject: [PATCH 09/50] Updated the parser based on the updated parameter grammar - Double quotes are now optional in the parameter. We check for the sub rules to identify if quotes are provided. If they are, we strip them out before storing them in DQRule. - Added a test to verify that parameters without double quotes are parsed correctly. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 3 +- .../ml/dataquality/dqdl/model/DQRuleType.java | 7 ++-- .../dqdl/parser/DQDLParserListener.java | 14 ++++++-- .../ml/dataquality/dqdl/model/DQRuleTest.java | 32 +++++++++++++++++++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index d84f927..7fc4959 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -86,7 +86,8 @@ durationBasedCondition: ruleType: IDENTIFIER; analyzerType: IDENTIFIER; -parameter: (QUOTED_STRING | INT | DIGIT); +parameter: QUOTED_STRING + | IDENTIFIER; condition: numberBasedCondition diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index e6ef704..0196984 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -34,9 +34,10 @@ public class DQRuleType { private final String returnType; private final boolean isThresholdSupported; private final boolean isAnalyzerOnly; - private final String scope; + private final boolean isExperimental; + @SuppressWarnings("checkstyle:parameternumber") public DQRuleType(@JsonProperty(value = 
"rule_type_name") String ruleTypeName, @JsonProperty(value = "description") String description, @JsonProperty(value = "parameters") List parameters, @@ -44,7 +45,8 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, - @JsonProperty(value = "scope") String scope) { + @JsonProperty(value = "scope") String scope, + @JsonProperty(value = "experimental") boolean isExperimental) { this.ruleTypeName = ruleTypeName; this.description = description; this.parameters = parameters; @@ -52,6 +54,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.isThresholdSupported = isThresholdSupported; this.isAnalyzerOnly = isAnalyzerOnly; this.scope = scope; + this.isExperimental = isExperimental; if (parameters.isEmpty()) { return; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index bf54279..f477ec3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -202,7 +202,7 @@ private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); List parameters = dqRuleContext.parameter().stream() - .map(p -> p.getText().replaceAll("\"", "")) + .map(this::parseParameter) .collect(Collectors.toList()); Optional optionalDQRuleType = DQRuleType.getRuleType(ruleType, parameters.size()); @@ -284,7 +284,7 @@ private Either getDQAnalyzer( DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { String analyzerType = dqAnalyzerContext.analyzerType().getText(); List parameters = dqAnalyzerContext.parameter().stream() - .map(p -> 
p.getText().replaceAll("\"", "")) + .map(this::parseParameter) .collect(Collectors.toList()); // We just use the DQ Rule names to validate what analyzer names to allow. @@ -723,4 +723,14 @@ private String removeEscapes(String stringWithEscapes) { stringWithEscapes = stringWithEscapes.replaceAll("\\\\(.)", "$1"); return stringWithEscapes; } + + private String parseParameter(DataQualityDefinitionLanguageParser.ParameterContext pc) { + if (pc.QUOTED_STRING() != null) { + return removeQuotes(pc.QUOTED_STRING().getText()); + } else if (pc.IDENTIFIER() != null) { + return pc.IDENTIFIER().getText(); + } else { + return pc.getText(); + } + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index e7ef21b..ef01836 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -277,6 +277,38 @@ void test_compositeRulesAreReparseable() throws InvalidDataQualityRulesetExcepti assertEquals(reStringed, rulesetString); } + @Test + void test_parametersWithoutQuotesAreParsed() throws InvalidDataQualityRulesetException { + String colA = "colA"; + String colB = "col\\\"B"; + String colC = "col C"; + + String allCols = "AllColumns"; + + String rule1 = String.format("IsPrimaryKey %s \"%s\" \"%s\"", colA, colB, colC); + String rule2 = String.format("ColumnValues %s between 1 and 10", colA); + + String analyzer1 = String.format("Completeness \"%s\"", colC); + String analyzer2 = String.format("AllStatistics %s", allCols); + + String ruleset = String.format( + "Rules = [ %s, %s ] Analyzers = [ %s, %s ]", rule1, rule2, analyzer1, analyzer2); + + DQRuleset dqRuleset = parser.parse(ruleset); + + DQRule parsedRule1 = dqRuleset.getRules().get(0); + DQRule parsedRule2 = dqRuleset.getRules().get(1); + + DQAnalyzer parsedAnalyzer1 = dqRuleset.getAnalyzers().get(0); + DQAnalyzer parsedAnalyzer2 = 
dqRuleset.getAnalyzers().get(1); + + assertTrue(Stream.of(colA, colB, colC).allMatch(c -> parsedRule1.getParameters().containsValue(c))); + assertTrue(Stream.of(colA).allMatch(c -> parsedRule2.getParameters().containsValue(c))); + + assertTrue(Stream.of(colC).allMatch(c -> parsedAnalyzer1.getParameters().containsValue(c))); + assertTrue(Stream.of(allCols).allMatch(c -> parsedAnalyzer2.getParameters().containsValue(c))); + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; From bf68783f698183df2eb83a9c745b2585dc962ca4 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 31 Oct 2023 13:46:03 -0400 Subject: [PATCH 10/50] Updated parser based on the new "rulesOrAnalyzers" parser rule - Rules can be empty, Analyzers can be empty, but both can't be empty at the same time. - Also added a guard against empty dictionaries for the Metadata/DataSources section. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 3 +- .../dqdl/parser/DQDLParserListener.java | 32 ++++++++++++++++--- .../dqdl/parser/InvalidDQRulesetTest.java | 7 ++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 7fc4959..8efe59d 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -124,5 +124,6 @@ array: LBRAC QUOTED_STRING (COMMA QUOTED_STRING)* RBRAC; metadata: metadataSectionStart EQUAL_TO dictionary; dataSources: dataSourcesSectionStart EQUAL_TO dictionary; +rulesOrAnalyzers: rules | analyzers | rules analyzers; -document: metadata? dataSources? rules analyzers?; +document: metadata? dataSources? 
rulesOrAnalyzers; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index f477ec3..875c1dc 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -90,8 +90,18 @@ public Either, DQRuleset> getParsedRuleset() { @Override public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ctx) { - for (DataQualityDefinitionLanguageParser.PairContext pairContext - : ctx.dictionary().pair()) { + // The logic below, just above the loop is a guard against an NPE caused by empty dictionaries. + // Need to investigate why dictionaryContext.pair() returns 1 element, + // which is an empty string, for an empty dictionary. + // We would not have this problem if dictionaryContext.pair() returned 0 entries in the list. + DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary(); + List dictionaryErrors = validateDictionary(dictionaryContext); + if (!dictionaryErrors.isEmpty()) { + errorMessages.addAll(dictionaryErrors); + return; + } + + for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) { String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText())); if (!ALLOWED_METADATA_KEYS.contains(key)) { errorMessages.add("Unsupported key provided in Metadata section"); @@ -112,8 +122,14 @@ public void enterRules(DataQualityDefinitionLanguageParser.RulesContext ctx) { @Override public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) { - for (DataQualityDefinitionLanguageParser.PairContext pairContext - : ctx.dictionary().pair()) { + DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary(); + List dictionaryErrors = validateDictionary(dictionaryContext); + if (!dictionaryErrors.isEmpty()) { + 
errorMessages.addAll(dictionaryErrors); + return; + } + + for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) { String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText())); if (!ALLOWED_SOURCES_KEYS.contains(key)) { @@ -733,4 +749,12 @@ private String parseParameter(DataQualityDefinitionLanguageParser.ParameterConte return pc.getText(); } } + + private List validateDictionary(DataQualityDefinitionLanguageParser.DictionaryContext dc) { + List dictionaryErrors = new ArrayList<>(); + if (dc.pair() == null || (dc.pair().size() == 1 && dc.pair().get(0).getText().isEmpty())) { + dictionaryErrors.add("Empty dictionary provided"); + } + return dictionaryErrors; + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 9288034..ef8036d 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -24,6 +24,12 @@ public class InvalidDQRulesetTest { private static Stream provideInvalidRulesets() { return Stream.of( + Arguments.of(""), + Arguments.of("Metadata = {}"), + Arguments.of("DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" }"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = { \"Primary\": \"Foo\" }"), Arguments.of("Rules = {"), Arguments.of("Rules = }"), Arguments.of("Rules = { }"), @@ -78,6 +84,7 @@ private static Stream provideInvalidRulesets() { private static Stream provideInvalidRulesetsWithAnalyzers() { return Stream.of( + Arguments.of("Rules = [ ] Analyzers = [ Completeness \"colA\" ]"), Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ ]"), Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ IsComplete \"colA\" ]"), Arguments.of("Rules = [ 
IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), From e68d80ca29789d0d253cc85a2a54f94c689bec5b Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Mon, 30 Oct 2023 18:05:15 -0400 Subject: [PATCH 11/50] Support for connector words in front of parameters - Parameters can be configured using words like "of" and "and". This will support the DQDL extension design for Hemingway. - Introduced a new model class called DQRuleParameterValue that will store whether the user provided connector words and double quotes in the parameter. This will be used when converting the rule back to a string. --- configuration/dqdl/CommonLexerRules.g4 | 1 + .../dqdl/DataQualityDefinitionLanguage.g4 | 5 +- .../ml/dataquality/dqdl/model/DQRule.java | 66 ++++++++++++++- .../dqdl/model/DQRuleParameterValue.java | 53 ++++++++++++ .../dqdl/parser/DQDLParserListener.java | 20 ++--- .../dqdl/model/DQRuleParameterValueTest.java | 80 +++++++++++++++++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 50 ++++++++++++ 7 files changed, 262 insertions(+), 13 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java create mode 100644 tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index d1890a6..617783e 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -12,6 +12,7 @@ LPAREN: '('; RPAREN: ')'; AND: 'and' | 'AND'; OR: 'or' | 'OR'; +OF: 'of' | 'OF'; BETWEEN: 'between'; EQUAL_TO: '='; diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 8efe59d..0d6cdb2 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -88,6 +88,7 @@ ruleType: IDENTIFIER; analyzerType: IDENTIFIER; parameter: QUOTED_STRING | IDENTIFIER; +parameters: OF? parameter (AND?
parameter)*; condition: numberBasedCondition @@ -97,8 +98,8 @@ condition: withThresholdCondition: 'with' 'threshold' numberBasedCondition; -dqRule: ruleType parameter* condition? withThresholdCondition?; -dqAnalyzer: analyzerType parameter*; +dqRule: ruleType parameters? condition? withThresholdCondition?; +dqAnalyzer: analyzerType parameters?; topLevelRule: dqRule diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 6963786..c95178c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -17,6 +17,8 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -28,34 +30,82 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; + private final LinkedHashMap parameterValueMap; private final Condition condition; private final Condition thresholdCondition; private final DQRuleLogicalOperator operator; private final List nestedRules; + // Adding this constructor so as to not break the Data Quality ETL package. 
+ public DQRule(final String ruleType, + final Map parameters, + final Condition condition, + final Condition thresholdCondition, + final DQRuleLogicalOperator operator, + final List nestedRules) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = createParameterValueMap(parameters); + this.condition = condition; + this.thresholdCondition = thresholdCondition; + this.operator = operator; + this.nestedRules = nestedRules; + } + public DQRule(final String ruleType, final Map parameters, final Condition condition) { this.ruleType = ruleType; this.parameters = parameters; + this.parameterValueMap = createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = null; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); } + // Can't overload the constructor above, due to type erasure + public static DQRule createFromParameterValueMap(final String ruleType, + final LinkedHashMap parameters, + final Condition condition) { + return createFromParameterValueMap(ruleType, parameters, condition, null); + } + public DQRule(final String ruleType, final Map parameters, final Condition condition, final Condition thresholdCondition) { this.ruleType = ruleType; this.parameters = parameters; + this.parameterValueMap = createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = thresholdCondition; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); } + // Can't overload the constructor above, due to type erasure + public static DQRule createFromParameterValueMap(final String ruleType, + final LinkedHashMap parameters, + final Condition condition, + final Condition thresholdCondition) { + Map paramValuesAsStringsMap = new HashMap<>(); + parameters.forEach((k, v) -> paramValuesAsStringsMap.put(k, v.getValue())); + + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + return new DQRule( 
+ ruleType, + paramValuesAsStringsMap, + parameters, + condition, + thresholdCondition, + operator, + nestedRules + ); + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -63,8 +113,8 @@ public String toString() { if (nestedRules == null || nestedRules.isEmpty()) { sb.append(ruleType); - if (parameters != null) { - parameters.values().forEach(p -> sb.append(" ").append("\"").append(p).append("\"")); + if (parameterValueMap != null) { + parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString())); } if (condition != null) { @@ -89,4 +139,16 @@ public String toString() { return sb.toString(); } + + private static LinkedHashMap createParameterValueMap(Map parameters) { + LinkedHashMap map = new LinkedHashMap<>(); + if (parameters == null) return map; + + // Add quotes when converting from the map of string values, and do not use connector word. + // This is to maintain backwards compatibility. + boolean isQuoted = true; + parameters.forEach((k, v) -> map.put(k, new DQRuleParameterValue(v, isQuoted))); + + return map; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java new file mode 100644 index 0000000..acead65 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java @@ -0,0 +1,53 @@ +/* + * DQRuleParameterValue.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public class DQRuleParameterValue implements Serializable { + private static final String EMPTY_CONNECTOR = ""; + + private final String value; + private final boolean isQuoted; + + // We could use an Optional here, instead of resorting to an empty string. + // But this needs to be serializable for Spark. + // Optional has presented problems in that regard. + private final String connectorWord; + + public DQRuleParameterValue(final String value) { + this.value = value; + this.isQuoted = false; + this.connectorWord = EMPTY_CONNECTOR; + } + + public DQRuleParameterValue(final String value, final boolean isQuoted) { + this.value = value; + this.isQuoted = isQuoted; + this.connectorWord = EMPTY_CONNECTOR; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (!EMPTY_CONNECTOR.equals(connectorWord)) sb.append(connectorWord).append(" "); + String surroundBy = isQuoted ? 
"\"" : ""; + sb.append(surroundBy).append(value).append(surroundBy); + return sb.toString(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 875c1dc..3823118 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -181,7 +181,7 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu } } - dqRules.add(new DQRule("Composite", null, null, null, op, nestedRules)); + dqRules.add(new DQRule("Composite", null, null, null, null, op, nestedRules)); } else if (tlc.dqRule(0) != null) { Either dqRuleEither = getDQRule(tlc.dqRule(0)); if (dqRuleEither.isLeft()) { @@ -217,9 +217,8 @@ public void enterDqAnalyzers(DataQualityDefinitionLanguageParser.DqAnalyzersCont private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); - List parameters = dqRuleContext.parameter().stream() - .map(this::parseParameter) - .collect(Collectors.toList()); + + List parameters = parseParameters(dqRuleContext.parameters()); Optional optionalDQRuleType = DQRuleType.getRuleType(ruleType, parameters.size()); @@ -291,17 +290,15 @@ private Either getDQRule( } return Either.fromRight( - new DQRule(dqRuleType.getRuleTypeName(), parameterMap, condition, - thresholdCondition, DQRuleLogicalOperator.AND, new ArrayList<>()) + new DQRule(dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition) ); } private Either getDQAnalyzer( DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { String analyzerType = dqAnalyzerContext.analyzerType().getText(); - List parameters = dqAnalyzerContext.parameter().stream() - .map(this::parseParameter) - .collect(Collectors.toList()); + + List parameters = 
parseParameters(dqAnalyzerContext.parameters()); // We just use the DQ Rule names to validate what analyzer names to allow. // This might change closer to re:Invent, but keeping it simple for now. @@ -740,6 +737,11 @@ private String removeEscapes(String stringWithEscapes) { return stringWithEscapes; } + private List parseParameters(DataQualityDefinitionLanguageParser.ParametersContext psc) { + if (psc == null || psc.parameter() == null) return new ArrayList<>(); + return psc.parameter().stream().map(this::parseParameter).collect(Collectors.toList()); + } + private String parseParameter(DataQualityDefinitionLanguageParser.ParameterContext pc) { if (pc.QUOTED_STRING() != null) { return removeQuotes(pc.QUOTED_STRING().getText()); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java new file mode 100644 index 0000000..1e50325 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java @@ -0,0 +1,80 @@ +/* + * DQRuleParameterValueTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class DQRuleParameterValueTest { + + @Test + public void test_constructorWithValueArg() { + String value = "col-A"; + DQRuleParameterValue param = new DQRuleParameterValue(value); + assertEquals(value, param.getValue()); + assertFalse(param.isQuoted()); + assertTrue(param.getConnectorWord().isEmpty()); + } + + @Test + public void test_constructorWithValueAndIsQuotedArgs() { + String value = "col-A"; + boolean isQuoted = true; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted); + assertEquals(value, param.getValue()); + assertEquals(isQuoted, param.isQuoted()); + assertTrue(param.getConnectorWord().isEmpty()); + } + + @Test + public void test_parameterValueToStringWithNoConnectorWordAndNoQuotes() { + String value = "col-A"; + String connectorWord = ""; + boolean isQuoted = false; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(value, param.toString()); + } + + @Test + public void test_parameterValueToStringWithConnectorWordAndNoQuotes() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = false; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(String.format("%s %s", connectorWord, value), param.toString()); + } + + @Test + public void test_parameterValueToStringWithConnectorWordAndWithQuotes() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = true; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(String.format("%s \"%s\"", connectorWord, value), param.toString()); + } + + 
@Test + public void test_equalsAndHashCode() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = true; + + DQRuleParameterValue param1 = new DQRuleParameterValue(value, isQuoted, connectorWord); + DQRuleParameterValue param2 = new DQRuleParameterValue(value, isQuoted, connectorWord); + + assertNotSame(param1, param2); + assertEquals(param1, param2); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index ef01836..93860b5 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -29,9 +29,11 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -39,6 +41,9 @@ import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -277,6 +282,38 @@ void test_compositeRulesAreReparseable() throws InvalidDataQualityRulesetExcepti assertEquals(reStringed, rulesetString); } + @Test + void test_constructorWithOriginalParameterMap() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, 
column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules); + + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + assertEquals(operator, rule.getOperator()); + assertTrue(rule.getNestedRules().isEmpty()); + } + @Test void test_parametersWithoutQuotesAreParsed() throws InvalidDataQualityRulesetException { String colA = "colA"; @@ -309,6 +346,19 @@ void test_parametersWithoutQuotesAreParsed() throws InvalidDataQualityRulesetExc assertTrue(Stream.of(allCols).allMatch(c -> parsedAnalyzer2.getParameters().containsValue(c))); } + @Test + public void test_equalsAndHashCode() throws InvalidDataQualityRulesetException { + String rule = "IsPrimaryKey \"colA\" \"colB\""; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + DQRuleset dqRuleset2 = parser.parse(ruleset); + + assertNotSame(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; From 469954f94e5e5178e951725f4a2e892d885e68e7 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: 
Wed, 1 Nov 2023 14:21:34 -0400 Subject: [PATCH 12/50] Populate the DQRuleParameterValue based on the updated grammar - The updated grammar tells us if a connector word is used and if a parameter is in double quotes. - With these changes, we now use this information to set the values in DQRuleParameterValue accordingly. --- configuration/dqdl/CommonLexerRules.g4 | 2 +- .../dqdl/DataQualityDefinitionLanguage.g4 | 7 ++-- .../ml/dataquality/dqdl/model/DQAnalyzer.java | 18 +++++++- .../ml/dataquality/dqdl/model/DQRule.java | 26 +++--------- .../dqdl/model/DQRuleParameterValue.java | 21 ++++++++++ .../ml/dataquality/dqdl/model/DQRuleType.java | 15 +++---- .../dqdl/parser/DQDLParserListener.java | 41 ++++++++++++------- .../ml/dataquality/dqdl/model/DQRuleTest.java | 9 +++- 8 files changed, 86 insertions(+), 53 deletions(-) diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index 617783e..5b219a1 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -32,7 +32,7 @@ NEGATIVE: '-'; LINE_COMMENT: '#' .*? '\r'? '\n' -> skip; // Match "#" stuff '\n' -IDENTIFIER: [a-zA-Z0-9]+; +IDENTIFIER: [a-zA-Z0-9_.]+; WS: [ \t\n]+ -> skip; diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 0d6cdb2..296a3dc 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -88,7 +88,8 @@ ruleType: IDENTIFIER; analyzerType: IDENTIFIER; parameter: QUOTED_STRING | IDENTIFIER; -parameters: OF? parameter (AND? parameter)*; +connectorWord: OF | AND; +parameterWithConnectorWord: connectorWord? parameter; condition: numberBasedCondition @@ -98,8 +99,8 @@ condition: withThresholdCondition: 'with' 'threshold' numberBasedCondition; -dqRule: ruleType parameters? condition?
withThresholdCondition?; -dqAnalyzer: analyzerType parameters?; +dqRule: ruleType parameterWithConnectorWord* condition? withThresholdCondition?; +dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: dqRule diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java index 66b8e84..9bf56ea 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java @@ -13,6 +13,7 @@ import lombok.AllArgsConstructor; import lombok.Getter; +import java.util.LinkedHashMap; import java.util.Map; @AllArgsConstructor @@ -20,14 +21,27 @@ public class DQAnalyzer implements HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; + private final Map parameterValueMap; + + public DQAnalyzer(final String ruleType, + final Map parameters) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(this.parameters); + } + + public static DQAnalyzer createFromValueMap(final String ruleType, + final LinkedHashMap parameters) { + return new DQAnalyzer(ruleType, DQRuleParameterValue.createParameterMap(parameters), parameters); + } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(ruleType); - if (parameters != null) { - parameters.values().forEach(p -> sb.append(" ").append("\"").append(p).append("\"")); + if (parameterValueMap != null) { + parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString())); } return sb.toString(); diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index c95178c..482815c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -17,7 +17,6 @@ import 
java.io.Serializable; import java.util.ArrayList; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -30,7 +29,7 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; - private final LinkedHashMap parameterValueMap; + private final Map parameterValueMap; private final Condition condition; private final Condition thresholdCondition; private final DQRuleLogicalOperator operator; @@ -45,7 +44,7 @@ public DQRule(final String ruleType, final List nestedRules) { this.ruleType = ruleType; this.parameters = parameters; - this.parameterValueMap = createParameterValueMap(parameters); + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = thresholdCondition; this.operator = operator; @@ -57,7 +56,7 @@ public DQRule(final String ruleType, final Condition condition) { this.ruleType = ruleType; this.parameters = parameters; - this.parameterValueMap = createParameterValueMap(parameters); + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = null; this.operator = DQRuleLogicalOperator.AND; @@ -77,7 +76,7 @@ public DQRule(final String ruleType, final Condition thresholdCondition) { this.ruleType = ruleType; this.parameters = parameters; - this.parameterValueMap = createParameterValueMap(parameters); + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = thresholdCondition; this.operator = DQRuleLogicalOperator.AND; @@ -89,15 +88,12 @@ public static DQRule createFromParameterValueMap(final String ruleType, final LinkedHashMap parameters, final Condition condition, final Condition thresholdCondition) { - Map paramValuesAsStringsMap = new HashMap<>(); - parameters.forEach((k, v) -> 
paramValuesAsStringsMap.put(k, v.getValue())); - DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); return new DQRule( ruleType, - paramValuesAsStringsMap, + DQRuleParameterValue.createParameterMap(parameters), parameters, condition, thresholdCondition, @@ -139,16 +135,4 @@ public String toString() { return sb.toString(); } - - private static LinkedHashMap createParameterValueMap(Map parameters) { - LinkedHashMap map = new LinkedHashMap<>(); - if (parameters == null) return map; - - // Add quotes when converting from the map of string values, and do not use connector word. - // This is to maintain backwards compatibility. - boolean isQuoted = true; - parameters.forEach((k, v) -> map.put(k, new DQRuleParameterValue(v, isQuoted))); - - return map; - } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java index acead65..c61643a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java @@ -15,6 +15,9 @@ import lombok.Getter; import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; @AllArgsConstructor @Getter @@ -50,4 +53,22 @@ public String toString() { sb.append(surroundBy).append(value).append(surroundBy); return sb.toString(); } + + public static Map createParameterValueMap(Map parameters) { + Map map = new HashMap<>(); + if (parameters == null) return map; + + // Add quotes when converting from the map of string values, and do not use connector word. + // This is to maintain backwards compatibility. 
+ boolean isQuoted = true; + parameters.forEach((k, v) -> map.put(k, new DQRuleParameterValue(v, isQuoted))); + + return map; + } + + public static Map createParameterMap(Map parameters) { + Map paramValuesAsStringsMap = new LinkedHashMap<>(); + parameters.forEach((k, v) -> paramValuesAsStringsMap.put(k, v.getValue())); + return paramValuesAsStringsMap; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 0196984..8af11b1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets; import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; @@ -71,11 +70,9 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, } public Optional verifyParameters(List expectedParameters, - List actualParameters) { + List actualParameters) { if (!expectedParameters.isEmpty()) { - - boolean isVarArg = expectedParameters.get( - expectedParameters.size() - 1).isVarArg(); + boolean isVarArg = expectedParameters.get(expectedParameters.size() - 1).isVarArg(); if (isVarArg) { if (expectedParameters.size() > actualParameters.size()) { @@ -93,9 +90,9 @@ public Optional verifyParameters(List expectedParameter return Optional.empty(); } - public Map createParameterMap(List dqRuleTypeParameters, - List actualParameters) { - Map parameterMap = new LinkedHashMap<>(); + public LinkedHashMap createParameterMap(List dqRuleTypeParameters, + List actualParameters) { + LinkedHashMap parameterMap = new LinkedHashMap<>(); for (int i = 0; i < dqRuleTypeParameters.size(); i++) { String dqRuleTypeParameterName = dqRuleTypeParameters.get(i).getName(); @@ -109,7 +106,7 @@ public Map createParameterMap(List dqRuleTypePa for (int j = counter; j < 
actualParameters.size(); j++) { String newDqRuleTypeParameterName = dqRuleTypeParameterName + (j + 1); - String actualParameterName = actualParameters.get(j); + DQRuleParameterValue actualParameterName = actualParameters.get(j); parameterMap.put(newDqRuleTypeParameterName, actualParameterName); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 3823118..0b6cbea 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -12,6 +12,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.DQAnalyzer; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleParameterValue; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; @@ -40,6 +41,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -218,7 +220,7 @@ private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); - List parameters = parseParameters(dqRuleContext.parameters()); + List parameters = parseParameters(dqRuleContext.parameterWithConnectorWord()); Optional optionalDQRuleType = DQRuleType.getRuleType(ruleType, parameters.size()); @@ -238,7 +240,8 @@ private Either getDQRule( return Either.fromLeft(String.format(errorMessage.get() + ": %s", ruleType)); } - Map parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + LinkedHashMap parameterMap = + 
dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); Condition condition; @@ -290,7 +293,8 @@ private Either getDQRule( } return Either.fromRight( - new DQRule(dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition) + DQRule.createFromParameterValueMap( + dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition) ); } @@ -298,7 +302,7 @@ private Either getDQAnalyzer( DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { String analyzerType = dqAnalyzerContext.analyzerType().getText(); - List parameters = parseParameters(dqAnalyzerContext.parameters()); + List parameters = parseParameters(dqAnalyzerContext.parameterWithConnectorWord()); // We just use the DQ Rule names to validate what analyzer names to allow. // This might change closer to re:Invent, but keeping it simple for now. @@ -320,9 +324,10 @@ private Either getDQAnalyzer( return Either.fromLeft(String.format(errorMessage.get() + ": %s", analyzerType)); } - Map parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + LinkedHashMap parameterMap = + dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); - return Either.fromRight(new DQAnalyzer(analyzerType, parameterMap)); + return Either.fromRight(DQAnalyzer.createFromValueMap(analyzerType, parameterMap)); } private Either parseCondition( @@ -737,18 +742,24 @@ private String removeEscapes(String stringWithEscapes) { return stringWithEscapes; } - private List parseParameters(DataQualityDefinitionLanguageParser.ParametersContext psc) { - if (psc == null || psc.parameter() == null) return new ArrayList<>(); - return psc.parameter().stream().map(this::parseParameter).collect(Collectors.toList()); + private List parseParameters( + List parameters) { + if (parameters == null) return new ArrayList<>(); + return parameters.stream().map(this::parseParameter).collect(Collectors.toList()); } - private String 
parseParameter(DataQualityDefinitionLanguageParser.ParameterContext pc) { - if (pc.QUOTED_STRING() != null) { - return removeQuotes(pc.QUOTED_STRING().getText()); - } else if (pc.IDENTIFIER() != null) { - return pc.IDENTIFIER().getText(); + private DQRuleParameterValue parseParameter( + DataQualityDefinitionLanguageParser.ParameterWithConnectorWordContext pc) { + String connectorWord = pc.connectorWord() == null ? "" : pc.connectorWord().getText(); + + if (pc.parameter().QUOTED_STRING() != null) { + return new DQRuleParameterValue( + removeQuotes(pc.parameter().QUOTED_STRING().getText()), true, connectorWord); + } else if (pc.parameter().IDENTIFIER() != null) { + return new DQRuleParameterValue( + pc.parameter().IDENTIFIER().getText(), false, connectorWord); } else { - return pc.getText(); + return new DQRuleParameterValue(pc.parameter().getText(), true, connectorWord); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 93860b5..1fb4696 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -78,6 +78,7 @@ private static Stream provideRawRules() { // Arguments.of("FileCount between -10000 and -1000"), Arguments.of("IsPrimaryKey \"colA\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\""), + Arguments.of("IsPrimaryKey colA \"col B\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\" \"colC\""), Arguments.of("RowCount = 100"), Arguments.of("RowCount = -100"), @@ -86,6 +87,7 @@ private static Stream provideRawRules() { Arguments.of("RowCountMatch \"reference\" >= 0.95"), Arguments.of("RowCountMatch \"reference\" between 0.8 and 0.98"), Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"), + Arguments.of("Completeness of col_1 between 0.5 and 0.8"), Arguments.of("IsComplete \"col_1\""), Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"), 
Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), @@ -100,6 +102,8 @@ private static Stream provideRawRules() { Arguments.of("ColumnCount = avg(std(last(percentile(1,2,3))))"), Arguments.of("ColumnCount > -100.123456"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8"), + Arguments.of("ColumnCorrelation of col_1 col_2 between 0.4 and 0.8"), + Arguments.of("ColumnCorrelation of col_1 and \"col abc\" between 0.4 and 0.8"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between -0.44444 and 0.888888"), Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2"), Arguments.of("IsUnique \"col_1\""), @@ -160,8 +164,9 @@ private static Stream provideRawRules() { Arguments.of("CustomSql \"select col-A from primary\""), Arguments.of("CustomSql \"select col-A from primary\" with threshold > 0.5"), Arguments.of("DetectAnomalies \"RowCount\""), - Arguments.of("DetectAnomalies \"Completeness\" \"colA\""), - Arguments.of("DetectAnomalies \"ColumnCorrelation\" \"colA\" \"colB\"") + Arguments.of("DetectAnomalies of RowCount"), + Arguments.of("DetectAnomalies of Completeness of \"colA\""), + Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\"") ); } From 882effa63e66b5f0fc073077d32e0d24d91f741e Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 7 Nov 2023 12:33:18 -0500 Subject: [PATCH 13/50] Fixed DQRule's toString method so that it does not add newlines when there are no rules - Added tests for empty rules/missing rules alongside analyzers. 
--- .../ml/dataquality/dqdl/model/DQRuleset.java | 20 ++++-- .../dataquality/dqdl/model/DQRulesetTest.java | 68 +++++++++++++++++++ 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java index d27613a..87161be 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java @@ -81,11 +81,14 @@ public String toString() { "}"; } - String rulesStr = "Rules = [" + LINE_SEP + - rules.stream() - .map(i -> " " + i) - .collect(Collectors.joining("," + LINE_SEP)) + - LINE_SEP + "]"; + String rulesStr = ""; + if (!rules.isEmpty()) { + rulesStr = "Rules = [" + LINE_SEP + + rules.stream() + .map(i -> " " + i) + .collect(Collectors.joining("," + LINE_SEP)) + + LINE_SEP + "]"; + } String analyzersStr = ""; if (!analyzers.isEmpty()) { @@ -105,10 +108,13 @@ public String toString() { sb.append(sourcesStr).append(LINE_SEP).append(LINE_SEP); } - sb.append(rulesStr); + if (!rulesStr.isEmpty()) { + sb.append(rulesStr); + } if (!analyzersStr.isEmpty()) { - sb.append(LINE_SEP).append(LINE_SEP).append(analyzersStr); + if (!rulesStr.isEmpty()) sb.append(LINE_SEP).append(LINE_SEP); + sb.append(analyzersStr); } return sb.toString(); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 0380b1a..390b636 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -15,6 +15,8 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -214,6 +216,72 @@ void 
test_isPrimaryCheckWithMetadataAndSourcesAndAnalyzers() { assertEquals(dqdlFormatted, dqRuleset.toString()); } + @Test + void test_rulesetWithAnalyzersAndEmptyRules() { + String dqdl = "Rules = [] Analyzers = [ RowCount ]"; + try { + dqdlParser.parse(dqdl); + } catch (InvalidDataQualityRulesetException e) { + assertTrue(e.getMessage().contains("No rules provided")); + } + } + + @Test + void test_rulesetWithMetadataAndSourcesAndAnalyzersAndNoRules() { + String dqdl = + "Metadata = { \"Version\": \"1.0\" }" + LINE_SEP + + "DataSources = {" + + " \"Primary\": \"orders-table\", " + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + + "Analyzers = [ RowCount, Completeness \"colA\", Uniqueness of col_A ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals("orders-table", dqRuleset.getPrimarySourceName()); + assertEquals(1, dqRuleset.getAdditionalDataSourcesNames().size()); + assertEquals("ref-table", dqRuleset.getAdditionalDataSourcesNames().get(0)); + assertEquals(0, dqRuleset.getRules().size()); + assertEquals(3, dqRuleset.getAnalyzers().size()); + assertEquals("RowCount", dqRuleset.getAnalyzers().get(0).getRuleType()); + assertEquals("Completeness", dqRuleset.getAnalyzers().get(1).getRuleType()); + assertEquals("Uniqueness", dqRuleset.getAnalyzers().get(2).getRuleType()); + + String dqdlFormatted = + "Metadata = {" + LINE_SEP + + " \"Version\": \"1.0\"" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "DataSources = {" + LINE_SEP + + " \"Primary\": \"orders-table\"," + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "Analyzers = [" + LINE_SEP + + " RowCount," + LINE_SEP + + " Completeness \"colA\"," + LINE_SEP + + " Uniqueness of col_A" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + + @Test + void test_rulesetWithAnalyzersAndNoRules() { + String dqdl = "Analyzers = [ Completeness \"colA\", AllStatistics of AllColumns, Uniqueness of 
\"col_A\" ]"; + DQRuleset dqRuleset = parseDQDL(dqdl); + + List analyzers = dqRuleset.getAnalyzers(); + assertEquals(3, analyzers.size()); + assertEquals("Completeness", analyzers.get(0).getRuleType()); + assertEquals("AllStatistics", analyzers.get(1).getRuleType()); + assertEquals("Uniqueness", analyzers.get(2).getRuleType()); + + String dqdlFormatted = + "Analyzers = [" + LINE_SEP + + " Completeness \"colA\"," + LINE_SEP + + " AllStatistics of AllColumns," + LINE_SEP + + " Uniqueness of \"col_A\"" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + @Disabled void test_jobStatusRuleWithEqualityCheck() { String dqdl = "Rules = [ JobStatus = \"SUCCEEDED\" ]"; From 788f6facbab3d1c6530a9041e4badbb53f65d6fa Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 7 Nov 2023 14:31:59 -0500 Subject: [PATCH 14/50] Allow empty rules or empty analyzers but not both - Glue Studio configures a new job with an empty rules section (Rules = [ ]). When a customer adds Analyzers, we need to allow this ruleset to succeed. 
--- .../dqdl/DataQualityDefinitionLanguage.g4 | 4 +- .../ml/dataquality/dqdl/model/DQRuleset.java | 4 +- .../dqdl/parser/DQDLParserListener.java | 12 +++-- .../dataquality/dqdl/model/DQRulesetTest.java | 44 ++++++++++++++++--- .../dqdl/parser/InvalidDQRulesetTest.java | 3 +- 5 files changed, 49 insertions(+), 18 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 296a3dc..8ebb360 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -116,7 +116,9 @@ rules: rulesSectionStart EQUAL_TO LBRAC dqRules RBRAC | rulesSectionStart EQUAL_TO LBRAC RBRAC; // empty array -analyzers: analyzersSectionStart EQUAL_TO LBRAC dqAnalyzers RBRAC; +analyzers: + analyzersSectionStart EQUAL_TO LBRAC dqAnalyzers RBRAC + | analyzersSectionStart EQUAL_TO LBRAC RBRAC; // empty array // This dictionary does not support nested dictionaries. Just strings and arrays. 
dictionary: LCURL pair (COMMA pair)* RCURL; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java index 87161be..dec8804 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java @@ -113,7 +113,9 @@ public String toString() { } if (!analyzersStr.isEmpty()) { - if (!rulesStr.isEmpty()) sb.append(LINE_SEP).append(LINE_SEP); + if (!rulesStr.isEmpty()) { + sb.append(LINE_SEP).append(LINE_SEP); + } sb.append(analyzersStr); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 0b6cbea..de99660 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -79,6 +79,11 @@ public DQDLParserListener(DQDLErrorListener errorListener) { } public Either, DQRuleset> getParsedRuleset() { + // Only add this error message if we did not walk the tree due to empty rules or analyzers sections. 
+ if (errorMessages.isEmpty() && dqRules.isEmpty() && dqAnalyzers.isEmpty()) { + errorMessages.add("No rules or analyzers provided."); + } + if (errorMessages.isEmpty() && errorListener.getErrorMessages().isEmpty()) { return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqAnalyzers)); } else { @@ -115,13 +120,6 @@ public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ct } } - @Override - public void enterRules(DataQualityDefinitionLanguageParser.RulesContext ctx) { - if (ctx.dqRules() == null) { - errorMessages.add("No rules provided."); - } - } - @Override public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) { DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary(); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 390b636..6d37718 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -15,7 +15,10 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -217,13 +220,40 @@ void test_isPrimaryCheckWithMetadataAndSourcesAndAnalyzers() { } @Test - void test_rulesetWithAnalyzersAndEmptyRules() { - String dqdl = "Rules = [] Analyzers = [ RowCount ]"; - try { - dqdlParser.parse(dqdl); - } catch (InvalidDataQualityRulesetException e) { - assertTrue(e.getMessage().contains("No rules provided")); - } + void test_rulesetWithAnalyzersAndEmptyOrMissingRules() { + String dqdl1 = "Analyzers = [ RowCount, Completeness of \"col-A\" ]"; + String dqdl2 = "Rules = [] Analyzers = [ RowCount, Completeness of 
\"col-A\" ]"; + + Arrays.asList(dqdl1, dqdl2).forEach(dqdl -> { + DQRuleset ruleset = parseDQDL(dqdl); + List dqRules = ruleset.getRules(); + List dqAnalyzers = ruleset.getAnalyzers(); + + assertEquals(0, dqRules.size()); + assertEquals(2, dqAnalyzers.size()); + assertEquals("RowCount", dqAnalyzers.get(0).getRuleType()); + assertEquals(0, dqAnalyzers.get(0).getParameterValueMap().size()); + assertEquals("Completeness", dqAnalyzers.get(1).getRuleType()); + assertTrue(dqAnalyzers.get(1).getParameterValueMap().containsKey("TargetColumn")); + assertEquals("col-A", dqAnalyzers.get(1).getParameterValueMap().get("TargetColumn").getValue()); + }); + } + + @Test + void test_rulesetWithEmptyAnalyzersAndEmptyRules() { + Arrays.asList( + "Rules = []", + "Analyzers = []", + "Rules = [] Analyzers = []" + ).forEach(ruleset -> { + try { + dqdlParser.parse(ruleset); + fail("Ruleset parsing should have failed"); + } catch (InvalidDataQualityRulesetException e) { + System.out.println(e.getMessage()); + assertTrue(e.getMessage().contains("No rules or analyzers provided")); + } + }); } @Test diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index ef8036d..fc9a9e5 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -84,8 +84,7 @@ private static Stream provideInvalidRulesets() { private static Stream provideInvalidRulesetsWithAnalyzers() { return Stream.of( - Arguments.of("Rules = [ ] Analyzers = [ Completeness \"colA\" ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ ]"), + Arguments.of("Rules = [ ] Analyzers = [ ]"), Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ IsComplete \"colA\" ]"), Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), Arguments.of("Rules = [ IsPrimaryKey 
\"col-A\"] Analyzers = [ Completeness \"colA\", Foo ]"), From 391add8119e08eea444b8d05e029dc55203bef31 Mon Sep 17 00:00:00 2001 From: Tyler McDaniel Date: Fri, 10 Nov 2023 15:01:07 -0500 Subject: [PATCH 15/50] Add logging for condition evaluation to assist with dynamic rule debugging. --- pom.xml | 9 ++++ .../number/NumberBasedCondition.java | 51 ++++++++++++++++--- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 7021831..417a331 100644 --- a/pom.xml +++ b/pom.xml @@ -15,6 +15,7 @@ 2.12.7.1 5.9.1 1.18.28 + 2.0.16 3.11.0 1.8 1.8 @@ -56,6 +57,14 @@ provided + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + provided + + org.antlr diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index d403f1c..3dea2f1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -15,16 +15,21 @@ import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import java.text.DecimalFormat; import java.util.List; import java.util.stream.Collectors; @Getter @EqualsAndHashCode(callSuper = true) +@Slf4j public class NumberBasedCondition extends Condition { private final NumberBasedConditionOperator operator; private final List operands; + private static final DecimalFormat OP_FORMAT = new DecimalFormat("#.###"); + public NumberBasedCondition(final String conditionAsString, final NumberBasedConditionOperator operator, final List operands) { @@ -39,28 +44,60 @@ public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator List operandsAsDouble = operands.stream() .map(operand -> evaluator.evaluate(dqRule, operand)).collect(Collectors.toList()); + + 
log.info(String.format("Evaluating condition for rule: %s", dqRule)); + List formatOps = operandsAsDouble.stream().map(OP_FORMAT::format).collect(Collectors.toList()); + String formatMetric = OP_FORMAT.format(metric); + switch (operator) { case BETWEEN: if (operands.size() != 2) return false; - else return metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1); + else { + boolean result = metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1); + log.info("{} between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result); + return result; + } case GREATER_THAN_EQUAL_TO: if (operands.size() != 1) return false; - else return metric >= operandsAsDouble.get(0); + else { + boolean result = metric >= operandsAsDouble.get(0); + log.info("{} >= {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case GREATER_THAN: if (operands.size() != 1) return false; - else return metric > operandsAsDouble.get(0); + else { + boolean result = metric > operandsAsDouble.get(0); + log.info("{} > {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case LESS_THAN_EQUAL_TO: if (operands.size() != 1) return false; - else return metric <= operandsAsDouble.get(0); + else { + boolean result = metric <= operandsAsDouble.get(0); + log.info("{} <= {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case LESS_THAN: if (operands.size() != 1) return false; - else return metric < operandsAsDouble.get(0); + else { + boolean result = metric < operandsAsDouble.get(0); + log.info("{} < {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case EQUALS: if (operands.size() != 1) return false; - else return metric.equals(operandsAsDouble.get(0)); + else { + boolean result = metric.equals(operandsAsDouble.get(0)); + log.info("{} == {}? 
{}", formatMetric, formatOps.get(0), result); + return result; + } case IN: - return operandsAsDouble.contains(metric); + boolean result = operandsAsDouble.contains(metric); + log.info("{} in {}? {}", formatMetric, formatOps, result); + return result; default: + log.error("Unknown operator"); return false; } } From 26fcec7a5822a8adf24768cc1c9a33a577a53af7 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Fri, 26 Jan 2024 16:55:25 -0500 Subject: [PATCH 16/50] Add support for where clause --- .../dqdl/DataQualityDefinitionLanguage.g4 | 4 +- .../ml/dataquality/dqdl/model/DQRule.java | 33 ++++++- .../dqdl/parser/DQDLParserListener.java | 15 +++- .../ml/dataquality/dqdl/model/DQRuleTest.java | 86 ++++++++++++++++++- 4 files changed, 131 insertions(+), 7 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 8ebb360..a3b76d2 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -99,7 +99,9 @@ condition: withThresholdCondition: 'with' 'threshold' numberBasedCondition; -dqRule: ruleType parameterWithConnectorWord* condition? withThresholdCondition?; +whereClause: 'where' quotedString; + +dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
withThresholdCondition?; dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 482815c..0b2eec1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -34,8 +34,26 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final Condition thresholdCondition; private final DQRuleLogicalOperator operator; private final List nestedRules; + private final String whereClause; // Adding this constructor so as to not break the Data Quality ETL package. + public DQRule(final String ruleType, + final Map parameters, + final Condition condition, + final Condition thresholdCondition, + final DQRuleLogicalOperator operator, + final List nestedRules, + final String whereClause) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); + this.condition = condition; + this.thresholdCondition = thresholdCondition; + this.operator = operator; + this.nestedRules = nestedRules; + this.whereClause = whereClause; + } + public DQRule(final String ruleType, final Map parameters, final Condition condition, @@ -49,6 +67,7 @@ public DQRule(final String ruleType, this.thresholdCondition = thresholdCondition; this.operator = operator; this.nestedRules = nestedRules; + this.whereClause = null; } public DQRule(final String ruleType, @@ -61,13 +80,14 @@ public DQRule(final String ruleType, this.thresholdCondition = null; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); + this.whereClause = null; } // Can't overload the constructor above, due to type erasure public static DQRule createFromParameterValueMap(final String ruleType, final LinkedHashMap parameters, final Condition condition) { - return 
createFromParameterValueMap(ruleType, parameters, condition, null); + return createFromParameterValueMap(ruleType, parameters, condition, null, null); } public DQRule(final String ruleType, @@ -81,13 +101,15 @@ public DQRule(final String ruleType, this.thresholdCondition = thresholdCondition; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); + this.whereClause = null; } // Can't overload the constructor above, due to type erasure public static DQRule createFromParameterValueMap(final String ruleType, final LinkedHashMap parameters, final Condition condition, - final Condition thresholdCondition) { + final Condition thresholdCondition, + final String whereClause) { DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); @@ -98,7 +120,8 @@ public static DQRule createFromParameterValueMap(final String ruleType, condition, thresholdCondition, operator, - nestedRules + nestedRules, + whereClause ); } @@ -123,6 +146,10 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } + if (whereClause != null) { + if (!isBlank(whereClause)) sb.append(" where ").append("\"" + whereClause + "\""); + } + return sb.toString(); } else { for (int i = 0; i < nestedRules.size(); i++) { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index de99660..2aad264 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -181,7 +181,7 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu } } - dqRules.add(new DQRule("Composite", null, null, null, null, op, nestedRules)); + dqRules.add(new DQRule("Composite", null, null, null, op, nestedRules)); } else if (tlc.dqRule(0) != null) { Either dqRuleEither = 
getDQRule(tlc.dqRule(0)); if (dqRuleEither.isLeft()) { @@ -290,9 +290,20 @@ private Either getDQRule( } } + String whereClause = null; + if (dqRuleContext.whereClause() != null) { + DataQualityDefinitionLanguageParser.WhereClauseContext ctx = dqRuleContext.whereClause(); + if (ctx.quotedString().getText().isEmpty() || ctx.quotedString().getText().equals("\"\"")) { + return Either.fromLeft( + String.format("Empty where condition provided for rule type: %s", ruleType)); + } else { + whereClause = removeQuotes(ctx.quotedString().getText()); + } + } + return Either.fromRight( DQRule.createFromParameterValueMap( - dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition) + dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition, whereClause) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 1fb4696..2f386dd 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -44,6 +44,7 @@ import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotSame; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -303,7 +304,9 @@ void test_constructorWithOriginalParameterMap() { DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); - DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules); + String whereClause = null; + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules, whereClause); assertEquals(ruleType, rule.getRuleType()); @@ -364,6 +367,87 @@ public void 
test_equalsAndHashCode() throws InvalidDataQualityRulesetException { assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); } + @Test + public void test_whereClause() throws InvalidDataQualityRulesetException { + String rule = "IsComplete \"colA\" where \"colB is NOT NULL\""; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + DQRuleset dqRuleset2 = parser.parse(ruleset); + + assertNotSame(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); + } + + @Test + void test_whereClauseRuleToStringFromRule() throws InvalidDataQualityRulesetException { + Map parameters = new HashMap<>(); + parameters.put("TargetColumn", "colA"); + DQRule dqRule = new DQRule("IsComplete", parameters, new Condition(""), null, + DQRuleLogicalOperator.AND, null, "colB is NOT NULL"); + String ruleString = "IsComplete \"colA\" where \"colB is NOT NULL\""; + assertEquals(dqRule.toString(), ruleString); + assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + + @Test + void test_whereClauseRuleToStringFromRuleset() throws InvalidDataQualityRulesetException { + String ruleString = "IsComplete \"colA\" where \"colB is NOT NULL\""; + String ruleset = String.format("Rules = [ %s ]", ruleString); + DQRuleset dqRuleset = parser.parse(ruleset); + DQRule dqRule = dqRuleset.getRules().get(0); + assertEquals(dqRule.toString(), ruleString); + assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + + @Test + void test_whereClauseNeedsQuotedSQLStatement() { + String rule = "IsComplete \"colA\" where \"\""; + String ruleset = String.format("Rules = [ %s ]", rule); + assertThrows(InvalidDataQualityRulesetException.class, () -> parser.parse(ruleset)); + } + + @Test + void test_whereClauseCannotBeEmpty() { + String rule = "IsComplete \"colA\" where \"\""; + String ruleset = String.format("Rules = [ %s ]", rule); + 
assertThrows(InvalidDataQualityRulesetException.class, () -> parser.parse(ruleset)); + } + + @Test + void test_constructorWithWhereClause() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + String whereClause = "\"colB is NOT NULL\""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules, whereClause); + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + assertEquals(operator, rule.getOperator()); + assertTrue(rule.getNestedRules().isEmpty()); + assertEquals(rule.getWhereClause(), whereClause); + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; From 468d5424c7fdb3dd8e806e780f9b6ad55c747b97 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Thu, 8 Feb 2024 12:34:19 -0500 Subject: [PATCH 17/50] Give rule components @EqualsAndHashcode --- .../number/BinaryExpressionOperand.java | 2 ++ .../condition/number/FunctionCallOperand.java | 2 ++ .../condition/number/NumericOperand.java | 2 ++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 22 ++++++++++++++----- 
4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java index 12d1ed4..3088131 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java @@ -10,6 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; +import lombok.EqualsAndHashCode; import lombok.Getter; /* @@ -19,6 +20,7 @@ * The purpose of this operand is for combining with a dynamic function call operand to create dynamic rule thresholds. */ @Getter +@EqualsAndHashCode(callSuper = true) public class BinaryExpressionOperand extends NumericOperand { private final String operator; private final NumericOperand operand1; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java index 5dccf68..1cf2183 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java @@ -10,6 +10,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; +import lombok.EqualsAndHashCode; import lombok.Getter; import java.util.List; @@ -22,6 +23,7 @@ * Through the use of function call operands, we introduce the concept of dynamic rules in DQDL. 
*/ @Getter +@EqualsAndHashCode(callSuper = true) public class FunctionCallOperand extends NumericOperand { private final String functionName; private final List operands; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java index 6fb74ba..9623996 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java @@ -11,12 +11,14 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; import lombok.Getter; import java.io.Serializable; @AllArgsConstructor @Getter +@EqualsAndHashCode public abstract class NumericOperand implements Serializable { private final String operand; private final boolean isParenthesized; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 2f386dd..e13344d 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -69,14 +69,24 @@ void test_ruleParsingAndGeneratingWithParser(String rule) { } } + @ParameterizedTest + @MethodSource("provideRawRules") + void test_rulesEqualWhenRepresentationsEqual(String ruleStringRepr) { + try { + DQRule rule1 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0); + DQRule rule2 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0); + + assertEquals(rule1, rule2); + assertTrue(rule1.equals(rule2)); + assertEquals(rule1.hashCode(), rule2.hashCode()); + assertNotSame(rule1, rule2); + } catch (InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + private static Stream provideRawRules() { return Stream.of( - // Arguments.of("JobStatus = 
\"SUCCEEDED\""), - // Arguments.of("JobStatus in [\"SUCCEEDED\",\"READY\"]"), - // Arguments.of("JobDuration between 10 and 1000"), - // Arguments.of("JobDuration between -10 and 1000"), - // Arguments.of("FileCount between 10 and 100"), - // Arguments.of("FileCount between -10000 and -1000"), Arguments.of("IsPrimaryKey \"colA\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\""), Arguments.of("IsPrimaryKey colA \"col B\""), From c64150bea07cd9a17afbb0ad82ffc8d6d054b181 Mon Sep 17 00:00:00 2001 From: Dongying Song Date: Wed, 14 Feb 2024 09:55:32 -0500 Subject: [PATCH 18/50] Add is_composite_rule_evaluation_row_level_supported flag --- configuration/rules/rules-config.json | 16 +++++++++--- .../ml/dataquality/dqdl/model/DQRule.java | 10 ++++--- .../ml/dataquality/dqdl/model/DQRuleType.java | 4 +++ .../dqdl/parser/DQDLParserListener.java | 2 +- .../ml/dataquality/dqdl/model/DQRuleTest.java | 26 +++++++++++++++++++ .../dqdl/model/DeserializationTest.java | 8 ++++-- 6 files changed, 56 insertions(+), 10 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index ea82ccb..62cd1b8 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -38,6 +38,7 @@ } ], "return_type": "NUMBER", + "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -55,7 +56,7 @@ }, { "rule_type_name": "ColumnDataType", - "description": "Check the data type of the given column", + "description": "Check the data type of the given column. 
Supported values: Boolean, Date, Timestamp, Integer, Double, Float, Long", "parameters": [ { "type": "String", @@ -65,6 +66,7 @@ ], "return_type": "STRING", "is_threshold_supported": true, + "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -122,6 +124,7 @@ } ], "return_type": "NUMBER", + "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -174,6 +177,7 @@ } ], "return_type": "NUMBER", + "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -254,6 +258,7 @@ ], "return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY", "is_threshold_supported": true, + "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -281,6 +286,7 @@ ], "return_type": "NUMBER|BOOLEAN", "is_threshold_supported": true, + "is_composite_rule_evaluation_row_level_supported": false, "scope": "table" }, { @@ -299,6 +305,7 @@ } ], "return_type": "NUMBER", + "is_composite_rule_evaluation_row_level_supported": false, "scope": "table" }, { @@ -317,6 +324,7 @@ } ], "return_type": "NUMBER", + "is_composite_rule_evaluation_row_level_supported": false, "scope": "table" }, { @@ -385,7 +393,8 @@ } ], "return_type": "BOOLEAN", - "scope": "column" + "scope": "column", + "experimental": true }, { "rule_type_name": "AllStatistics", @@ -399,7 +408,8 @@ } ], "return_type": "NUMBER", - "scope": "column" + "scope": "column", + "experimental": true } ] } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 0b2eec1..91e3554 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -35,6 +35,7 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final DQRuleLogicalOperator operator; private final List nestedRules; private final String whereClause; + private Boolean 
isCompositeRuleEvaluationRowLevelSupported = true; // Adding this constructor so as to not break the Data Quality ETL package. public DQRule(final String ruleType, @@ -84,7 +85,7 @@ public DQRule(final String ruleType, } // Can't overload the constructor above, due to type erasure - public static DQRule createFromParameterValueMap(final String ruleType, + public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition) { return createFromParameterValueMap(ruleType, parameters, condition, null, null); @@ -105,7 +106,7 @@ public DQRule(final String ruleType, } // Can't overload the constructor above, due to type erasure - public static DQRule createFromParameterValueMap(final String ruleType, + public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition, final Condition thresholdCondition, @@ -114,14 +115,15 @@ public static DQRule createFromParameterValueMap(final String ruleType, List nestedRules = new ArrayList<>(); return new DQRule( - ruleType, + ruleType.getRuleTypeName(), DQRuleParameterValue.createParameterMap(parameters), parameters, condition, thresholdCondition, operator, nestedRules, - whereClause + whereClause, + ruleType.isCompositeRuleEvaluationRowLevelSupported() ); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 8af11b1..e206ab8 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -32,6 +32,7 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; + private final boolean isCompositeRuleEvaluationRowLevelSupported; private final boolean isAnalyzerOnly; private final String scope; private final boolean isExperimental; @@ -43,6 
+44,8 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, + @JsonProperty(value = "is_composite_rule_evaluation_row_level_supported") + boolean isCompositeRuleEvaluationRowLevelSupported, @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, @JsonProperty(value = "scope") String scope, @JsonProperty(value = "experimental") boolean isExperimental) { @@ -51,6 +54,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; + this.isCompositeRuleEvaluationRowLevelSupported = isCompositeRuleEvaluationRowLevelSupported; this.isAnalyzerOnly = isAnalyzerOnly; this.scope = scope; this.isExperimental = isExperimental; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 2aad264..e3a3dcc 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -303,7 +303,7 @@ private Either getDQRule( return Either.fromRight( DQRule.createFromParameterValueMap( - dqRuleType.getRuleTypeName(), parameterMap, condition, thresholdCondition, whereClause) + dqRuleType, parameterMap, condition, thresholdCondition, whereClause) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index e13344d..65955e8 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -458,6 +458,32 @@ void test_constructorWithWhereClause() { 
assertEquals(rule.getWhereClause(), whereClause); } + @Test + void test_constructorWithParametersAndCondition() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition); + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java index 081a29a..ce14834 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java @@ -76,6 +76,7 @@ public void test_parseDQRuleType() throws JsonProcessingException { String ruleTypeDesc = "This rule matches two datasets"; String returnType = "STRING"; boolean isThresholdSupported = true; + boolean isCompositeRuleEvaluationRowLevelSupported = false; // Parameter 1 String param1Type = "String"; @@ -98,15 +99,18 @@ public void test_parseDQRuleType() throws JsonProcessingException { 
"\"description\":\"%s\"," + "\"parameters\": [ %s, %s ]," + "\"return_type\": \"%s\"," + - "\"is_threshold_supported\": %s" + + "\"is_threshold_supported\": \"%s\"," + + "\"is_composite_rule_evaluation_row_level_supported\": %s" + "}", - ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported); + ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported, + isCompositeRuleEvaluationRowLevelSupported); DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); assertEquals(ruleTypeName, ruleType.getRuleTypeName()); assertEquals(ruleTypeDesc, ruleType.getDescription()); assertEquals(returnType, ruleType.getReturnType()); assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); + assertEquals(isCompositeRuleEvaluationRowLevelSupported, ruleType.isCompositeRuleEvaluationRowLevelSupported()); DQRuleParameter param1 = ruleType.getParameters().get(0); assertEquals(param1Type, param1.getType()); From 5921da702a580559439db6b2bab114157f7fc95f Mon Sep 17 00:00:00 2001 From: Andrius Juodelis Date: Tue, 13 Feb 2024 13:00:14 -0500 Subject: [PATCH 19/50] add catch StringIndexOutOfBoundsException during parse - minor: narrow try-catch block and add log line --- .../ml/dataquality/dqdl/parser/DQDLParser.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java index d84bf8f..7ac2dc3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java @@ -16,6 +16,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; +import lombok.extern.slf4j.Slf4j; import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.CharStreams; import 
org.antlr.v4.runtime.CommonTokenStream; @@ -24,10 +25,12 @@ import java.util.List; +@Slf4j public class DQDLParser { private static final String PARSING_ERROR_MESSAGE_PREFIX = "Parsing Error"; public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException { + CharStream input = CharStreams.fromString(dqdl); DQDLErrorListener errorListener = new DQDLErrorListener(); @@ -41,15 +44,18 @@ public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException { parser.addErrorListener(errorListener); DQDLParserListener listener = new DQDLParserListener(errorListener); - ParseTreeWalker.DEFAULT.walk(listener, parser.document()); - + try { + ParseTreeWalker.DEFAULT.walk(listener, parser.document()); + } catch (StringIndexOutOfBoundsException e) { + log.error(e.getMessage(), e); + throw new InvalidDataQualityRulesetException("Invalid DQDL."); + } Either, DQRuleset> dqRulesetEither = listener.getParsedRuleset(); - if (dqRulesetEither.isLeft()) { throw new InvalidDataQualityRulesetException(generateExceptionMessage(dqRulesetEither.getLeft())); - } else { - return dqRulesetEither.getRight(); } + return dqRulesetEither.getRight(); + } private String generateExceptionMessage(List errorMessages) { From a105b1b7e4bb77a27391aad99281cd7807301d29 Mon Sep 17 00:00:00 2001 From: Shriya Vanvari Date: Tue, 30 Jan 2024 16:33:35 -0500 Subject: [PATCH 20/50] Add NOT operator support for all types of conditions NOT support for number based condition --- configuration/dqdl/CommonLexerRules.g4 | 2 + .../dqdl/DataQualityDefinitionLanguage.g4 | 22 +++---- .../condition/date/DateBasedCondition.java | 22 ++++++- .../date/DateBasedConditionOperator.java | 5 +- .../duration/DurationBasedCondition.java | 18 +++++- .../DurationBasedConditionOperator.java | 5 +- .../number/NumberBasedCondition.java | 51 +++++++++++++--- .../number/NumberBasedConditionOperator.java | 5 +- .../string/StringBasedCondition.java | 17 +++++- .../string/StringBasedConditionOperator.java | 2 + 
.../dqdl/parser/DQDLParserListener.java | 59 ++++++++++++++----- .../ml/dataquality/dqdl/model/DQRuleTest.java | 12 ++++ .../dqdl/model/condition/ConditionTest.java | 57 +++++++++++++++--- .../date/DateBasedConditionTest.java | 50 ++++++++++++++++ .../duration/DurationBasedConditionTest.java | 32 ++++++++++ 15 files changed, 305 insertions(+), 54 deletions(-) diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index 5b219a1..5c4fc99 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -21,6 +21,8 @@ GREATER_THAN_EQUAL_TO: '>='; LESS_THAN: '<'; LESS_THAN_EQUAL_TO: '<='; IN: 'in'; +NOT: 'not'; +NEGATION: '!'; DIGIT: [0-9]; DATE: diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index a3b76d2..053041e 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -50,39 +50,39 @@ matchesRegexCondition: 'matches' quotedString; numberArray: LBRAC number (COMMA number)* RBRAC; numberBasedCondition: - BETWEEN number AND number + NOT? BETWEEN number AND number | GREATER_THAN number | GREATER_THAN_EQUAL_TO number | LESS_THAN number | LESS_THAN_EQUAL_TO number - | EQUAL_TO number - | IN numberArray; + | NEGATION? EQUAL_TO number + | NOT? IN numberArray; quotedStringArray: LBRAC quotedString (COMMA quotedString)* RBRAC; stringBasedCondition: - EQUAL_TO quotedString - | IN quotedStringArray + NEGATION? EQUAL_TO quotedString + | NOT? IN quotedStringArray | matchesRegexCondition; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; dateBasedCondition: - BETWEEN dateExpression AND dateExpression + NOT? BETWEEN dateExpression AND dateExpression | GREATER_THAN dateExpression | GREATER_THAN_EQUAL_TO dateExpression | LESS_THAN dateExpression | LESS_THAN_EQUAL_TO dateExpression - | EQUAL_TO dateExpression - | IN dateExpressionArray; + | NEGATION? 
EQUAL_TO dateExpression + | NOT? IN dateExpressionArray; durationExpressionArray: LBRAC durationExpression (COMMA durationExpression)* RBRAC; durationBasedCondition: - BETWEEN durationExpression AND durationExpression + NOT? BETWEEN durationExpression AND durationExpression | GREATER_THAN durationExpression | GREATER_THAN_EQUAL_TO durationExpression | LESS_THAN durationExpression | LESS_THAN_EQUAL_TO durationExpression - | EQUAL_TO durationExpression - | IN durationExpressionArray; + | NEGATION? EQUAL_TO durationExpression + | NOT? IN durationExpressionArray; ruleType: IDENTIFIER; analyzerType: IDENTIFIER; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java index 0439f01..fea9ffb 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java @@ -42,6 +42,11 @@ public String getFormattedCondition() { operands.get(0).getFormattedExpression(), operands.get(1).getFormattedExpression() ); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedExpression(), + operands.get(1).getFormattedExpression() + ); case GREATER_THAN: return String.format("> %s", operands.get(0).getFormattedExpression()); case GREATER_THAN_EQUAL_TO: @@ -52,16 +57,27 @@ public String getFormattedCondition() { return String.format("<= %s", operands.get(0).getFormattedExpression()); case EQUALS: return String.format("= %s", operands.get(0).getFormattedExpression()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedExpression()); case IN: { - List formattedOperands = operands.stream() - .map(DateExpression::getFormattedExpression) - .collect(Collectors.toList()); + List formattedOperands = getFormattedOperands(); return String.format("in [%s]", String.join(",", 
formattedOperands)); } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); + } default: break; } return ""; } + + private List getFormattedOperands() { + List formattedOperands = operands.stream() + .map(DateExpression::getFormattedExpression) + .collect(Collectors.toList()); + return formattedOperands; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java index 09bab98..565f771 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum DateBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java index 8629c4a..128dd7a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java @@ -40,6 +40,10 @@ public String getFormattedCondition() { return String.format("between %s and %s", operands.get(0).getFormattedDuration(), operands.get(1).getFormattedDuration()); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedDuration(), + operands.get(1).getFormattedDuration()); case GREATER_THAN: return String.format("> %s", operands.get(0).getFormattedDuration()); case GREATER_THAN_EQUAL_TO: @@ -50,16 +54,24 @@ public String getFormattedCondition() { 
return String.format("<= %s", operands.get(0).getFormattedDuration()); case EQUALS: return String.format("= %s", operands.get(0).getFormattedDuration()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedDuration()); case IN: { - List formattedOperands = operands.stream() - .map(Duration::getFormattedDuration) - .collect(Collectors.toList()); + List formattedOperands = getFormattedOperands(); return String.format("in [%s]", String.join(", ", formattedOperands)); } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(", ", formattedOperands)); + } default: break; } return ""; } + + private List getFormattedOperands() { + return operands.stream().map(Duration::getFormattedDuration).collect(Collectors.toList()); + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java index 099d410..966b432 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum DurationBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index 3dea2f1..9ef22ab 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -13,6 +13,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; 
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; +import static java.lang.Math.abs; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.extern.slf4j.Slf4j; @@ -57,6 +58,13 @@ public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator log.info("{} between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result); return result; } + case NOT_BETWEEN: + if (operands.size() != 2) return false; + else { + boolean result = metric <= operandsAsDouble.get(0) || metric >= operandsAsDouble.get(1); + log.info("{} not between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result); + return result; + } case GREATER_THAN_EQUAL_TO: if (operands.size() != 1) return false; else { @@ -88,14 +96,29 @@ public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator case EQUALS: if (operands.size() != 1) return false; else { - boolean result = metric.equals(operandsAsDouble.get(0)); + boolean result = isOperandEqualToMetric(metric, operandsAsDouble.get(0)); log.info("{} == {}? {}", formatMetric, formatOps.get(0), result); return result; } - case IN: - boolean result = operandsAsDouble.contains(metric); + case NOT_EQUALS: + if (operands.size() != 1) return false; + else { + boolean result = !isOperandEqualToMetric(metric, operandsAsDouble.get(0)); + log.info("{} != {}? {}", formatMetric, formatOps.get(0), result); + return result; + } + case IN: { + boolean result = operandsAsDouble.stream().anyMatch(operand -> + isOperandEqualToMetric(metric, operand)); log.info("{} in {}? {}", formatMetric, formatOps, result); return result; + } + case NOT_IN: { + boolean result = !operandsAsDouble.stream().anyMatch(operand -> + isOperandEqualToMetric(metric, operand)); + log.info("{} not in {}? 
{}", formatMetric, formatOps, result); + return result; + } default: log.error("Unknown operator"); return false; @@ -109,6 +132,8 @@ public String getFormattedCondition() { switch (operator) { case BETWEEN: return String.format("between %s and %s", operands.get(0).toString(), operands.get(1).toString()); + case NOT_BETWEEN: + return String.format("not between %s and %s", operands.get(0).toString(), operands.get(1).toString()); case GREATER_THAN: return String.format("> %s", operands.get(0).toString()); case GREATER_THAN_EQUAL_TO: @@ -119,16 +144,26 @@ public String getFormattedCondition() { return String.format("<= %s", operands.get(0).toString()); case EQUALS: return String.format("= %s", operands.get(0).toString()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).toString()); case IN: - return String.format("in [%s]", - operands.stream() - .map(NumericOperand::toString) - .collect(Collectors.joining(",")) - ); + return String.format("in [%s]", getFormattedOperands()); + case NOT_IN: + return String.format("not in [%s]", getFormattedOperands()); default: break; } return ""; } + + private String getFormattedOperands() { + return operands.stream() + .map(NumericOperand::toString) + .collect(Collectors.joining(",")); + } + + protected boolean isOperandEqualToMetric(Double metric, Double operand) { + return abs(metric - operand) <= 0.00001; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java index 828e5f9..cd109d3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum NumberBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, 
EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index 2bc54d0..1873666 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -41,12 +41,16 @@ public String getFormattedCondition() { return String.format("matches %s", formatOperand(operands.get(0))); case EQUALS: return String.format("= %s", formatOperand(operands.get(0))); + case NOT_EQUALS: + return String.format("!= %s", formatOperand(operands.get(0))); case IN: { - List formattedOperands = operands.stream() - .map(this::formatOperand) - .collect(Collectors.toList()); + List formattedOperands = getFormattedOperands(); return String.format("in [%s]", String.join(",", formattedOperands)); } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); + } default: break; } @@ -54,6 +58,13 @@ public String getFormattedCondition() { return ""; } + private List getFormattedOperands() { + List formattedOperands = operands.stream() + .map(this::formatOperand) + .collect(Collectors.toList()); + return formattedOperands; + } + private String formatOperand(String operand) { return "\"" + operand + "\""; } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java index f3bd814..c5ef781 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java @@ -12,6 +12,8 @@ public enum 
StringBasedConditionOperator { EQUALS, + NOT_EQUALS, IN, + NOT_IN, MATCHES } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index e3a3dcc..ce2db49 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -439,9 +439,10 @@ private Optional parseNumberBasedCondition( Optional operand2 = parseNumericOperand(ctx.number(1), false); if (operand1.isPresent() && operand2.isPresent()) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.BETWEEN, - Arrays.asList(operand1.get(), operand2.get()) - ); + NumberBasedConditionOperator op = (ctx.NOT() != null) ? + NumberBasedConditionOperator.NOT_BETWEEN + : NumberBasedConditionOperator.BETWEEN; + condition = new NumberBasedCondition(exprStr, op, Arrays.asList(operand1.get(), operand2.get())); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.number().size() == 1) { Optional operand = parseNumericOperand(ctx.number(0), false); @@ -474,9 +475,11 @@ private Optional parseNumberBasedCondition( } else if (ctx.EQUAL_TO() != null && ctx.number().size() == 1) { Optional operand = parseNumericOperand(ctx.number(0), false); if (operand.isPresent()) { + NumberBasedConditionOperator op = (ctx.NEGATION() != null) ? 
+ NumberBasedConditionOperator.NOT_EQUALS + : NumberBasedConditionOperator.EQUALS; condition = new NumberBasedCondition( - exprStr, NumberBasedConditionOperator.EQUALS, - Collections.singletonList(operand.get())); + exprStr, op, Collections.singletonList(operand.get())); } } else if (ctx.IN() != null && ctx.numberArray() != null && ctx.numberArray().number().size() > 0) { List> numbers = ctx.numberArray().number() @@ -485,8 +488,10 @@ private Optional parseNumberBasedCondition( .collect(Collectors.toList()); if (numbers.stream().allMatch(Optional::isPresent)) { - condition = new NumberBasedCondition( - exprStr, NumberBasedConditionOperator.IN, + NumberBasedConditionOperator op = (ctx.NOT() != null) ? + NumberBasedConditionOperator.NOT_IN + : NumberBasedConditionOperator.IN; + condition = new NumberBasedCondition(exprStr, op, numbers.stream().map(Optional::get).collect(Collectors.toList())); } } @@ -551,12 +556,18 @@ private Optional parseStringBasedCondition( Condition condition = null; if (ctx.EQUAL_TO() != null && ctx.quotedString() != null) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.EQUALS, + StringBasedConditionOperator op = (ctx.NEGATION() != null) ? + StringBasedConditionOperator.NOT_EQUALS + : StringBasedConditionOperator.EQUALS; + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(removeQuotes(ctx.quotedString().QUOTED_STRING().getText()))); } else if (ctx.IN() != null && ctx.quotedStringArray() != null && ctx.quotedStringArray().quotedString().size() > 0) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.IN, + StringBasedConditionOperator op = (ctx.NOT() != null) ? 
+ StringBasedConditionOperator.NOT_IN + : StringBasedConditionOperator.IN; + condition = new StringBasedCondition(exprStr, op, ctx.quotedStringArray().quotedString().stream() .map(s -> removeQuotes(removeEscapes(s.getText()))) .collect(Collectors.toList()) @@ -579,8 +590,11 @@ private Optional parseDateBasedCondition( Optional lower = parseDateExpression(ctx.dateExpression(0)); Optional upper = parseDateExpression(ctx.dateExpression(1)); if (lower.isPresent() && upper.isPresent()) { + DateBasedConditionOperator op = (ctx.NOT() != null) ? + DateBasedConditionOperator.NOT_BETWEEN + : DateBasedConditionOperator.BETWEEN; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get()) + exprStr, op, Arrays.asList(lower.get(), upper.get()) ); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) { @@ -614,8 +628,11 @@ private Optional parseDateBasedCondition( } else if (ctx.EQUAL_TO() != null && ctx.dateExpression().size() == 1) { Optional operand = parseDateExpression(ctx.dateExpression(0)); if (operand.isPresent()) { + DateBasedConditionOperator op = (ctx.NEGATION() != null) ? + DateBasedConditionOperator.NOT_EQUALS + : DateBasedConditionOperator.EQUALS; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.EQUALS, Collections.singletonList(operand.get()) + exprStr, op, Collections.singletonList(operand.get()) ); } } else if (ctx.IN() != null && @@ -626,8 +643,11 @@ private Optional parseDateBasedCondition( .collect(Collectors.toList()); if (expressions.stream().allMatch(Optional::isPresent)) { + DateBasedConditionOperator op = (ctx.NOT() != null) ? 
+ DateBasedConditionOperator.NOT_IN + : DateBasedConditionOperator.IN; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.IN, + exprStr, op, expressions.stream().map(Optional::get).collect(Collectors.toList()) ); } @@ -647,8 +667,11 @@ private Optional parseDurationBasedCondition( Optional lower = parseDuration(ctx.durationExpression(0)); Optional upper = parseDuration(ctx.durationExpression(1)); if (lower.isPresent() && upper.isPresent()) { + DurationBasedConditionOperator op = (ctx.NOT() != null) ? + DurationBasedConditionOperator.NOT_BETWEEN + : DurationBasedConditionOperator.BETWEEN; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get()) + exprStr, op, Arrays.asList(lower.get(), upper.get()) ); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.durationExpression().size() == 1) { @@ -686,8 +709,11 @@ private Optional parseDurationBasedCondition( } else if (ctx.EQUAL_TO() != null && ctx.durationExpression().size() == 1) { Optional operand = parseDuration(ctx.durationExpression(0)); if (operand.isPresent()) { + DurationBasedConditionOperator op = (ctx.NEGATION() != null) ? + DurationBasedConditionOperator.NOT_EQUALS + : DurationBasedConditionOperator.EQUALS; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.EQUALS, + exprStr, op, Collections.singletonList(operand.get()) ); } @@ -700,8 +726,11 @@ private Optional parseDurationBasedCondition( .collect(Collectors.toList()); if (durations.stream().allMatch(Optional::isPresent)) { + DurationBasedConditionOperator op = (ctx.NOT() != null) ? 
+ DurationBasedConditionOperator.NOT_IN + : DurationBasedConditionOperator.IN; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.IN, + exprStr, op, durations.stream().map(Optional::get).collect(Collectors.toList()) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 65955e8..a092e91 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -92,17 +92,21 @@ private static Stream provideRawRules() { Arguments.of("IsPrimaryKey colA \"col B\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\" \"colC\""), Arguments.of("RowCount = 100"), + Arguments.of("RowCount != 100"), Arguments.of("RowCount = -100"), Arguments.of("RowCount between (0.9 * average(last(10))) and 1.1 * average(last(10))"), + Arguments.of("RowCount not between (0.9 * average(last(10))) and 1.1 * average(last(10))"), Arguments.of("RowCountMatch \"reference\" = 1.0"), Arguments.of("RowCountMatch \"reference\" >= 0.95"), Arguments.of("RowCountMatch \"reference\" between 0.8 and 0.98"), Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"), Arguments.of("Completeness of col_1 between 0.5 and 0.8"), + Arguments.of("Completeness of col_1 not between 0.5 and 0.8"), Arguments.of("IsComplete \"col_1\""), Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"), Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), Arguments.of("ColumnDataType \"col_1\" = \"String\""), + Arguments.of("ColumnDataType \"col_1\" != \"String\""), Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9"), @@ -110,6 +114,7 @@ private static Stream provideRawRules() { 
Arguments.of("ColumnExists \"load_dt\""), Arguments.of("ColumnCount >= 100"), Arguments.of("ColumnCount = avg(std(last(10)))"), + Arguments.of("ColumnCount != avg(std(last(10)))"), Arguments.of("ColumnCount = avg(std(last(percentile(1,2,3))))"), Arguments.of("ColumnCount > -100.123456"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8"), @@ -122,7 +127,9 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\""), Arguments.of("ColumnValues \"load_dt\" > (now() - 1 days)"), Arguments.of("ColumnValues \"order-id\" in [1,2,3,4]"), + Arguments.of("ColumnValues \"order-id\" not in [1,2,3,4]"), Arguments.of("ColumnValues \"order-id\" in [\"1\",\"2\",\"3\",\"4\"]"), + Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"), Arguments.of("Sum \"col_A-B.C\" > 100.0"), Arguments.of("Sum \"col_A-B.C\" > -100.0"), Arguments.of("Mean \"col_A-B.CD\" between 10 and 20"), @@ -140,16 +147,21 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col-A\" matches \"[a-zA-Z0-9]*\""), Arguments.of("ColumnValues \"col-A\" >= now()"), Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and now()"), + Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and now()"), Arguments.of("ColumnValues \"col-A\" between now() and (now() + 3 hours)"), Arguments.of("ColumnValues \"col-A\" < (now() + 4 days)"), Arguments.of("ColumnValues \"col-A\" = (now() - 3 hours)"), + Arguments.of("ColumnValues \"col-A\" != (now() - 3 hours)"), Arguments.of("ColumnValues \"col-A\" in [now(),(now() - 3 hours),now(),(now() + 4 days)]"), + Arguments.of("ColumnValues \"col-A\" not in [now(),(now() - 3 hours),now(),(now() + 4 days)]"), Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and (now() + 14 days)"), + Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and (now() + 14 days)"), Arguments.of("ColumnValues \"col-A\" matches 
\"[a-z]*\" with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"A\",\"B\"] with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [1,2,3] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"A\" with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold between 0.4 and 0.8"), + Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold not between 0.4 and 0.8"), Arguments.of("ColumnValues \"col-A\" > 0.4 with threshold > 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\"] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = 1 with threshold > 0.98"), diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java index 56a4c3e..4d557b1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java @@ -35,32 +35,66 @@ public class ConditionTest { private static Stream provideRulesWithNumberBasedConditions() { return Stream.of( Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.5, true), + Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.4, false), Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.3, false), + Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.9, false), Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.91, false), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.5, false), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.4, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.3, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.9, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.91, true), + Arguments.of("Completeness \"colA\" not in [0.4, 0.9]", 0.91, true), Arguments.of("ColumnCorrelation \"colA\" 
\"colB\" between -0.2 and 1.0", 0.9, true), Arguments.of("ColumnCorrelation \"colA\" \"colB\" between -0.2 and 1.0", -0.19, true), Arguments.of("ColumnCorrelation \"colA\" \"colB\" between -0.2 and 1.0", -0.2001, false), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.2, true), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.19, false), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.21, true), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not in [-0.2, 1.0]", -0.2001, true), + Arguments.of("ColumnLength \"colA\" in [1, 2, 3]", 4.0, false), + Arguments.of("ColumnLength \"colA\" not in [1, 2, 3]", 4.0, true), + Arguments.of("ColumnLength \"colA\" in [1, 2, 3, 4]", 4.0, true), + Arguments.of("ColumnLength \"colA\" not in [1, 2, 3, 4]", 4.0, false), + Arguments.of("ColumnValues \"colA\" in [1, 2, 3, 4]", 3.999999, true), + Arguments.of("ColumnValues \"colA\" in [1, 2, 3, 4]", 3.999, false), Arguments.of("Completeness \"colA\" >= 0.4", 0.4, true), Arguments.of("Completeness \"colA\" >= 0.4", 0.39, false), Arguments.of("Completeness \"colA\" >= 0.4", 1.0, true), + Arguments.of("DatasetMatch \"reference\" \"colA\" not between 0.1 and 1.0", 1.0, true), + Arguments.of("DistinctValuesCount \"colA\" not between 0.1 and 1.0", 1.0, true), + Arguments.of("Entropy \"colA\" <= 0.678", 0.679, false), + Arguments.of("Entropy \"colA\" <= 0.678", 0.677, true), + Arguments.of("Entropy \"colA\" <= 0.678", -0.1, true), + Arguments.of("Entropy \"colA\" != 0.678", 0.678, false), + Arguments.of("Mean \"colA\" != 10.0", 10.0, false), + Arguments.of("RowCount != 10.0", 10.0, false), + Arguments.of("Mean \"colA\" != 10.0", 10.0, false), + Arguments.of("StandardDeviation \"colA\" = 10.0", 10.0, true), + Arguments.of("StandardDeviation \"colA\" = -10000.0", -10000.0, true), + Arguments.of("StandardDeviation \"colA\" = 99.34", 99.35, false), + Arguments.of("StandardDeviation \"colA\" != 
10.0", 10.0, false), + Arguments.of("StandardDeviation \"colA\" != -10000.0", -10000.0, false), + Arguments.of("StandardDeviation \"colA\" != 99.34", 99.35, true), + Arguments.of("Sum \"colA\" not in [5.0, 10.0]", 10.0, false), Arguments.of("Uniqueness \"colA\" > 0.4", 0.41, true), Arguments.of("Uniqueness \"colA\" > 0.4", 0.4, false), Arguments.of("Uniqueness \"colA\" > 0.4", -0.4, false), + Arguments.of("Uniqueness \"colA\" != 0.4", -0.4, true), + Arguments.of("Uniqueness \"colA\" not between 0.1 and 0.5", 0.5, true), + Arguments.of("Uniqueness \"colA\" not in [0.1, 0.1, 0.5]", 0.3, true), Arguments.of("UniqueValueRatio \"colA\" < -0.4", 100.9, false), Arguments.of("UniqueValueRatio \"colA\" < -0.4", -0.5, true), Arguments.of("UniqueValueRatio \"colA\" < -0.4", -0.41, true), - Arguments.of("Entropy \"colA\" <= 0.678", 0.679, false), - Arguments.of("Entropy \"colA\" <= 0.678", 0.677, true), - Arguments.of("Entropy \"colA\" <= 0.678", -0.1, true), - Arguments.of("StandardDeviation \"colA\" = 10.0", 10.0, true), - Arguments.of("StandardDeviation \"colA\" = -10000.0", -10000.0, true), - Arguments.of("StandardDeviation \"colA\" = 99.34", 99.35, false) + Arguments.of("UniqueValueRatio \"colA\" not between -0.5 and -0.4", -0.41, false), + Arguments.of("UniqueValueRatio \"colA\" not between -0.4 and -0.5", -0.41, true) ); } private static Stream provideRulesWithNumberBasedThresholdConditions() { return Stream.of( Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold between 0.4 and 0.9", 0.5, true), + Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold not between 0.4 and 0.9", 0.5, false), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold > 0.6", 0.59, false), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold >= 0.5", 0.5, true), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold < 0.333", 0.334, false), @@ -72,6 +106,7 @@ private static Stream 
provideRulesWithNumberBasedThresholdConditions( Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold < 0.333", 0.332, true), Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold <= 0.333", 0.3, true), Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold = 0.2", 0.2, true), + Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold != 0.2", 0.2, false), Arguments.of("ColumnValues \"Customer_ID\" in [1,2,3,4,5,6,7,8,9] with threshold > 0.98", 0.979, false) ); } @@ -80,29 +115,35 @@ private static Stream provideRulesWithDateBasedThresholdConditions() return Stream.of( // With static dates Arguments.of("ColumnValues \"colA\" in [ \"2022-01-01\", \"2022-12-31\" ]"), + Arguments.of("ColumnValues \"colA\" not in [ \"2022-01-01\", \"2022-12-31\" ]"), Arguments.of("ColumnValues \"colA\" >= \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" > \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" <= \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" < \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" between \"2022-01-01\" and \"2022-12-31\""), + Arguments.of("ColumnValues \"colA\" not between \"2022-01-01\" and \"2022-12-31\""), // With dynamic expressions Arguments.of("ColumnValues \"colA\" in [ (now() - 14 days), (now() - 7 days), \"2022-01-01\" ]"), + Arguments.of("ColumnValues \"colA\" not in [ (now() - 14 days), (now() - 7 days), \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" >= now() ]"), Arguments.of("ColumnValues \"colA\" > (now() - 12 hours) ]"), Arguments.of("ColumnValues \"colA\" <= (now() + 3 days) ]"), Arguments.of("ColumnValues \"colA\" < (now() + 72 hours) ]"), - Arguments.of("ColumnValues \"colA\" between (now() - 14 days) and now()") + Arguments.of("ColumnValues \"colA\" between (now() - 14 days) and now()"), + Arguments.of("ColumnValues \"colA\" not between (now() - 14 days) and now()") ); } private static Stream 
provideRulesWithDurationBasedThresholdConditions() { return Stream.of( Arguments.of("DataFreshness \"colA\" in [ 3 hours, 12 hours, 1 days ]"), + Arguments.of("DataFreshness \"colA\" not in [ 3 hours, 12 hours, 1 days ]"), Arguments.of("DataFreshness \"colA\" >= 12 hours"), Arguments.of("DataFreshness \"colA\" > 2 days"), Arguments.of("DataFreshness \"colA\" <= 2 hours"), Arguments.of("DataFreshness \"colA\" < 6 hours"), - Arguments.of("DataFreshness \"colA\" between 6 hours and 12 hours") + Arguments.of("DataFreshness \"colA\" between 6 hours and 12 hours"), + Arguments.of("DataFreshness \"colA\" not between 6 hours and 12 hours") ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java index e5b332d..6283b7c 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java @@ -35,6 +35,16 @@ private static Stream provideDateBasedConditionsWithExpectedFormatted ), "between \"2023-01-01\" and \"2023-12-31\"" ), + Arguments.of( + new DateBasedCondition( + "notbetween\"2023-01-01\"and\"2023-12-31\"", + DateBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), new DateExpression.StaticDate("2023-12-31") + ) + ), + "not between \"2023-01-01\" and \"2023-12-31\"" + ), Arguments.of( new DateBasedCondition( "between(now()-4days)and(now()+72hours)", @@ -50,6 +60,21 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ), "between (now() - 4 days) and (now() + 72 hours)" ), + Arguments.of( + new DateBasedCondition( + "notbetween(now()-4days)and(now()+72hours)", + DateBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.MINUS,new 
Duration(4, DurationUnit.DAYS) + ), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) + ) + ) + ), + "not between (now() - 4 days) and (now() + 72 hours)" + ), Arguments.of( new DateBasedCondition( ">\"2023-01-01\"", @@ -130,6 +155,14 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) ), "= \"2023-01-01\"" ), + Arguments.of( + new DateBasedCondition( + "!=\"2023-01-01\"", + DateBasedConditionOperator.NOT_EQUALS, + Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) + ), + "!= \"2023-01-01\"" + ), Arguments.of( new DateBasedCondition( ">=(now()-2days)", @@ -158,6 +191,23 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ), "in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]" + ), + Arguments.of( + new DateBasedCondition( + "notin[\"2023-01-01\",now(),(now()-2days),(now()+72hours)]", + DateBasedConditionOperator.NOT_IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.CurrentDate(), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) + ), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) + ) + ) + ), + "not in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]" ) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java index 98d467e..08b5319 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java @@ -34,6 +34,17 @@ private static Stream provideDurationConditionsWithExpectedFormattedS 
), "between 3 hours and 4 days" ), + Arguments.of( + new DurationBasedCondition( + "notbetween3hoursand4days", + DurationBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new Duration(3, DurationUnit.HOURS), + new Duration(4, DurationUnit.DAYS) + ) + ), + "not between 3 hours and 4 days" + ), Arguments.of( new DurationBasedCondition( ">256hours", @@ -74,6 +85,14 @@ private static Stream provideDurationConditionsWithExpectedFormattedS ), "= 10 days" ), + Arguments.of( + new DurationBasedCondition( + "!=10days", + DurationBasedConditionOperator.NOT_EQUALS, + Collections.singletonList(new Duration(10, DurationUnit.DAYS)) + ), + "!= 10 days" + ), Arguments.of( new DurationBasedCondition( "in[3hours,4days,96hours,7days]", @@ -86,6 +105,19 @@ private static Stream provideDurationConditionsWithExpectedFormattedS ) ), "in [3 hours, 4 days, 96 hours, 7 days]" + ), + Arguments.of( + new DurationBasedCondition( + "notin[3hours,4days,96hours,7days]", + DurationBasedConditionOperator.NOT_IN, + Arrays.asList( + new Duration(3, DurationUnit.HOURS), + new Duration(4, DurationUnit.DAYS), + new Duration(96, DurationUnit.HOURS), + new Duration(7, DurationUnit.DAYS) + ) + ), + "not in [3 hours, 4 days, 96 hours, 7 days]" ) ); } From 01c67b4ef61b19d4da5eae0622deea396707e37d Mon Sep 17 00:00:00 2001 From: Dongying Song Date: Wed, 21 Feb 2024 11:49:15 -0500 Subject: [PATCH 21/50] Add row-level evaluation option for composite rules --- configuration/rules/rules-config.json | 13 ++-- .../ml/dataquality/dqdl/model/DQRule.java | 4 +- .../ml/dataquality/dqdl/model/DQRuleType.java | 8 +-- .../dqdl/model/DeserializationTest.java | 59 +++++++++++++++++-- 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 62cd1b8..416784f 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -38,7 +38,7 @@ } ], "return_type": "NUMBER", - 
"is_composite_rule_evaluation_row_level_supported": false, + "is_excluded_at_row_level_in_composite_rules": true, "scope": "column" }, { @@ -66,7 +66,6 @@ ], "return_type": "STRING", "is_threshold_supported": true, - "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -124,7 +123,7 @@ } ], "return_type": "NUMBER", - "is_composite_rule_evaluation_row_level_supported": false, + "is_excluded_at_row_level_in_composite_rules": true, "scope": "column" }, { @@ -177,7 +176,7 @@ } ], "return_type": "NUMBER", - "is_composite_rule_evaluation_row_level_supported": false, + "is_excluded_at_row_level_in_composite_rules": true, "scope": "column" }, { @@ -258,7 +257,6 @@ ], "return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY", "is_threshold_supported": true, - "is_composite_rule_evaluation_row_level_supported": false, "scope": "column" }, { @@ -286,7 +284,6 @@ ], "return_type": "NUMBER|BOOLEAN", "is_threshold_supported": true, - "is_composite_rule_evaluation_row_level_supported": false, "scope": "table" }, { @@ -305,7 +302,7 @@ } ], "return_type": "NUMBER", - "is_composite_rule_evaluation_row_level_supported": false, + "is_excluded_at_row_level_in_composite_rules": true, "scope": "table" }, { @@ -324,7 +321,7 @@ } ], "return_type": "NUMBER", - "is_composite_rule_evaluation_row_level_supported": false, + "is_excluded_at_row_level_in_composite_rules": true, "scope": "table" }, { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 91e3554..9c8c3e4 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -35,7 +35,7 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final DQRuleLogicalOperator operator; private final List nestedRules; private final String whereClause; - private Boolean isCompositeRuleEvaluationRowLevelSupported = true; + private 
Boolean isExcludedRowLevelInCompositeRules = false; // Adding this constructor so as to not break the Data Quality ETL package. public DQRule(final String ruleType, @@ -123,7 +123,7 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, operator, nestedRules, whereClause, - ruleType.isCompositeRuleEvaluationRowLevelSupported() + ruleType.isExcludedRowLevelInCompositeRules() ); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index e206ab8..dfa0a7b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -32,7 +32,7 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; - private final boolean isCompositeRuleEvaluationRowLevelSupported; + private final boolean isExcludedRowLevelInCompositeRules; private final boolean isAnalyzerOnly; private final String scope; private final boolean isExperimental; @@ -44,8 +44,8 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, - @JsonProperty(value = "is_composite_rule_evaluation_row_level_supported") - boolean isCompositeRuleEvaluationRowLevelSupported, + @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") + boolean isExcludedRowLevelInCompositeRules, @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, @JsonProperty(value = "scope") String scope, @JsonProperty(value = "experimental") boolean isExperimental) { @@ -54,7 +54,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = 
isThresholdSupported; - this.isCompositeRuleEvaluationRowLevelSupported = isCompositeRuleEvaluationRowLevelSupported; + this.isExcludedRowLevelInCompositeRules = isExcludedRowLevelInCompositeRules; this.isAnalyzerOnly = isAnalyzerOnly; this.scope = scope; this.isExperimental = isExperimental; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java index ce14834..cca7786 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java @@ -71,12 +71,12 @@ public void test_parseRuleParameterWithoutIsVarArg() throws JsonProcessingExcept } @Test - public void test_parseDQRuleType() throws JsonProcessingException { + public void test_parseDQRuleTypeWithThresholdAndRowLevelFlags() throws JsonProcessingException { String ruleTypeName = "DatasetMatch"; String ruleTypeDesc = "This rule matches two datasets"; String returnType = "STRING"; boolean isThresholdSupported = true; - boolean isCompositeRuleEvaluationRowLevelSupported = false; + boolean isExcludedRowLevelInCompositeRules = true; // Parameter 1 String param1Type = "String"; @@ -100,17 +100,66 @@ public void test_parseDQRuleType() throws JsonProcessingException { "\"parameters\": [ %s, %s ]," + "\"return_type\": \"%s\"," + "\"is_threshold_supported\": \"%s\"," + - "\"is_composite_rule_evaluation_row_level_supported\": %s" + + "\"is_excluded_at_row_level_in_composite_rules\": %s" + "}", ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported, - isCompositeRuleEvaluationRowLevelSupported); + isExcludedRowLevelInCompositeRules); DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); assertEquals(ruleTypeName, ruleType.getRuleTypeName()); assertEquals(ruleTypeDesc, ruleType.getDescription()); assertEquals(returnType, ruleType.getReturnType()); 
assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); - assertEquals(isCompositeRuleEvaluationRowLevelSupported, ruleType.isCompositeRuleEvaluationRowLevelSupported()); + assertEquals(isExcludedRowLevelInCompositeRules, ruleType.isExcludedRowLevelInCompositeRules()); + + DQRuleParameter param1 = ruleType.getParameters().get(0); + assertEquals(param1Type, param1.getType()); + assertEquals(param1Name, param1.getName()); + assertEquals(param1Desc, param1.getDescription()); + + DQRuleParameter param2 = ruleType.getParameters().get(1); + assertEquals(param2Type, param2.getType()); + assertEquals(param2Name, param2.getName()); + assertEquals(param2Desc, param2.getDescription()); + } + + @Test + public void test_parseDQRuleTypeWithNoThresholdAndRowLevelFlags() throws JsonProcessingException { + String ruleTypeName = "DatasetMatch"; + String ruleTypeDesc = "This rule matches two datasets"; + String returnType = "STRING"; + boolean isThresholdSupported = false; + boolean isExcludedRowLevelInCompositeRules = false; + + // Parameter 1 + String param1Type = "String"; + String param1Name = "PrimaryDatasetAlias"; + String param1Desc = "This is the primary dataset alias"; + String param1Json = String.format( + "{\"type\":\"%s\",\"name\":\"%s\",\"description\":\"%s\"}", param1Type, param1Name, param1Desc); + + // Parameter2 + String param2Type = "String"; + String param2Name = "ReferenceDatasetAlias"; + String param2Desc = "This is the reference dataset alias"; + + String param2Json = String.format( + "{\"type\":\"%s\",\"name\":\"%s\",\"description\":\"%s\"}", param2Type, param2Name, param2Desc); + + String json = String.format( + "{" + + "\"rule_type_name\":\"%s\"," + + "\"description\":\"%s\"," + + "\"parameters\": [ %s, %s ]," + + "\"return_type\": \"%s\"" + + "}", ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType); + + DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); + assertEquals(ruleTypeName, 
ruleType.getRuleTypeName()); + assertEquals(ruleTypeDesc, ruleType.getDescription()); + assertEquals(returnType, ruleType.getReturnType()); + assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); + assertEquals(isExcludedRowLevelInCompositeRules, ruleType.isExcludedRowLevelInCompositeRules()); DQRuleParameter param1 = ruleType.getParameters().get(0); assertEquals(param1Type, param1.getType()); From 8abd19b2c2657f0318dcd8f018e4acc753ee33c6 Mon Sep 17 00:00:00 2001 From: Yannis Mentekidis Date: Tue, 13 Feb 2024 15:27:15 -0500 Subject: [PATCH 22/50] All Conditions implement evaluate() - Add helper functions to copy and modify a rule - Add more tests to Condition and DQRule --- .../ml/dataquality/dqdl/model/DQRule.java | 11 +++++ .../dqdl/model/condition/Condition.java | 6 +++ .../number/NumberBasedCondition.java | 1 + .../ml/dataquality/dqdl/model/DQRuleTest.java | 45 +++++++++++++++++++ .../dqdl/model/condition/ConditionTest.java | 8 ++++ 5 files changed, 71 insertions(+) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 9c8c3e4..bb32810 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -11,7 +11,9 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import lombok.AccessLevel; import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -26,6 +28,7 @@ @AllArgsConstructor @Getter @EqualsAndHashCode +@Builder(toBuilder = true, access = AccessLevel.PRIVATE) public class DQRule implements Serializable, HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; @@ -127,6 +130,14 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, ); } + public DQRule withNestedRules(final List 
nestedRules) { + return this.toBuilder().nestedRules(nestedRules).build(); + } + + public DQRule withCondition(final Condition condition) { + return this.toBuilder().condition(condition).build(); + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java index dc59445..17401f5 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java @@ -10,6 +10,8 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.OperandEvaluator; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -27,4 +29,8 @@ public Condition(final String conditionAsString) { public String getFormattedCondition() { return this.conditionAsString; } + + public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { + throw new UnsupportedOperationException(); + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index 9ef22ab..cdee464 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -39,6 +39,7 @@ public NumberBasedCondition(final String conditionAsString, this.operands = operands; } + @Override public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { if (operands == null) return false; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index a092e91..50f2930 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -496,6 +496,51 @@ void test_constructorWithParametersAndCondition() { assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); } + @Test + void test_modifyNestedRules() throws InvalidDataQualityRulesetException { + String rule1 = "IsComplete \"name\""; + String rule2 = "IsUnique \"name\""; + String rule3 = "IsPrimaryKey \"name\""; + String ruleset = String.format("Rules = [" + + "(%s) AND (%s)," + + "%s ]", rule1, rule2, rule3); + DQRuleset dqRuleset = parser.parse(ruleset); + + DQRule composite = dqRuleset.getRules().get(0); + + // Copy the list's elements into a new list, without copying the list itself + List nested = new ArrayList<>(composite.getNestedRules()); + nested.add(dqRuleset.getRules().get(1)); // IsComplete AND IsUnique AND IsPrimaryKey + + DQRule modified = composite.withNestedRules(nested); + + // The original rule hasn't been modified + assertEquals(composite.toString(), "(IsComplete \"name\") AND (IsUnique \"name\")"); + + // The modified rule includes all subrules + assertEquals(modified.toString(), "(IsComplete \"name\") AND (IsUnique \"name\") AND (IsPrimaryKey \"name\")"); + assertEquals(modified.getNestedRules().size(), 3); + } + + @Test + void test_withCondition() throws InvalidDataQualityRulesetException { + DQRuleset ruleset = parser.parse("Rules = [RowCount > 20, RowCount > 10 + 10]"); + + DQRule simple = ruleset.getRules().get(0); + DQRule dynamic = ruleset.getRules().get(1); + + Condition simplified = simple.getCondition(); + assertEquals(simplified.getFormattedCondition(), "> 20"); + + DQRule modified = dynamic.withCondition(simplified); + + // The original rule hasn't been modified + assertEquals(dynamic.toString(), "RowCount > 10 + 10"); + + // The modified rule uses the 
simplified condition + assertEquals(modified.toString(), "RowCount > 20"); + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java index 4d557b1..fc781f6 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java @@ -197,6 +197,14 @@ void test_ruleParsingAndVerifyingDateBasedCondition(String rule) { DateBasedCondition condition = (DateBasedCondition) dqRule.getCondition(); assertTrue(dqRule.toString().contains(condition.getFormattedCondition())); + + try { + condition.evaluate(0.0, dqRule, testEvaluator); + fail("Expected date condition to throw UnsupportedOperationException"); + } catch (UnsupportedOperationException e) { + // pass + } + } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } From 7d7067eee4f1842d1f1b642bf4ea4d889ca82b53 Mon Sep 17 00:00:00 2001 From: Shriya Vanvari Date: Tue, 20 Feb 2024 23:01:25 -0500 Subject: [PATCH 23/50] Support for special keywords --- configuration/dqdl/CommonLexerRules.g4 | 3 + .../dqdl/DataQualityDefinitionLanguage.g4 | 7 +- .../dqdl/model/condition/string/Keyword.java | 17 ++++ .../string/KeywordStringOperand.java | 28 ++++++ .../condition/string/QuotedStringOperand.java | 22 +++++ .../string/StringBasedCondition.java | 16 ++-- .../model/condition/string/StringOperand.java | 28 ++++++ .../dqdl/parser/DQDLParserListener.java | 95 +++++++++++++++++-- .../ml/dataquality/dqdl/model/DQRuleTest.java | 37 ++++++-- 9 files changed, 224 insertions(+), 29 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java create mode 100644 
src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index 5c4fc99..f4ee958 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -13,6 +13,9 @@ RPAREN: ')'; AND: 'and' | 'AND'; OR: 'or' | 'OR'; OF: 'of' | 'OF'; +NULL: 'null' | 'NULL'; +EMPTY: 'empty' | 'EMPTY'; +WHITESPACES_ONLY: 'whitespaces_only' | 'WHITESPACES_ONLY'; BETWEEN: 'between'; EQUAL_TO: '='; diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 053041e..5754774 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -58,10 +58,11 @@ numberBasedCondition: | NEGATION? EQUAL_TO number | NOT? IN numberArray; -quotedStringArray: LBRAC quotedString (COMMA quotedString)* RBRAC; +stringValues : quotedString | NULL | EMPTY | WHITESPACES_ONLY; +stringValuesArray: LBRAC stringValues (COMMA stringValues)* RBRAC; stringBasedCondition: - NEGATION? EQUAL_TO quotedString - | NOT? IN quotedStringArray + NEGATION? EQUAL_TO stringValues + | NOT? IN stringValuesArray | matchesRegexCondition; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java new file mode 100644 index 0000000..1969c8e --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java @@ -0,0 +1,17 @@ +/* + * Keyword.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +public enum Keyword { + NULL, + EMPTY, + WHITESPACES_ONLY +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java new file mode 100644 index 0000000..23431ba --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java @@ -0,0 +1,28 @@ +/* + * KeywordStringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode(callSuper = true) +public class KeywordStringOperand extends StringOperand { + final Keyword operand; + + public KeywordStringOperand(final Keyword operand) { + super(operand.toString()); + this.operand = operand; + } + + @Override + public String formatOperand() { + return getOperand().toString(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java new file mode 100644 index 0000000..dcf74f3 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java @@ -0,0 +1,22 @@ +/* + * QuotedStringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +public class QuotedStringOperand extends StringOperand { + public QuotedStringOperand(final String operand) { + super(operand); + } + + @Override + public String formatOperand() { + return "\"" + getOperand() + "\""; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index 1873666..2f4f105 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -22,11 +22,11 @@ @EqualsAndHashCode(callSuper = true) public class StringBasedCondition extends Condition { private final StringBasedConditionOperator operator; - private final List operands; + private final List operands; public StringBasedCondition(final String conditionAsString, final StringBasedConditionOperator operator, - final List operands) { + final List operands) { super(conditionAsString); this.operator = operator; this.operands = operands; @@ -38,11 +38,11 @@ public String getFormattedCondition() { switch (operator) { case MATCHES: - return String.format("matches %s", formatOperand(operands.get(0))); + return String.format("matches %s", operands.get(0).formatOperand()); case EQUALS: - return String.format("= %s", formatOperand(operands.get(0))); + return String.format("= %s", operands.get(0).formatOperand()); case NOT_EQUALS: - return String.format("!= %s", formatOperand(operands.get(0))); + return String.format("!= %s", operands.get(0).formatOperand()); case IN: { List formattedOperands = getFormattedOperands(); return String.format("in [%s]", String.join(",", formattedOperands)); @@ -60,12 +60,8 @@ public String getFormattedCondition() { private List getFormattedOperands() { List formattedOperands = operands.stream() - 
.map(this::formatOperand) + .map(StringOperand::formatOperand) .collect(Collectors.toList()); return formattedOperands; } - - private String formatOperand(String operand) { - return "\"" + operand + "\""; - } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java new file mode 100644 index 0000000..e121197 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java @@ -0,0 +1,28 @@ +/* + * StringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@EqualsAndHashCode +@Getter +public abstract class StringOperand implements Serializable { + private final String operand; + + public StringOperand(final String operand) { + this.operand = operand; + } + + public abstract String formatOperand(); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index ce2db49..f453454 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -28,14 +28,20 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; 
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -555,31 +561,71 @@ private Optional parseStringBasedCondition( String exprStr = ctx.getText(); Condition condition = null; - if (ctx.EQUAL_TO() != null && ctx.quotedString() != null) { + if (ctx.EQUAL_TO() != null && ctx.stringValues() != null) { StringBasedConditionOperator op = (ctx.NEGATION() != null) ? StringBasedConditionOperator.NOT_EQUALS : StringBasedConditionOperator.EQUALS; - condition = new StringBasedCondition(exprStr, op, - Collections.singletonList(removeQuotes(ctx.quotedString().QUOTED_STRING().getText()))); + Optional operand = parseStringOperand(ctx, Optional.of(ctx.stringValues()), op); + if (operand.isPresent()) { + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get())); + } } else if (ctx.IN() != null && - ctx.quotedStringArray() != null && - ctx.quotedStringArray().quotedString().size() > 0) { + ctx.stringValuesArray() != null && + ctx.stringValuesArray().stringValues().size() > 0) { StringBasedConditionOperator op = (ctx.NOT() != null) ? 
StringBasedConditionOperator.NOT_IN : StringBasedConditionOperator.IN; + List> operands = ctx.stringValuesArray().stringValues() + .stream() + .map(s -> parseStringOperand(ctx, Optional.of(s), op)) + .collect(Collectors.toList()); + condition = new StringBasedCondition(exprStr, op, - ctx.quotedStringArray().quotedString().stream() - .map(s -> removeQuotes(removeEscapes(s.getText()))) - .collect(Collectors.toList()) + operands.stream().map(Optional::get).collect(Collectors.toList()) ); } else if (ctx.matchesRegexCondition() != null) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.MATCHES, - Collections.singletonList(removeQuotes(ctx.matchesRegexCondition().quotedString().getText()))); + StringBasedConditionOperator op = StringBasedConditionOperator.MATCHES; + Optional operand = parseStringOperand(ctx, Optional.ofNullable(ctx.stringValues()), op); + if (operand.isPresent()) { + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get())); + } } return Optional.ofNullable(condition); } + private Optional parseStringOperand( + DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx, + Optional + stringValuesContext, StringBasedConditionOperator op) { + + switch (op) { + case NOT_EQUALS: + case EQUALS: + Keyword keyword = parseKeyword(stringValuesContext.get()); + if (keyword == null) { + return Optional.of(new QuotedStringOperand( + removeQuotes(stringValuesContext.get().quotedString().getText()))); + } else { + return Optional.of(new KeywordStringOperand(keyword)); + } + case NOT_IN: + case IN: + keyword = parseKeyword(stringValuesContext.get()); + if (keyword == null) { + return Optional.of(new QuotedStringOperand( + removeQuotes(removeEscapes(stringValuesContext.get().quotedString().getText())))); + } else { + return Optional.of(new KeywordStringOperand(keyword)); + } + case MATCHES: + return Optional.of(new QuotedStringOperand( + 
removeQuotes(ctx.matchesRegexCondition().quotedString().getText()))); + default: + return Optional.empty(); + } + } + private Optional parseDateBasedCondition( DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx) { @@ -808,4 +854,33 @@ private List validateDictionary(DataQualityDefinitionLanguageParser.Dict } return dictionaryErrors; } + + private Keyword parseKeyword( + DataQualityDefinitionLanguageParser.StringValuesContext stringValuesContext) { + Keyword keyword = null; + try { + String operand = stringValuesContext.getText().toUpperCase(); + if (isValidEnumValue(operand)) { + Method method = stringValuesContext.getClass().getMethod(operand); + Object result = method.invoke(stringValuesContext); + if (result != null) { + keyword = Keyword.valueOf(operand); + } + } + } catch (IllegalArgumentException | IllegalAccessException | NoSuchMethodException | + InvocationTargetException e) { + errorMessages.add(e.getMessage()); + } + return keyword; + } + + private boolean isValidEnumValue(String value) { + try { + Enum.valueOf(Keyword.class, value); + return true; + } catch (IllegalArgumentException e) { + return false; + } + } + } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 50f2930..47c87c2 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -16,6 +16,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -107,6 +108,7 
@@ private static Stream provideRawRules() { Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), Arguments.of("ColumnDataType \"col_1\" = \"String\""), Arguments.of("ColumnDataType \"col_1\" != \"String\""), + Arguments.of("ColumnDataType \"col_2\" = \"Integer\""), Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9"), @@ -160,6 +162,19 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col-A\" in [\"A\",\"B\"] with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [1,2,3] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"A\" with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" = NULL"), + Arguments.of("ColumnValues \"col-A\" = EMPTY"), + Arguments.of("ColumnValues \"col-A\" = WHITESPACES_ONLY"), + Arguments.of("ColumnValues \"col-A\" != NULL"), + Arguments.of("ColumnValues \"col-A\" != EMPTY"), + Arguments.of("ColumnValues \"col-A\" != WHITESPACES_ONLY"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" not in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL,EMPTY,WHITESPACES_ONLY]"), + Arguments.of("ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]"), + Arguments.of("(ColumnValues \"col-A\" not in [NULL,EMPTY,WHITESPACES_ONLY]) OR (ColumnValues \"col-B\" != WHITESPACES_ONLY)"), + Arguments.of("(ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]) AND (ColumnValues \"col-B\" != WHITESPACES_ONLY)"), Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold between 0.4 and 0.8"), Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold not between 0.4 and 0.8"), Arguments.of("ColumnValues \"col-A\" > 0.4 with threshold > 0.4"), @@ -231,10 
+246,9 @@ void test_setExpressionContainsRuleContainingRule() throws InvalidDataQualityRul assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); + List stringList = constructOperandsAsStringList(dqRule); assertEquals( - Collections.singletonList("ColumnValues in [ \"col-A\" ]"), - ((StringBasedCondition) dqRule.getCondition()).getOperands() - ); + Collections.singletonList("ColumnValues in [ \"col-A\" ]"), stringList); } @Test @@ -259,7 +273,8 @@ void test_setExpressionContainsItemContainingEscapedQuotes() throws InvalidDataQ assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("a\"b", "c", "d\"e"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + assertEquals(Arrays.asList("a\"b", "c", "d\"e"), stringList); } @Test @@ -268,7 +283,8 @@ void test_setExpressionContainsItemContainingCommas() throws InvalidDataQualityR assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("a,,b", "c", "d,,,e"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + assertEquals(Arrays.asList("a,,b", "c", "d,,,e"), stringList); } @Test @@ -278,13 +294,22 @@ void test_serializationDeserializationWithExpressionFieldSet() assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("A", "B"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + 
assertEquals(Arrays.asList("A", "B"), stringList); byte[] serialized = serialize(dqRule); DQRule deserialized = deserialize(serialized, DQRule.class); assertEquals(dqRule.toString(), deserialized.toString()); assertEquals(StringBasedCondition.class, deserialized.getCondition().getClass()); } + private static List constructOperandsAsStringList(DQRule dqRule) { + List stringOperandsList = ((StringBasedCondition) dqRule.getCondition()).getOperands(); + List stringList = stringOperandsList.stream() + .map(StringOperand::getOperand) + .collect(Collectors.toList()); + return stringList; + } + @Test void test_serializationDeserializationWithNumericExpression() throws InvalidDataQualityRulesetException, IOException, ClassNotFoundException { From fd87cfa72bfba1ad66d9c665871981d0e3c8981a Mon Sep 17 00:00:00 2001 From: Dongying Song Date: Fri, 23 Feb 2024 11:32:40 -0500 Subject: [PATCH 24/50] Add util renameColumnsFromHashToRuleNames for debugging purposes and misc --- configuration/rules/rules-config.json | 1 - .../glue/ml/dataquality/dqdl/model/DQRule.java | 4 ++-- .../glue/ml/dataquality/dqdl/model/DQRuleType.java | 6 +++--- .../ml/dataquality/dqdl/model/DeserializationTest.java | 10 +++++----- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 416784f..5e59cd4 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -176,7 +176,6 @@ } ], "return_type": "NUMBER", - "is_excluded_at_row_level_in_composite_rules": true, "scope": "column" }, { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index bb32810..d387551 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -38,7 +38,7 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final 
DQRuleLogicalOperator operator; private final List nestedRules; private final String whereClause; - private Boolean isExcludedRowLevelInCompositeRules = false; + private Boolean isExcludedAtRowLevelInCompositeRules = false; // Adding this constructor so as to not break the Data Quality ETL package. public DQRule(final String ruleType, @@ -126,7 +126,7 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, operator, nestedRules, whereClause, - ruleType.isExcludedRowLevelInCompositeRules() + ruleType.isExcludedAtRowLevelInCompositeRules() ); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index dfa0a7b..ec204f5 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -32,7 +32,7 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; - private final boolean isExcludedRowLevelInCompositeRules; + private final boolean isExcludedAtRowLevelInCompositeRules; private final boolean isAnalyzerOnly; private final String scope; private final boolean isExperimental; @@ -45,7 +45,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") - boolean isExcludedRowLevelInCompositeRules, + boolean isExcludedAtRowLevelInCompositeRules, @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, @JsonProperty(value = "scope") String scope, @JsonProperty(value = "experimental") boolean isExperimental) { @@ -54,7 +54,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.parameters = parameters; this.returnType = returnType; 
this.isThresholdSupported = isThresholdSupported; - this.isExcludedRowLevelInCompositeRules = isExcludedRowLevelInCompositeRules; + this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules; this.isAnalyzerOnly = isAnalyzerOnly; this.scope = scope; this.isExperimental = isExperimental; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java index cca7786..1b4bfb1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java @@ -76,7 +76,7 @@ public void test_parseDQRuleTypeWithThresholdAndRowLevelFlags() throws JsonProce String ruleTypeDesc = "This rule matches two datasets"; String returnType = "STRING"; boolean isThresholdSupported = true; - boolean isExcludedRowLevelInCompositeRules = true; + boolean isExcludedAtRowLevelInCompositeRules = true; // Parameter 1 String param1Type = "String"; @@ -103,14 +103,14 @@ public void test_parseDQRuleTypeWithThresholdAndRowLevelFlags() throws JsonProce "\"is_excluded_at_row_level_in_composite_rules\": %s" + "}", ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported, - isExcludedRowLevelInCompositeRules); + isExcludedAtRowLevelInCompositeRules); DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); assertEquals(ruleTypeName, ruleType.getRuleTypeName()); assertEquals(ruleTypeDesc, ruleType.getDescription()); assertEquals(returnType, ruleType.getReturnType()); assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); - assertEquals(isExcludedRowLevelInCompositeRules, ruleType.isExcludedRowLevelInCompositeRules()); + assertEquals(isExcludedAtRowLevelInCompositeRules, ruleType.isExcludedAtRowLevelInCompositeRules()); DQRuleParameter param1 = ruleType.getParameters().get(0); assertEquals(param1Type, param1.getType()); @@ -129,7 +129,7 @@ 
public void test_parseDQRuleTypeWithNoThresholdAndRowLevelFlags() throws JsonPro String ruleTypeDesc = "This rule matches two datasets"; String returnType = "STRING"; boolean isThresholdSupported = false; - boolean isExcludedRowLevelInCompositeRules = false; + boolean isExcludedAtRowLevelInCompositeRules = false; // Parameter 1 String param1Type = "String"; @@ -159,7 +159,7 @@ public void test_parseDQRuleTypeWithNoThresholdAndRowLevelFlags() throws JsonPro assertEquals(ruleTypeDesc, ruleType.getDescription()); assertEquals(returnType, ruleType.getReturnType()); assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); - assertEquals(isExcludedRowLevelInCompositeRules, ruleType.isExcludedRowLevelInCompositeRules()); + assertEquals(isExcludedAtRowLevelInCompositeRules, ruleType.isExcludedAtRowLevelInCompositeRules()); DQRuleParameter param1 = ruleType.getParameters().get(0); assertEquals(param1Type, param1.getType()); From 9583e173f8d019a691d5795157f3caf9942c73fe Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Mon, 26 Feb 2024 17:27:38 -0500 Subject: [PATCH 25/50] where clause should appear before threshold for DQRule string representation --- .../ml/dataquality/dqdl/model/DQRule.java | 9 ++++---- .../ml/dataquality/dqdl/model/DQRuleTest.java | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index d387551..23dbd2c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -154,15 +154,16 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" ").append(condition.getFormattedCondition()); } + // where clause syntax should go before threshold + if (whereClause != null) { + if (!isBlank(whereClause)) sb.append(" where ").append("\"" + whereClause + "\""); + } + if (thresholdCondition != null) { 
String formattedCondition = thresholdCondition.getFormattedCondition(); if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } - if (whereClause != null) { - if (!isBlank(whereClause)) sb.append(" where ").append("\"" + whereClause + "\""); - } - return sb.toString(); } else { for (int i = 0; i < nestedRules.size(); i++) { diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 47c87c2..e792f18 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -427,6 +427,16 @@ public void test_whereClause() throws InvalidDataQualityRulesetException { assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); } + @Test + public void test_whereClauseWithThreshold() throws InvalidDataQualityRulesetException { + String rule = "ColumnValues \"colA\" in [10,20] where \"colB is NOT NULL\" with threshold > 0.5"; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + + assertEquals(dqRuleset1.getRules().get(0).toString(), rule); + } + @Test void test_whereClauseRuleToStringFromRule() throws InvalidDataQualityRulesetException { Map parameters = new HashMap<>(); @@ -438,6 +448,17 @@ void test_whereClauseRuleToStringFromRule() throws InvalidDataQualityRulesetExce assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); } + @Test + void test_whereClauseRuleToStringFromRuleWithThreshold() throws InvalidDataQualityRulesetException { + Map parameters = new HashMap<>(); + parameters.put("TargetColumn", "colA"); + DQRule dqRule = new DQRule("ColumnValues", parameters, new Condition("in [10,20]"), new Condition("> 0.5"), + DQRuleLogicalOperator.AND, null, "colB is NOT NULL"); + String ruleString = "ColumnValues \"colA\" in [10,20] where \"colB is NOT NULL\" with threshold > 0.5"; + 
assertEquals(dqRule.toString(), ruleString); + assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + @Test void test_whereClauseRuleToStringFromRuleset() throws InvalidDataQualityRulesetException { String ruleString = "IsComplete \"colA\" where \"colB is NOT NULL\""; From dd684af8ef9238a13b3933bddbd88502d192cc30 Mon Sep 17 00:00:00 2001 From: Shriya Vanvari Date: Tue, 5 Mar 2024 17:54:42 -0500 Subject: [PATCH 26/50] Support for NULL keyword in numeric and date based conditions --- .../dqdl/DataQualityDefinitionLanguage.g4 | 6 +++-- .../condition/date/NullDateExpression.java | 26 +++++++++++++++++++ .../condition/number/NullNumericOperand.java | 23 ++++++++++++++++ .../dqdl/parser/DQDLParserListener.java | 6 +++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 6 +++++ 5 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 5754774..60ada14 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -18,7 +18,8 @@ dateExpressionOp: ('-' | '+'); dateExpression: DATE | dateNow - | LPAREN dateNow dateExpressionOp durationExpression RPAREN; + | LPAREN dateNow dateExpressionOp durationExpression RPAREN + | NULL; atomicNumber: DIGIT @@ -42,7 +43,8 @@ number: number numberOp number | functionCall | LPAREN number RPAREN - | atomicNumber; + | atomicNumber + | NULL; quotedString: QUOTED_STRING; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java new file mode 100644 index 0000000..7d33d19 --- /dev/null +++ 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java @@ -0,0 +1,26 @@ +/* + * NullNumericOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date; + +import java.time.LocalDateTime; + +public class NullDateExpression extends DateExpression { + + @Override + public String getFormattedExpression() { + return "NULL"; + } + + @Override + public LocalDateTime getEvaluatedExpression() { + return null; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java new file mode 100644 index 0000000..edad45d --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java @@ -0,0 +1,23 @@ +/* + * NullNumericOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +public class NullNumericOperand extends NumericOperand { + + public NullNumericOperand(final String operand) { + super(operand.toUpperCase()); + } + + @Override + public String toString() { + return getOperand(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index f453454..171c135 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -18,6 +18,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.NullDateExpression; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedConditionOperator; @@ -25,6 +26,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.AtomicNumberOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.BinaryExpressionOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.FunctionCallOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NullNumericOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; @@ -550,6 +552,8 @@ private Optional 
parseNumericOperand( return parseNumericOperand(numberContext.number(0), true); } else if (numberContext.atomicNumber() != null) { return Optional.of(new AtomicNumberOperand(numberContext.getText())); + } else if (numberContext.NULL() != null) { + return Optional.of(new NullNumericOperand(numberContext.getText())); } return Optional.empty(); @@ -797,6 +801,8 @@ private Optional parseDateExpression( )); } else if (ctx.dateNow() != null) { return Optional.of(new DateExpression.CurrentDate()); + } else if (ctx.NULL() != null) { + return Optional.of(new NullDateExpression()); } else { return Optional.of(new DateExpression.StaticDate(removeQuotes(ctx.DATE().getText()))); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index e792f18..0329ae3 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -129,6 +129,7 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\""), Arguments.of("ColumnValues \"load_dt\" > (now() - 1 days)"), Arguments.of("ColumnValues \"order-id\" in [1,2,3,4]"), + Arguments.of("ColumnValues \"order-id\" in [1,2,3,4,NULL]"), Arguments.of("ColumnValues \"order-id\" not in [1,2,3,4]"), Arguments.of("ColumnValues \"order-id\" in [\"1\",\"2\",\"3\",\"4\"]"), Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"), @@ -179,6 +180,11 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold not between 0.4 and 0.8"), Arguments.of("ColumnValues \"col-A\" > 0.4 with threshold > 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\"] with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" = NULL"), + Arguments.of("ColumnValues \"col-A\" != NULL"), + Arguments.of("ColumnValues \"col-A\" in [NULL]"), + 
Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\",NULL] with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" not in [\"2022-01-01\",NULL] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = 1 with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"2022-01-01\" with threshold > 0.98"), Arguments.of("DataFreshness \"col-A\" <= 3 days"), From dce7d867fa642f7dbb57e6aae9ff31426af50ded Mon Sep 17 00:00:00 2001 From: Shriya Vanvari Date: Tue, 5 Mar 2024 17:54:42 -0500 Subject: [PATCH 27/50] Support for NOT_MATCHES --- configuration/dqdl/DataQualityDefinitionLanguage.g4 | 2 +- .../dqdl/model/condition/date/NullDateExpression.java | 2 +- .../dqdl/model/condition/string/StringBasedCondition.java | 2 ++ .../model/condition/string/StringBasedConditionOperator.java | 3 ++- .../glue/ml/dataquality/dqdl/parser/DQDLParserListener.java | 5 ++++- .../amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java | 2 ++ 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 60ada14..452d3aa 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -65,7 +65,7 @@ stringValuesArray: LBRAC stringValues (COMMA stringValues)* RBRAC; stringBasedCondition: NEGATION? EQUAL_TO stringValues | NOT? IN stringValuesArray - | matchesRegexCondition; + | NOT? 
matchesRegexCondition; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; dateBasedCondition: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java index 7d33d19..0ce87ac 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java @@ -1,5 +1,5 @@ /* - * NullNumericOperand.java + * NullDateExpression.java * * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. * diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index 2f4f105..27b228e 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -39,6 +39,8 @@ public String getFormattedCondition() { switch (operator) { case MATCHES: return String.format("matches %s", operands.get(0).formatOperand()); + case NOT_MATCHES: + return String.format("not matches %s", operands.get(0).formatOperand()); case EQUALS: return String.format("= %s", operands.get(0).formatOperand()); case NOT_EQUALS: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java index c5ef781..afed9f0 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java @@ -15,5 +15,6 @@ public enum StringBasedConditionOperator { NOT_EQUALS, IN, NOT_IN, - 
MATCHES + MATCHES, + NOT_MATCHES } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 171c135..c781b14 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -588,7 +588,9 @@ private Optional parseStringBasedCondition( operands.stream().map(Optional::get).collect(Collectors.toList()) ); } else if (ctx.matchesRegexCondition() != null) { - StringBasedConditionOperator op = StringBasedConditionOperator.MATCHES; + StringBasedConditionOperator op = (ctx.NOT() != null) ? + StringBasedConditionOperator.NOT_MATCHES + : StringBasedConditionOperator.MATCHES; Optional operand = parseStringOperand(ctx, Optional.ofNullable(ctx.stringValues()), op); if (operand.isPresent()) { condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get())); @@ -623,6 +625,7 @@ private Optional parseStringOperand( return Optional.of(new KeywordStringOperand(keyword)); } case MATCHES: + case NOT_MATCHES: return Optional.of(new QuotedStringOperand( removeQuotes(ctx.matchesRegexCondition().quotedString().getText()))); default: diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 0329ae3..0acc71e 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -148,6 +148,7 @@ private static Stream provideRawRules() { Arguments.of("ColumnLength \"col_A-B.CD\" < 10"), Arguments.of("ColumnLength \"col_A-B.CD\" >= 100"), Arguments.of("ColumnValues \"col-A\" matches \"[a-zA-Z0-9]*\""), + Arguments.of("ColumnValues \"col-A\" not matches \"[a-zA-Z0-9]*\""), Arguments.of("ColumnValues \"col-A\" >= now()"), Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) 
and now()"), Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and now()"), @@ -160,6 +161,7 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and (now() + 14 days)"), Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and (now() + 14 days)"), Arguments.of("ColumnValues \"col-A\" matches \"[a-z]*\" with threshold <= 0.4"), + Arguments.of("ColumnValues \"col-A\" not matches \"[a-z]*\" with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"A\",\"B\"] with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [1,2,3] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"A\" with threshold > 0.98"), From 2db306fe20370c52b9ec011deee9c0cb287a903f Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Fri, 8 Mar 2024 15:21:56 -0500 Subject: [PATCH 28/50] Add logic to fail rules that don't have where clause support --- configuration/rules/rules-config.json | 17 +++++++++++++++++ .../ml/dataquality/dqdl/model/DQRuleType.java | 4 ++++ .../dqdl/parser/DQDLParserListener.java | 14 +++++++++----- .../ml/dataquality/dqdl/model/DQRuleTest.java | 17 +++++++++++++++++ .../dqdl/parser/InvalidDQRulesetTest.java | 6 ++++++ 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 5e59cd4..a8f6e5c 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -5,6 +5,7 @@ "description": "Check the number of rows in the dataset", "parameters": [], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "table" }, { @@ -39,6 +40,7 @@ ], "return_type": "NUMBER", "is_excluded_at_row_level_in_composite_rules": true, + "is_where_clause_supported": true, "scope": "column" }, { @@ -52,6 +54,7 @@ } ], "return_type": "BOOLEAN", + "is_where_clause_supported": true, "scope": "column" }, { @@ -66,6 +69,7 @@ ], "return_type": "STRING", 
"is_threshold_supported": true, + "is_where_clause_supported": true, "scope": "column" }, { @@ -110,6 +114,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -124,6 +129,7 @@ ], "return_type": "NUMBER", "is_excluded_at_row_level_in_composite_rules": true, + "is_where_clause_supported": true, "scope": "column" }, { @@ -137,6 +143,7 @@ } ], "return_type": "BOOLEAN", + "is_where_clause_supported": true, "scope": "column" }, { @@ -150,6 +157,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -163,6 +171,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -176,6 +185,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -189,6 +199,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -202,6 +213,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -215,6 +227,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -228,6 +241,7 @@ } ], "return_type": "NUMBER", + "is_where_clause_supported": true, "scope": "column" }, { @@ -242,6 +256,7 @@ } ], "return_type": "BOOLEAN", + "is_where_clause_supported": true, "scope": "column" }, { @@ -256,6 +271,7 @@ ], "return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY", "is_threshold_supported": true, + "is_where_clause_supported": true, "scope": "column" }, { @@ -269,6 +285,7 @@ } ], "return_type": "DURATION_ARRAY", + "is_where_clause_supported": true, "scope": "column" }, { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index ec204f5..4728ff3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -33,6 +33,7 @@ public class 
DQRuleType { private final String returnType; private final boolean isThresholdSupported; private final boolean isExcludedAtRowLevelInCompositeRules; + private final boolean isWhereClauseSupported; private final boolean isAnalyzerOnly; private final String scope; private final boolean isExperimental; @@ -46,6 +47,8 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") boolean isExcludedAtRowLevelInCompositeRules, + @JsonProperty(value = "is_where_clause_supported") + boolean isWhereClauseSupported, @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, @JsonProperty(value = "scope") String scope, @JsonProperty(value = "experimental") boolean isExperimental) { @@ -55,6 +58,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules; + this.isWhereClauseSupported = isWhereClauseSupported; this.isAnalyzerOnly = isAnalyzerOnly; this.scope = scope; this.isExperimental = isExperimental; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index c781b14..70e580b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -300,12 +300,16 @@ private Either getDQRule( String whereClause = null; if (dqRuleContext.whereClause() != null) { - DataQualityDefinitionLanguageParser.WhereClauseContext ctx = dqRuleContext.whereClause(); - if (ctx.quotedString().getText().isEmpty() || ctx.quotedString().getText().equals("\"\"")) { - return Either.fromLeft( - String.format("Empty where condition provided 
for rule type: %s", ruleType)); + if (dqRuleType.isWhereClauseSupported()) { + DataQualityDefinitionLanguageParser.WhereClauseContext ctx = dqRuleContext.whereClause(); + if (ctx.quotedString().getText().isEmpty() || ctx.quotedString().getText().equals("\"\"")) { + return Either.fromLeft( + String.format("Empty where condition provided for rule type: %s", ruleType)); + } else { + whereClause = removeQuotes(ctx.quotedString().getText()); + } } else { - whereClause = removeQuotes(ctx.quotedString().getText()); + return Either.fromLeft(String.format("Where clause is not supported for rule type: %s", ruleType)); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 0acc71e..75e1871 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -92,9 +92,11 @@ private static Stream provideRawRules() { Arguments.of("IsPrimaryKey \"colA\" \"colB\""), Arguments.of("IsPrimaryKey colA \"col B\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\" \"colC\""), + Arguments.of("IsPrimaryKey \"colA\" where \"colA > 100\""), Arguments.of("RowCount = 100"), Arguments.of("RowCount != 100"), Arguments.of("RowCount = -100"), + Arguments.of("RowCount = 100 where \"colA > 100\""), Arguments.of("RowCount between (0.9 * average(last(10))) and 1.1 * average(last(10))"), Arguments.of("RowCount not between (0.9 * average(last(10))) and 1.1 * average(last(10))"), Arguments.of("RowCountMatch \"reference\" = 1.0"), @@ -103,7 +105,9 @@ private static Stream provideRawRules() { Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"), Arguments.of("Completeness of col_1 between 0.5 and 0.8"), Arguments.of("Completeness of col_1 not between 0.5 and 0.8"), + Arguments.of("Completeness \"col_1\" between 0.5 and 0.8 where \"col-A > 100\""), Arguments.of("IsComplete \"col_1\""), + Arguments.of("IsComplete \"col_1\" 
where \"col-A > 100\""), Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"), Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), Arguments.of("ColumnDataType \"col_1\" = \"String\""), @@ -112,6 +116,7 @@ private static Stream provideRawRules() { Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9"), + Arguments.of("ColumnDataType \"col_1\" = \"String\" where \"col-A > 100\""), Arguments.of("ColumnNamesMatchPattern \"aws_.*_[a-zA-Z0-9]+\""), Arguments.of("ColumnExists \"load_dt\""), Arguments.of("ColumnCount >= 100"), @@ -123,10 +128,14 @@ private static Stream provideRawRules() { Arguments.of("ColumnCorrelation of col_1 col_2 between 0.4 and 0.8"), Arguments.of("ColumnCorrelation of col_1 and \"col abc\" between 0.4 and 0.8"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between -0.44444 and 0.888888"), + Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8 where \"col-A > 100\""), Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2"), + Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2 where \"col-A > 100\""), Arguments.of("IsUnique \"col_1\""), + Arguments.of("IsUnique \"col_1\" where \"col-A > 100\""), Arguments.of("Uniqueness \"col_1\" between -0.00000001 and 0.00000000000002"), Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\""), + Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\" where \"col-A > 100\""), Arguments.of("ColumnValues \"load_dt\" > (now() - 1 days)"), Arguments.of("ColumnValues \"order-id\" in [1,2,3,4]"), Arguments.of("ColumnValues \"order-id\" in [1,2,3,4,NULL]"), @@ -135,18 +144,25 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"), 
Arguments.of("Sum \"col_A-B.C\" > 100.0"), Arguments.of("Sum \"col_A-B.C\" > -100.0"), + Arguments.of("Sum \"col_A-B.C\" > -100.0 where \"col-A > 100\""), Arguments.of("Mean \"col_A-B.CD\" between 10 and 20"), Arguments.of("Mean \"col_A-B.CD\" between -20 and -10"), + Arguments.of("Mean \"col_A-B.CD\" between -20 and -10 where \"col-A > 100\""), Arguments.of("StandardDeviation \"col_A-B.CD\" <= 10.0"), Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0"), + Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0 where \"col-A > 100\""), Arguments.of("Entropy \"col_A-B.CD\" <= 10.0"), Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30"), + Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""), Arguments.of("DistinctValuesCount \"col_A-B.CD\" > 1000"), Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30"), + Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""), Arguments.of("UniqueValueRatio \"col_A-B.CD\" < 0.5"), Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5"), + Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5 where \"col-A > 100\""), Arguments.of("ColumnLength \"col_A-B.CD\" < 10"), Arguments.of("ColumnLength \"col_A-B.CD\" >= 100"), + Arguments.of("ColumnLength \"col_A-B.CD\" >= 100 where \"col-A > 100\""), Arguments.of("ColumnValues \"col-A\" matches \"[a-zA-Z0-9]*\""), Arguments.of("ColumnValues \"col-A\" not matches \"[a-zA-Z0-9]*\""), Arguments.of("ColumnValues \"col-A\" >= now()"), @@ -192,6 +208,7 @@ private static Stream provideRawRules() { Arguments.of("DataFreshness \"col-A\" <= 3 days"), Arguments.of("DataFreshness \"col-A\" > 30 hours"), Arguments.of("DataFreshness \"col-A\" between 2 days and 4 days"), + Arguments.of("DataFreshness \"col-A\" <= 3 days where \"col-A > 100\""), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" between 0.4 and 0.6"), Arguments.of("ReferentialIntegrity \"col-A\" 
\"reference.col-A1\" > 0.98"), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" = 0.99"), diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index fc9a9e5..d9ad993 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -43,6 +43,7 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ Completeness \"col-A\" ]"), Arguments.of("Rules = { Completeness \"col-A\" }"), Arguments.of("Rules = [ ColumnNamesMatchPattern aws_* ]"), + Arguments.of("Rules = [ ColumnNamesMatchPattern \"aws_*\" where \"aws_id > 100\"]"), Arguments.of("Rules = [ IsComplete \"col-A\" > 0.05 ]"), Arguments.of("Rules = [ IsUnique \"col-A\" <= 1.5 ]"), Arguments.of("Rules = [ IsPrimaryKey \"col-A\" between 1 and 2 ]"), @@ -63,20 +64,25 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference.col-A\" = 0.99 where \"col-A > 100\"]"), Arguments.of("Rules = [ DatasetMatch \"reference\" = 0.99 ]"), Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" ]"), Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" ]"), Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 with threshold > 0.9]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 where \"ID > 100\"]"), Arguments.of("Rules = [ SchemaMatch with threshold between 0.2 and 0.4 ]"), Arguments.of("Rules = [ SchemaMatch \"ref-1\" between 0.2 and 0.4 with threshold > 0.5 ]"), Arguments.of("Rules = [ SchemaMatch \"ref-1\" \"ref-2\" ]"), 
Arguments.of("Rules = [ RowCountMatch > 0.1 ]"), Arguments.of("Rules = [ RowCountMatch \"reference-1\" \"col-1\" > 0.1 ]"), Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 where \"id > 100\"]"), Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.8 where \"col-A > 100\"]"), Arguments.of("Rules = [ DetectAnomalies ]"), + Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), Arguments.of("Rules = [ AllStatistics \"id\" ]") ); From c14d02d4fca4f6d80328ee1be8215ee51eba1c39 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Tue, 26 Mar 2024 18:55:41 -0400 Subject: [PATCH 29/50] Add getSortedFormattedCondition for Condition --- .../dqdl/model/condition/Condition.java | 3 + .../condition/date/DateBasedCondition.java | 22 ++++++ .../duration/DurationBasedCondition.java | 23 +++++- .../number/NumberBasedCondition.java | 22 ++++++ .../string/StringBasedCondition.java | 22 ++++++ .../date/DateBasedConditionTest.java | 53 ++++++++++++- .../duration/DurationBasedConditionTest.java | 18 ++++- .../number/NumberBasedConditionTest.java | 78 +++++++++++++++++++ .../string/StringBasedConditionTest.java | 65 ++++++++++++++++ 9 files changed, 298 insertions(+), 8 deletions(-) create mode 100644 tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java create mode 100644 tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java index 17401f5..900ba88 100644 
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java @@ -30,6 +30,9 @@ public String getFormattedCondition() { return this.conditionAsString; } + public String getSortedFormattedCondition() { + return this.conditionAsString; + } public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { throw new UnsupportedOperationException(); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java index fea9ffb..0458f98 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java @@ -74,10 +74,32 @@ public String getFormattedCondition() { return ""; } + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + private List getFormattedOperands() { List formattedOperands = operands.stream() .map(DateExpression::getFormattedExpression) .collect(Collectors.toList()); return formattedOperands; } + + private List getSortedFormattedOperands() { + List formattedOperands = operands.stream() + .map(DateExpression::getFormattedExpression) + .sorted() + .collect(Collectors.toList()); + return formattedOperands; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java index 128dd7a..1f6f80a 100644 --- 
a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -58,11 +59,11 @@ public String getFormattedCondition() { return String.format("!= %s", operands.get(0).getFormattedDuration()); case IN: { List formattedOperands = getFormattedOperands(); - return String.format("in [%s]", String.join(", ", formattedOperands)); + return String.format("in [%s]", String.join(",", formattedOperands)); } case NOT_IN: { List formattedOperands = getFormattedOperands(); - return String.format("not in [%s]", String.join(", ", formattedOperands)); + return String.format("not in [%s]", String.join(",", formattedOperands)); } default: break; @@ -71,7 +72,25 @@ public String getFormattedCondition() { return ""; } + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + private List getFormattedOperands() { return operands.stream().map(Duration::getFormattedDuration).collect(Collectors.toList()); } + + private List getSortedFormattedOperands() { + return operands.stream().map(Duration::getFormattedDuration).sorted().collect(Collectors.toList()); + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index cdee464..3ac9f84 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -19,6 +19,7 @@ import lombok.extern.slf4j.Slf4j; import java.text.DecimalFormat; +import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; @@ -158,12 +159,33 @@ public String getFormattedCondition() { return ""; } + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", getSortedFormattedOperands()); + case NOT_IN: + return String.format("not in [%s]", getSortedFormattedOperands()); + default: + return getFormattedCondition(); + } + } + private String getFormattedOperands() { return operands.stream() .map(NumericOperand::toString) .collect(Collectors.joining(",")); } + private String getSortedFormattedOperands() { + return operands.stream() + .map(NumericOperand::toString) + .sorted(Comparator.comparingDouble(Double::parseDouble)) + .collect(Collectors.joining(",")); + } + protected boolean isOperandEqualToMetric(Double metric, Double operand) { return abs(metric - operand) <= 0.00001; } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index 27b228e..ca62c59 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -60,10 +60,32 @@ public String getFormattedCondition() { return ""; } + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; 
+ + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + private List getFormattedOperands() { List formattedOperands = operands.stream() .map(StringOperand::formatOperand) .collect(Collectors.toList()); return formattedOperands; } + + private List getSortedFormattedOperands() { + List formattedOperands = operands.stream() + .map(StringOperand::formatOperand) + .sorted() + .collect(Collectors.toList()); + return formattedOperands; + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java index 6283b7c..be524fa 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java @@ -33,6 +33,7 @@ private static Stream provideDateBasedConditionsWithExpectedFormatted new DateExpression.StaticDate("2023-01-01"), new DateExpression.StaticDate("2023-12-31") ) ), + "between \"2023-01-01\" and \"2023-12-31\"", "between \"2023-01-01\" and \"2023-12-31\"" ), Arguments.of( @@ -43,6 +44,7 @@ private static Stream provideDateBasedConditionsWithExpectedFormatted new DateExpression.StaticDate("2023-01-01"), new DateExpression.StaticDate("2023-12-31") ) ), + "not between \"2023-01-01\" and \"2023-12-31\"", "not between \"2023-01-01\" and \"2023-12-31\"" ), Arguments.of( @@ -58,6 +60,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ) ), + "between (now() - 4 days) and (now() + 72 hours)", "between (now() - 4 days) and (now() + 72 hours)" ), Arguments.of( @@ -73,6 +76,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) 
) ), + "not between (now() - 4 days) and (now() + 72 hours)", "not between (now() - 4 days) and (now() + 72 hours)" ), Arguments.of( @@ -81,6 +85,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "> \"2023-01-01\"", "> \"2023-01-01\"" ), Arguments.of( @@ -89,6 +94,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(new DateExpression.CurrentDate()) ), + "> now()", "> now()" ), Arguments.of( @@ -97,6 +103,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) DateBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + ">= \"2023-01-01\"", ">= \"2023-01-01\"" ), Arguments.of( @@ -109,6 +116,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) ) ) ), + ">= (now() - 2 days)", ">= (now() - 2 days)" ), Arguments.of( @@ -117,6 +125,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN, Collections.singletonList(new DateExpression.CurrentDate()) ), + "< now()", "< now()" ), Arguments.of( @@ -129,6 +138,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) ) ) ), + "< (now() + 100 days)", "< (now() + 100 days)" ), Arguments.of( @@ -137,6 +147,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "<= \"2023-01-01\"", "<= \"2023-01-01\"" ), Arguments.of( @@ -145,6 +156,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new 
DateExpression.CurrentDate()) ), + "<= now()", "<= now()" ), Arguments.of( @@ -153,6 +165,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.EQUALS, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "= \"2023-01-01\"", "= \"2023-01-01\"" ), Arguments.of( @@ -161,6 +174,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.NOT_EQUALS, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "!= \"2023-01-01\"", "!= \"2023-01-01\"" ), Arguments.of( @@ -173,6 +187,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) ) ) ), + "= (now() - 2 days)", "= (now() - 2 days)" ), Arguments.of( @@ -190,7 +205,22 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ) ), - "in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]" + "in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]", + "in [\"2023-01-01\",(now() + 72 hours),(now() - 2 days),now()]" + ), + Arguments.of( + new DateBasedCondition( + "in[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + DateBasedConditionOperator.IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new DateExpression.StaticDate("2020-01-01") + ) + ), + "in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + "in [\"2020-01-01\",\"2021-01-01\",\"2022-01-01\",\"2023-01-01\"]" ), Arguments.of( new DateBasedCondition( @@ -207,7 +237,22 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ) ), - "not in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]" + "not in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]", + "not in [\"2023-01-01\",(now() + 72 hours),(now() - 2 days),now()]" + ), + Arguments.of( + new 
DateBasedCondition( + "notin[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + DateBasedConditionOperator.NOT_IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new DateExpression.StaticDate("2020-01-01") + ) + ), + "not in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + "not in [\"2020-01-01\",\"2021-01-01\",\"2022-01-01\",\"2023-01-01\"]" ) ); } @@ -215,7 +260,9 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) @ParameterizedTest @MethodSource("provideDateBasedConditionsWithExpectedFormattedStrings") public void test_correctlyFormatsDuration(DateBasedCondition condition, - String expectedFormattedString) { + String expectedFormattedString, + String expectedSortedFormattedString) { assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java index 08b5319..24cd7c7 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java @@ -32,6 +32,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(4, DurationUnit.DAYS) ) ), + "between 3 hours and 4 days", "between 3 hours and 4 days" ), Arguments.of( @@ -43,6 +44,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(4, DurationUnit.DAYS) ) ), + "not between 3 hours and 4 days", "not between 3 hours and 4 days" ), Arguments.of( @@ -51,6 +53,7 @@ private static Stream 
provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.GREATER_THAN, Collections.singletonList(new Duration(256, DurationUnit.HOURS)) ), + "> 256 hours", "> 256 hours" ), Arguments.of( @@ -59,6 +62,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(new Duration(2, DurationUnit.DAYS)) ), + ">= 2 days", ">= 2 days" ), Arguments.of( @@ -67,6 +71,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.LESS_THAN, Collections.singletonList(new Duration(25000, DurationUnit.HOURS)) ), + "< 25000 hours", "< 25000 hours" ), Arguments.of( @@ -75,6 +80,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new Duration(24, DurationUnit.DAYS)) ), + "<= 24 days", "<= 24 days" ), Arguments.of( @@ -83,6 +89,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.EQUALS, Collections.singletonList(new Duration(10, DurationUnit.DAYS)) ), + "= 10 days", "= 10 days" ), Arguments.of( @@ -91,6 +98,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.NOT_EQUALS, Collections.singletonList(new Duration(10, DurationUnit.DAYS)) ), + "!= 10 days", "!= 10 days" ), Arguments.of( @@ -104,7 +112,8 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(7, DurationUnit.DAYS) ) ), - "in [3 hours, 4 days, 96 hours, 7 days]" + "in [3 hours,4 days,96 hours,7 days]", + "in [3 hours,4 days,7 days,96 hours]" ), Arguments.of( new DurationBasedCondition( @@ -117,7 +126,8 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(7, DurationUnit.DAYS) ) ), - "not in [3 hours, 4 days, 96 hours, 7 days]" + "not in [3 hours,4 days,96 hours,7 days]", + "not in [3 hours,4 days,7 days,96 
hours]" ) ); } @@ -125,7 +135,9 @@ private static Stream provideDurationConditionsWithExpectedFormattedS @ParameterizedTest @MethodSource("provideDurationConditionsWithExpectedFormattedStrings") public void test_correctlyFormatsDuration(DurationBasedCondition condition, - String expectedFormattedString) { + String expectedFormattedString, + String expectedSortedFormattedString) { assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java new file mode 100644 index 0000000..ddddf77 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java @@ -0,0 +1,78 @@ +/* + * NumberBasedConditionTest.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class NumberBasedConditionTest { + private static Stream provideNumberConditionsWithExpectedFormattedStrings() { + return Stream.of( + Arguments.of( + new NumberBasedCondition( + "in[15,10,20,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,20,5]", + "in [5,10,15,20]" + ), + Arguments.of( + new NumberBasedCondition( + "in[1.5,1.0,2.0,0.5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("1.5"), + new AtomicNumberOperand("1.0"), + new AtomicNumberOperand("2.0"), + new AtomicNumberOperand("0.5") + ) + ), + "in [1.5,1.0,2.0,0.5]", + "in [0.5,1.0,1.5,2.0]" + ), + Arguments.of( + new NumberBasedCondition( + "notin[15,10,20,5]", + NumberBasedConditionOperator.NOT_IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "not in [15,10,20,5]", + "not in [5,10,15,20]" + ) + ); + } + + @ParameterizedTest + @MethodSource("provideNumberConditionsWithExpectedFormattedStrings") + public void test_correctlyFormatsNumber(NumberBasedCondition condition, + String expectedFormattedString, + String expectedSortedFormattedString) { + assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java new file mode 100644 index 0000000..4bec6f5 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java @@ -0,0 +1,65 @@ +/* + * StringBasedConditionTest.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class StringBasedConditionTest { + + private static Stream provideStringConditionsWithExpectedFormattedStrings() { + return Stream.of( + Arguments.of( + new StringBasedCondition( + "in[\"d\",\"a\",\"c\",\"b\"]", + StringBasedConditionOperator.IN, + Arrays.asList( + new QuotedStringOperand("d"), + new QuotedStringOperand("a"), + new QuotedStringOperand("b"), + new QuotedStringOperand("c") + ) + ), + "in [\"d\",\"a\",\"b\",\"c\"]", + "in [\"a\",\"b\",\"c\",\"d\"]" + ), + Arguments.of( + new StringBasedCondition( + "notin[\"d\",\"a\",\"c\",\"b\"]", + StringBasedConditionOperator.NOT_IN, + Arrays.asList( + new QuotedStringOperand("d"), + new QuotedStringOperand("a"), + new QuotedStringOperand("b"), + new QuotedStringOperand("c") + ) + ), + "not in [\"d\",\"a\",\"b\",\"c\"]", + "not in [\"a\",\"b\",\"c\",\"d\"]" + ) + ); + } + + @ParameterizedTest + @MethodSource("provideStringConditionsWithExpectedFormattedStrings") + public void test_correctlyFormatsString(StringBasedCondition condition, + String expectedFormattedString, + String expectedSortedFormattedString) { + assertEquals(expectedFormattedString, condition.getFormattedCondition()); 
+ assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); + } +} From c74953bb9c9caf0caecdd1209c666d001225fe13 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Fri, 29 Mar 2024 12:08:25 -0400 Subject: [PATCH 30/50] Modify getSortedFormattedOperands to take into account NULL operands for numeric conditions --- .../number/NumberBasedCondition.java | 13 ++++++-- .../date/DateBasedConditionTest.java | 16 ++++++++++ .../number/NumberBasedConditionTest.java | 31 +++++++++++++++++++ .../string/StringBasedConditionTest.java | 21 +++++++++++++ 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index 3ac9f84..6bd5f58 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -19,7 +19,6 @@ import lombok.extern.slf4j.Slf4j; import java.text.DecimalFormat; -import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; @@ -182,7 +181,17 @@ private String getFormattedOperands() { private String getSortedFormattedOperands() { return operands.stream() .map(NumericOperand::toString) - .sorted(Comparator.comparingDouble(Double::parseDouble)) + .sorted((s1, s2) -> { + if (s1.equalsIgnoreCase("NULL") && s2.equalsIgnoreCase("NULL")) { + return 0; // Treat both NULLs as equal + } else if (s1.equalsIgnoreCase("NULL")) { + return 1; // Treat NULL as greater than any other value + } else if (s2.equalsIgnoreCase("NULL")) { + return -1; // Treat NULL as greater than any other value + } else { + return Double.compare(Double.parseDouble(s1), Double.parseDouble(s2)); + } + }) .collect(Collectors.joining(",")); } diff --git 
a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java index be524fa..fd4af81 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java @@ -12,6 +12,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -20,6 +21,7 @@ import java.util.Collections; import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.NULL; import static org.junit.jupiter.api.Assertions.assertEquals; public class DateBasedConditionTest { @@ -253,6 +255,20 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ), "not in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", "not in [\"2020-01-01\",\"2021-01-01\",\"2022-01-01\",\"2023-01-01\"]" + ), + Arguments.of( + new DateBasedCondition( + "in[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",NULL]", + DateBasedConditionOperator.IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new NullDateExpression() + ) + ), + "in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",NULL]", + "in [\"2021-01-01\",\"2022-01-01\",\"2023-01-01\",NULL]" ) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java index ddddf77..02b10bc 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java @@ -63,6 +63,37 @@ private static Stream provideNumberConditionsWithExpectedFormattedStr ), "not in [15,10,20,5]", "not in [5,10,15,20]" + ), + Arguments.of( + new NumberBasedCondition( + "in[15,10,NULL,20,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new NullNumericOperand("NULL"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,NULL,20,5]", + "in [5,10,15,20,NULL]" + ), + // We don't limit customers from adding multiple NULL keywords + Arguments.of( + new NumberBasedCondition( + "in[15,10,NULL,NULL,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new NullNumericOperand("NULL"), + new NullNumericOperand("NULL"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,NULL,NULL,5]", + "in [5,10,15,NULL,NULL]" ) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java index 4bec6f5..b497de5 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java @@ -17,6 +17,9 @@ import java.util.Arrays; import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.EMPTY; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.NULL; +import static 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.WHITESPACES_ONLY; import static org.junit.jupiter.api.Assertions.assertEquals; public class StringBasedConditionTest { @@ -50,6 +53,24 @@ private static Stream provideStringConditionsWithExpectedFormattedStr ), "not in [\"d\",\"a\",\"b\",\"c\"]", "not in [\"a\",\"b\",\"c\",\"d\"]" + ), + // Test for Keyword values + Arguments.of( + new StringBasedCondition( + "in[\"z\",\"a\",WHITESPACES_ONLY,EMPTY,\"c\",NULL]", + StringBasedConditionOperator.IN, + Arrays.asList( + new QuotedStringOperand("z"), + new QuotedStringOperand("a"), + new KeywordStringOperand(WHITESPACES_ONLY), + new KeywordStringOperand(EMPTY), + new QuotedStringOperand("c"), + new KeywordStringOperand(NULL) + ) + ), + // verifying behavior that quoted strings will be sorted before keywords + "in [\"z\",\"a\",WHITESPACES_ONLY,EMPTY,\"c\",NULL]", + "in [\"a\",\"c\",\"z\",EMPTY,NULL,WHITESPACES_ONLY]" ) ); } From 23c3688b3e670d9fb8a9edef5da838a72e80b437 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Wed, 28 Aug 2024 19:36:41 -0400 Subject: [PATCH 31/50] [TDQ] Checksum Rule Config + Test --- configuration/rules/rules-config.json | 35 +++++++++++++++++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 27 +++++++++++++- .../dqdl/parser/InvalidDQRulesetTest.java | 6 +++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index a8f6e5c..8e909c8 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -423,6 +423,41 @@ "return_type": "NUMBER", "scope": "column", "experimental": true + }, + { + "rule_type_name": "Checksum", + "description": "Takes an input file or directory, desired algorithm, and list of expected key(s). Compares generated checksum(s) with input.", + "parameters": [ + { + "type": "String", + "name": "Algorithm", + "description": "Desired Algorithm for the Checksum." 
+ }, + { + "type": "String", + "name": "Checksums", + "description": "List of checksums to validate against." + } + ], + "return_type": "STRING", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file" + }, + { + "rule_type_name": "Checksum", + "description": "Takes a desired algorithm, and list of expected key(s). Compares generated checksum(s) with Dataframe input.", + "parameters": [ + { + "type": "String", + "name": "Algorithm", + "description": "Desired Algorithm for the Checksum." + } + ], + "return_type": "STRING", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file" } ] } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 75e1871..5beb51c 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -229,10 +229,35 @@ private static Stream provideRawRules() { Arguments.of("DetectAnomalies \"RowCount\""), Arguments.of("DetectAnomalies of RowCount"), Arguments.of("DetectAnomalies of Completeness of \"colA\""), - Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\"") + Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\""), + Arguments.of("Checksum \"MD5\" \"S3://PATH\" in [\"hashList\"]"), + Arguments.of("Checksum \"SHA\" in [\"hashList\",\"hashList\"]") ); } + @Test + void test_fileBasedRulesParsing() { + String checksumRules = "Rules = [ " + + "Checksum \"MD5\" \"s3://sampom-bucket/\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\", \"test\"], " + + "Checksum \"SHA256\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\"] " + + "]"; + try { + DQRuleset dqRuleset = parser.parse(checksumRules); + List ruleList = dqRuleset.getRules(); + assertEquals(2, ruleList.size()); + + DQRule rule0 = ruleList.get(0); + 
assertEquals("Checksum", rule0.getRuleType()); + assertEquals(2, rule0.getParameters().size()); + + DQRule rule1 = ruleList.get(1); + assertEquals("Checksum", rule1.getRuleType()); + assertEquals(1, rule1.getParameters().size()); + } catch (Exception e) { + fail(e.getMessage()); + } + } + @Test void test_toStringIgnoresSpacesOnlyThreshold() { Map parameters = new HashMap<>(); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index d9ad993..e066980 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -84,7 +84,11 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ DetectAnomalies ]"), Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), - Arguments.of("Rules = [ AllStatistics \"id\" ]") + Arguments.of("Rules = [ Checksum ]"), + Arguments.of("Rules = [ Checksum in [] ]"), + Arguments.of("Rules = [ Checksum SHA SHA SHA ]"), + Arguments.of("Rules = [ Checksum SHA SHA SHA in [] ]"), + Arguments.of("Rules = [ Checksum s3Path ]") ); } From 1c3f1b0ee2ddb0fc728e615f6de7fbcc5cf7b3de Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Tue, 17 Sep 2024 16:46:15 -0400 Subject: [PATCH 32/50] add FileMatch, add FileUniqueness. convert Checksum to FileMatch. 
--- configuration/rules/rules-config.json | 46 +++++++++++++------ .../ml/dataquality/dqdl/model/DQRuleTest.java | 30 ++++++++---- .../dqdl/parser/InvalidDQRulesetTest.java | 12 +++-- 3 files changed, 59 insertions(+), 29 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 8e909c8..51973be 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -406,8 +406,7 @@ } ], "return_type": "BOOLEAN", - "scope": "column", - "experimental": true + "scope": "column" }, { "rule_type_name": "AllStatistics", @@ -421,32 +420,32 @@ } ], "return_type": "NUMBER", - "scope": "column", - "experimental": true + "scope": "column" }, { - "rule_type_name": "Checksum", - "description": "Takes an input file or directory, desired algorithm, and list of expected key(s). Compares generated checksum(s) with input.", + "rule_type_name": "FileMatch", + "description": "Match Files/Directories against Files/Directories or against an optional list of checksum values.", "parameters": [ { "type": "String", - "name": "Algorithm", - "description": "Desired Algorithm for the Checksum." + "name": "Algorithm or DataPath", + "description": "Desired Algorithm for the Checksum or File/Directory" }, { "type": "String", - "name": "Checksums", - "description": "List of checksums to validate against." + "name": "CompareDataPath", + "description": "File/Directory for comparison" } ], - "return_type": "STRING", + "return_type": "STRING|BOOLEAN", "is_threshold_supported": false, "is_where_clause_supported": false, - "scope": "file" + "scope": "file", + "experimental": true }, { - "rule_type_name": "Checksum", - "description": "Takes a desired algorithm, and list of expected key(s). 
Compares generated checksum(s) with Dataframe input.", + "rule_type_name": "FileMatch", + "description": "Match Files/Directories inferred from DataFrames against a list of checksum values.", "parameters": [ { "type": "String", @@ -457,7 +456,24 @@ "return_type": "STRING", "is_threshold_supported": false, "is_where_clause_supported": false, - "scope": "file" + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileUniqueness", + "description": "Checks the contents of a folder and the uniqueness of each file within.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data path for FileUniqueness." + } + ], + "return_type": "NUMBER", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true } ] } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 5beb51c..e542a8f 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -230,29 +230,41 @@ private static Stream provideRawRules() { Arguments.of("DetectAnomalies of RowCount"), Arguments.of("DetectAnomalies of Completeness of \"colA\""), Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\""), - Arguments.of("Checksum \"MD5\" \"S3://PATH\" in [\"hashList\"]"), - Arguments.of("Checksum \"SHA\" in [\"hashList\",\"hashList\"]") + Arguments.of("FileMatch \"MD5\" \"S3://PATH\" in [\"hashList\"]"), + Arguments.of("FileMatch \"SHA\" in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), + Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9") ); } @Test void test_fileBasedRulesParsing() { - String checksumRules = "Rules = [ " + - "Checksum \"MD5\" \"s3://sampom-bucket/\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\", \"test\"], " + - "Checksum 
\"SHA256\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\"] " + + String fileRules = "Rules = [ " + + "FileMatch \"MD5\" \"s3://sampom-bucket/\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\", \"test\"], " + + "FileMatch \"SHA256\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\"], " + + "FileMatch \"S3://PATH1\" \"S3://PATH2\"," + + "FileUniqueness \"S3://PATH1\" >= 0.9" + "]"; try { - DQRuleset dqRuleset = parser.parse(checksumRules); + DQRuleset dqRuleset = parser.parse(fileRules); List ruleList = dqRuleset.getRules(); - assertEquals(2, ruleList.size()); + assertEquals(4, ruleList.size()); DQRule rule0 = ruleList.get(0); - assertEquals("Checksum", rule0.getRuleType()); + assertEquals("FileMatch", rule0.getRuleType()); assertEquals(2, rule0.getParameters().size()); DQRule rule1 = ruleList.get(1); - assertEquals("Checksum", rule1.getRuleType()); + assertEquals("FileMatch", rule1.getRuleType()); assertEquals(1, rule1.getParameters().size()); + + DQRule rule2 = ruleList.get(2); + assertEquals("FileMatch", rule2.getRuleType()); + assertEquals(2, rule2.getParameters().size()); + + DQRule rule3 = ruleList.get(3); + assertEquals("FileUniqueness", rule3.getRuleType()); + assertEquals(1, rule3.getParameters().size()); } catch (Exception e) { fail(e.getMessage()); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index e066980..b7f4956 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -84,11 +84,13 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ DetectAnomalies ]"), Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), - Arguments.of("Rules = [ Checksum ]"), 
- Arguments.of("Rules = [ Checksum in [] ]"), - Arguments.of("Rules = [ Checksum SHA SHA SHA ]"), - Arguments.of("Rules = [ Checksum SHA SHA SHA in [] ]"), - Arguments.of("Rules = [ Checksum s3Path ]") + Arguments.of("Rules = [ FileMatch ]"), + Arguments.of("Rules = [ FileMatch in [] ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA in [] ]"), + Arguments.of("Rules = [ FileMatch s3Path ]"), + Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), + Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]") ); } From 26026f7ff1fa5e192d3452177dbc4adfabc55a52 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Mon, 23 Sep 2024 21:03:33 -0400 Subject: [PATCH 33/50] [TDQ] FileMatch Parsing, added two new With conditions w/ order agnostic parsing --- .../dqdl/DataQualityDefinitionLanguage.g4 | 5 +- configuration/rules/rules-config.json | 28 +++- .../ml/dataquality/dqdl/model/DQRule.java | 23 ++- .../ml/dataquality/dqdl/model/DQRuleType.java | 3 + .../dqdl/parser/DQDLParserListener.java | 83 ++++++++-- .../ml/dataquality/dqdl/model/DQRuleTest.java | 85 ++++++---- .../dqdl/parser/InvalidDQRulesetTest.java | 148 +++++++++--------- 7 files changed, 243 insertions(+), 132 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 452d3aa..0b94edc 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -101,10 +101,13 @@ condition: | durationBasedCondition; withThresholdCondition: 'with' 'threshold' numberBasedCondition; +withHashAlgorithmCondition: 'with' 'hashAlgorithm' stringBasedCondition; +withDataFrameCondition: 'with' 'dataFrame'; +withCondition: withThresholdCondition | withHashAlgorithmCondition | withDataFrameCondition; whereClause: 'where' quotedString; -dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
withThresholdCondition?; +dqRule: ruleType parameterWithConnectorWord* condition? whereClause? withCondition? withCondition?; dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 51973be..3d8146f 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -424,22 +424,23 @@ }, { "rule_type_name": "FileMatch", - "description": "Match Files/Directories against Files/Directories or against an optional list of checksum values.", + "description": "Match Files/Directories against Files/Directories.", "parameters": [ { "type": "String", - "name": "Algorithm or DataPath", - "description": "Desired Algorithm for the Checksum or File/Directory" + "name": "DataPath", + "description": "File/Directory for comparison" }, { "type": "String", "name": "CompareDataPath", - "description": "File/Directory for comparison" + "description": "Other File/Directory for comparison" } ], - "return_type": "STRING|BOOLEAN", + "return_type": "BOOLEAN", "is_threshold_supported": false, "is_where_clause_supported": false, + "is_hash_algo_supported": true, "scope": "file", "experimental": true }, @@ -449,13 +450,25 @@ "parameters": [ { "type": "String", - "name": "Algorithm", - "description": "Desired Algorithm for the Checksum." 
+ "name": "DataPath", + "description": "File/Directory for comparison" } ], "return_type": "STRING", "is_threshold_supported": false, "is_where_clause_supported": false, + "is_hash_algo_supported": true, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileMatch", + "description": "Match Files/Directories inferred from DataFrames against a list of checksum values.", + "parameters": [], + "return_type": "STRING", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "is_hash_algo_supported": true, "scope": "file", "experimental": true }, @@ -472,6 +485,7 @@ "return_type": "NUMBER", "is_threshold_supported": false, "is_where_clause_supported": false, + "is_hash_algo_supported": true, "scope": "file", "experimental": true } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 23dbd2c..bebfdf3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -35,10 +35,12 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final Map parameterValueMap; private final Condition condition; private final Condition thresholdCondition; + private Condition hashAlgoCondition; private final DQRuleLogicalOperator operator; private final List nestedRules; private final String whereClause; private Boolean isExcludedAtRowLevelInCompositeRules = false; + private Boolean dataFrameCondition; // Adding this constructor so as to not break the Data Quality ETL package. 
public DQRule(final String ruleType, @@ -91,7 +93,7 @@ public DQRule(final String ruleType, public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition) { - return createFromParameterValueMap(ruleType, parameters, condition, null, null); + return createFromParameterValueMap(ruleType, parameters, condition, null, null, null, null); } public DQRule(final String ruleType, @@ -113,7 +115,9 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition, final Condition thresholdCondition, - final String whereClause) { + final Condition hashAlgoCondition, + final String whereClause, + final Boolean dataFrameCondition) { DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); @@ -123,10 +127,12 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, parameters, condition, thresholdCondition, + hashAlgoCondition, operator, nestedRules, whereClause, - ruleType.isExcludedAtRowLevelInCompositeRules() + ruleType.isExcludedAtRowLevelInCompositeRules(), + dataFrameCondition ); } @@ -164,7 +170,16 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } - return sb.toString(); + if (hashAlgoCondition != null) { + String formattedCondition = hashAlgoCondition.getFormattedCondition(); + if (!isBlank(formattedCondition)) sb.append(" with hashAlgorithm ").append(formattedCondition); + } + + if (dataFrameCondition != null) { + if (dataFrameCondition) sb.append(" with dataFrame "); + } + + return sb.toString().trim(); } else { for (int i = 0; i < nestedRules.size(); i++) { sb.append("(").append(nestedRules.get(i).toString()).append(")"); diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 4728ff3..7a7061e 100644 
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -32,6 +32,7 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; + private final boolean isHashAlgoSupported; private final boolean isExcludedAtRowLevelInCompositeRules; private final boolean isWhereClauseSupported; private final boolean isAnalyzerOnly; @@ -45,6 +46,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, + @JsonProperty(value = "is_hash_algo_supported") boolean isHashAlgoSupported, @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") boolean isExcludedAtRowLevelInCompositeRules, @JsonProperty(value = "is_where_clause_supported") @@ -57,6 +59,7 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; + this.isHashAlgoSupported = isHashAlgoSupported; this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules; this.isWhereClauseSupported = isWhereClauseSupported; this.isAnalyzerOnly = isAnalyzerOnly; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 70e580b..890e5fd 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -52,6 +52,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ 
-273,28 +274,75 @@ private Either getDQRule( } } - Condition thresholdCondition = null; - if (dqRuleContext.withThresholdCondition() != null) { - if (dqRuleType.isThresholdSupported()) { - DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = - dqRuleContext.withThresholdCondition().numberBasedCondition(); + Condition thresholdCondition = null, hashAlgoCondition = null; + Boolean dataFrameCondition = null; + List conditionContexts = + dqRuleContext.withCondition() != null + ? dqRuleContext.withCondition().stream().filter(Objects::nonNull).collect(Collectors.toList()) + : Collections.emptyList(); - if (ctx == null) { - return Either.fromLeft( - String.format("Empty threshold condition provided for rule type: %s", ruleType)); - } else { - Optional possibleCond = - parseNumberBasedCondition(dqRuleContext.withThresholdCondition().numberBasedCondition()); - if (possibleCond.isPresent()) { - thresholdCondition = possibleCond.get(); + for (DataQualityDefinitionLanguageParser.WithConditionContext conditionContext : conditionContexts) { + if (conditionContext.withThresholdCondition() != null) { + if (dqRuleType.isThresholdSupported()) { + DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = + conditionContext.withThresholdCondition().numberBasedCondition(); + + if (ctx == null) { + return Either.fromLeft( + String.format("Empty threshold condition provided for rule type: %s", ruleType)); } else { + Optional possibleCond = + parseNumberBasedCondition(conditionContext + .withThresholdCondition().numberBasedCondition()); + if (possibleCond.isPresent()) { + thresholdCondition = possibleCond.get(); + } else { + return Either.fromLeft( + String.format("Unable to parse threshold condition " + + "provided for rule type: %s", ruleType)); + } + } + + } else { + return Either.fromLeft(String.format("Threshold condition not supported " + + "for rule type: %s", ruleType)); + } + } + + if (conditionContext.withHashAlgorithmCondition() != null) { + if 
(dqRuleType.isHashAlgoSupported()) { + DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx = + conditionContext.withHashAlgorithmCondition().stringBasedCondition(); + + if (ctx == null) { return Either.fromLeft( - String.format("Unable to parse threshold condition provided for rule type: %s", ruleType)); + String.format("Empty algorithm condition provided for rule type: %s", ruleType)); + } else { + Optional possibleCond = + parseStringBasedCondition(conditionContext + .withHashAlgorithmCondition().stringBasedCondition()); + if (possibleCond.isPresent()) { + hashAlgoCondition = possibleCond.get(); + } else { + return Either.fromLeft( + String.format("Unable to parse algorithm condition provided for rule type: %s", + ruleType)); + } } + } else { + return Either.fromLeft(String.format("Algorithm condition " + + "not supported for rule type: %s", ruleType)); } + } - } else { - return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); + if (conditionContext.withDataFrameCondition() != null) { + //all hashAlgo rules also support inferring from dataframes + if (dqRuleType.isHashAlgoSupported()) { + dataFrameCondition = true; + } else { + return Either.fromLeft(String.format("DataFrame condition not supported " + + "for rule type: %s", ruleType)); + } } } @@ -315,7 +363,8 @@ private Either getDQRule( return Either.fromRight( DQRule.createFromParameterValueMap( - dqRuleType, parameterMap, condition, thresholdCondition, whereClause) + dqRuleType, parameterMap, condition, thresholdCondition, hashAlgoCondition, + whereClause, dataFrameCondition) ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index e542a8f..4d5bd4f 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -230,44 +230,69 @@ private static Stream 
provideRawRules() { Arguments.of("DetectAnomalies of RowCount"), Arguments.of("DetectAnomalies of Completeness of \"colA\""), Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\""), - Arguments.of("FileMatch \"MD5\" \"S3://PATH\" in [\"hashList\"]"), - Arguments.of("FileMatch \"SHA\" in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9") ); } @Test - void test_fileBasedRulesParsing() { + void test_checksumRuleParsing() throws Exception { String fileRules = "Rules = [ " + - "FileMatch \"MD5\" \"s3://sampom-bucket/\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\", \"test\"], " + - "FileMatch \"SHA256\" in [\"68e656b251e67e8358bef8483ab0d51c6619f3e7a1a9f0e75838d41ff368f728\"], " + - "FileMatch \"S3://PATH1\" \"S3://PATH2\"," + - "FileUniqueness \"S3://PATH1\" >= 0.9" + + "FileMatch in [\"exampleHash\"] with hashAlgorithm = \"MD5\" with dataFrame ," + + "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with hashAlgorithm = \"SHA-256\" ," + + "FileMatch \"s3://sampom-bucket3/\" in [\"exampleHash3\"] ," + + "FileMatch in [\"exampleHash4\"] with dataFrame" + "]"; - try { - DQRuleset dqRuleset = parser.parse(fileRules); - List ruleList = dqRuleset.getRules(); - assertEquals(4, ruleList.size()); - - DQRule rule0 = ruleList.get(0); - assertEquals("FileMatch", rule0.getRuleType()); - assertEquals(2, rule0.getParameters().size()); - - DQRule rule1 = ruleList.get(1); - assertEquals("FileMatch", rule1.getRuleType()); - assertEquals(1, rule1.getParameters().size()); - - DQRule rule2 = ruleList.get(2); - assertEquals("FileMatch", 
rule2.getRuleType()); - assertEquals(2, rule2.getParameters().size()); - - DQRule rule3 = ruleList.get(3); - assertEquals("FileUniqueness", rule3.getRuleType()); - assertEquals(1, rule3.getParameters().size()); - } catch (Exception e) { - fail(e.getMessage()); - } + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + + DQRule rule0 = ruleList.get(0); + assertEquals("FileMatch", rule0.getRuleType()); + assertEquals("exampleHash", ((StringBasedCondition) rule0.getCondition()).getOperands().get(0).getOperand()); + assertEquals("MD5", ((StringBasedCondition) rule0.getHashAlgoCondition()).getOperands().get(0).getOperand()); + assertEquals(true, rule0.getDataFrameCondition()); + + DQRule rule1 = ruleList.get(1); + assertEquals("FileMatch", rule1.getRuleType()); + assertEquals("s3://sampom-bucket2/", rule1.getParameters().get("DataPath")); + assertEquals("exampleHash2", ((StringBasedCondition) rule1.getCondition()).getOperands().get(0).getOperand()); + assertEquals("SHA-256", ((StringBasedCondition) rule1.getHashAlgoCondition()).getOperands().get(0).getOperand()); + assertEquals(null, rule1.getDataFrameCondition()); + + DQRule rule2 = ruleList.get(2); + assertEquals("FileMatch", rule2.getRuleType()); + assertEquals("s3://sampom-bucket3/", rule2.getParameters().get("DataPath")); + assertEquals("exampleHash3", ((StringBasedCondition) rule2.getCondition()).getOperands().get(0).getOperand()); + assertEquals(null, rule2.getDataFrameCondition()); + + DQRule rule3 = ruleList.get(3); + assertEquals("FileMatch", rule3.getRuleType()); + assertEquals("exampleHash4", ((StringBasedCondition) rule3.getCondition()).getOperands().get(0).getOperand()); + assertEquals(true, rule3.getDataFrameCondition()); + } + + @Test + void test_fileMatchRuleParsing() throws Exception { + String fileRules = "Rules = [ " + + "FileMatch \"s3://sampom-bucket1/\" \"s3://sampom-bucket2/\"," + + "FileMatch \"s3://sampom-bucket1/file1.json\" 
\"s3://sampom-bucket2/file2.json\"" + + "]"; + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + + DQRule rule0 = ruleList.get(0); + assertEquals("FileMatch", rule0.getRuleType()); + assertEquals("s3://sampom-bucket1/", rule0.getParameters().get("DataPath")); + assertEquals("s3://sampom-bucket2/", rule0.getParameters().get("CompareDataPath")); + + DQRule rule1 = ruleList.get(1); + assertEquals("FileMatch", rule1.getRuleType()); + assertEquals("s3://sampom-bucket1/file1.json", rule1.getParameters().get("DataPath")); + assertEquals("s3://sampom-bucket2/file2.json", rule1.getParameters().get("CompareDataPath")); } @Test diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index b7f4956..0fb1883 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -24,84 +24,86 @@ public class InvalidDQRulesetTest { private static Stream provideInvalidRulesets() { return Stream.of( - Arguments.of(""), - Arguments.of("Metadata = {}"), - Arguments.of("DataSources = {}"), - Arguments.of("Metadata = { \"Version\": \"1.0\" }"), - Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = {}"), - Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = { \"Primary\": \"Foo\" }"), - Arguments.of("Rules = {"), - Arguments.of("Rules = }"), - Arguments.of("Rules = { }"), - Arguments.of("Rules = [ ]"), - Arguments.of("Rules = ["), - Arguments.of("Rules = ]"), - Arguments.of("Rules = Abcdefg123"), - Arguments.of("Rules11 = [ ColumnValues \"load_dt\" > (now() - 1) ]"), - Arguments.of("Rules = [ 11ColumnValues \"load_dt\" > (now() - 1) ]"), - Arguments.of("Rules = [ ColumnValues \"load_dt\" \"load_dt_2\" > (now() - 1) ]"), - Arguments.of("Rules = [ Completeness \"col-A\" ]"), - Arguments.of("Rules = { Completeness
\"col-A\" }"), - Arguments.of("Rules = [ ColumnNamesMatchPattern aws_* ]"), - Arguments.of("Rules = [ ColumnNamesMatchPattern \"aws_*\" where \"aws_id > 100\"]"), - Arguments.of("Rules = [ IsComplete \"col-A\" > 0.05 ]"), - Arguments.of("Rules = [ IsUnique \"col-A\" <= 1.5 ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\" between 1 and 2 ]"), - Arguments.of("Rules = [ ColumnDataType \"col-A\" ]"), - Arguments.of("Rules = [ ColumnDataType \"col-A\" with threshold > 0.7 ]"), - Arguments.of("Rules = [ ColumnDataType \"col-A\" \"col-B\" ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" matches ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" now() ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > now() + 1 hours ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" = (now() - 3 weeks) ]"), - Arguments.of("Rules = [ Completeness \"col-A\" > 0.4 with threshold > 0.4]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 threshold]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with threshold]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" in [1,\"2\"] ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" <= 3 ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" > 30 ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), - Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), - Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), - Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference.col-A\" = 0.99 where \"col-A > 100\"]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" = 0.99 ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 with threshold > 0.9]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" 
\"colA\" > 0.9 where \"ID > 100\"]"), - Arguments.of("Rules = [ SchemaMatch with threshold between 0.2 and 0.4 ]"), - Arguments.of("Rules = [ SchemaMatch \"ref-1\" between 0.2 and 0.4 with threshold > 0.5 ]"), - Arguments.of("Rules = [ SchemaMatch \"ref-1\" \"ref-2\" ]"), - Arguments.of("Rules = [ RowCountMatch > 0.1 ]"), - Arguments.of("Rules = [ RowCountMatch \"reference-1\" \"col-1\" > 0.1 ]"), - Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), - Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 where \"id > 100\"]"), - Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), - Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), - Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), - Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.8 where \"col-A > 100\"]"), - Arguments.of("Rules = [ DetectAnomalies ]"), - Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), - Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), - Arguments.of("Rules = [ FileMatch ]"), - Arguments.of("Rules = [ FileMatch in [] ]"), - Arguments.of("Rules = [ FileMatch SHA SHA SHA ]"), - Arguments.of("Rules = [ FileMatch SHA SHA SHA in [] ]"), - Arguments.of("Rules = [ FileMatch s3Path ]"), - Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), - Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]") + Arguments.of(""), + Arguments.of("Metadata = {}"), + Arguments.of("DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" }"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = { \"Primary\": \"Foo\" }"), + Arguments.of("Rules = {"), + Arguments.of("Rules = }"), + Arguments.of("Rules = { }"), + Arguments.of("Rules = [ ]"), + Arguments.of("Rules = ["), + Arguments.of("Rules = ]"), + Arguments.of("Rules = Abcdefg123"), + Arguments.of("Rules11 = [ 
ColumnValues \"load_dt\" > (now() - 1) ]"), + Arguments.of("Rules = [ 11ColumnValues \"load_dt\" > (now() - 1) ]"), + Arguments.of("Rules = [ ColumnValues \"load_dt\" \"load_dt_2\" > (now() - 1) ]"), + Arguments.of("Rules = [ Completeness \"col-A\" ]"), + Arguments.of("Rules = { Completeness \"col-A\" }"), + Arguments.of("Rules = [ ColumnNamesMatchPattern aws_* ]"), + Arguments.of("Rules = [ ColumnNamesMatchPattern \"aws_*\" where \"aws_id > 100\"]"), + Arguments.of("Rules = [ IsComplete \"col-A\" > 0.05 ]"), + Arguments.of("Rules = [ IsUnique \"col-A\" <= 1.5 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\" between 1 and 2 ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" with threshold > 0.7 ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" \"col-B\" ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" matches ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" now() ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > now() + 1 hours ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" = (now() - 3 weeks) ]"), + Arguments.of("Rules = [ Completeness \"col-A\" > 0.4 with threshold > 0.4]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 threshold]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with threshold]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" in [1,\"2\"] ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" <= 3 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference.col-A\" = 0.99 where \"col-A > 100\"]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" = 0.99 ]"), + 
Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" ]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" ]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 with threshold > 0.9]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 where \"ID > 100\"]"), + Arguments.of("Rules = [ SchemaMatch with threshold between 0.2 and 0.4 ]"), + Arguments.of("Rules = [ SchemaMatch \"ref-1\" between 0.2 and 0.4 with threshold > 0.5 ]"), + Arguments.of("Rules = [ SchemaMatch \"ref-1\" \"ref-2\" ]"), + Arguments.of("Rules = [ RowCountMatch > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" \"col-1\" > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 where \"id > 100\"]"), + Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.8 where \"col-A > 100\"]"), + Arguments.of("Rules = [ DetectAnomalies ]"), + Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), + Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), + Arguments.of("Rules = [ FileMatch ]"), + Arguments.of("Rules = [ FileMatch in [] ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA in [] ]"), + Arguments.of("Rules = [ FileMatch s3Path ]"), + Arguments.of("Rules = [ FileMatch s3Path with noHashAlgorithm ]"), + Arguments.of("FileMatch \"S3://PATH\" \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), + Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), + Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]") ); } private static Stream provideInvalidRulesetsWithAnalyzers() { return 
Stream.of( - Arguments.of("Rules = [ ] Analyzers = [ ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ IsComplete \"colA\" ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Foo ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\" > 1.0 ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") + Arguments.of("Rules = [ ] Analyzers = [ ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ IsComplete \"colA\" ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Foo ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\" > 1.0 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") ); } From 4ba839ad609dcb69a886be14d22b9c93186deb9a Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Tue, 24 Sep 2024 17:36:30 -0400 Subject: [PATCH 34/50] Fix JS Parser - support List Condition --- configuration/dqdl/DataQualityDefinitionLanguage.g4 | 2 +- configuration/rules/rules-config.json | 2 +- .../glue/ml/dataquality/dqdl/parser/DQDLParserListener.java | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 0b94edc..88b0142 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -1,4 +1,4 @@ -grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL" +grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL" import CommonLexerRules; // Sections diff --git 
a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 3d8146f..0d0f548 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -479,7 +479,7 @@ { "type": "String", "name": "DataPath", - "description": "Data path for FileUniqueness." + "description": "Data Path for FileUniqueness." } ], "return_type": "NUMBER", diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 890e5fd..2d5bfb1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -281,6 +281,7 @@ private Either getDQRule( ? dqRuleContext.withCondition().stream().filter(Objects::nonNull).collect(Collectors.toList()) : Collections.emptyList(); + //TODO - reduce complexity for (DataQualityDefinitionLanguageParser.WithConditionContext conditionContext : conditionContexts) { if (conditionContext.withThresholdCondition() != null) { if (dqRuleType.isThresholdSupported()) { From eab76c2040e8f0a3c61fd3f97949774eb2c6d33c Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Tue, 8 Oct 2024 14:51:04 -0400 Subject: [PATCH 35/50] Introduces DQDL Tags. Refactor TDQ to use Tags. Refactor Parsing. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 7 +- .../ml/dataquality/dqdl/model/DQRule.java | 26 ++-- .../dqdl/parser/DQDLParserListener.java | 130 ++++++++---------- .../ml/dataquality/dqdl/model/DQRuleTest.java | 21 ++- 4 files changed, 81 insertions(+), 103 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 88b0142..85d7752 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -66,6 +66,7 @@ stringBasedCondition: NEGATION? EQUAL_TO stringValues | NOT? 
IN stringValuesArray | NOT? matchesRegexCondition; +tagValues: quotedString | IDENTIFIER; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; dateBasedCondition: @@ -93,6 +94,7 @@ parameter: QUOTED_STRING | IDENTIFIER; connectorWord: OF | AND; parameterWithConnectorWord: connectorWord? parameter; +tagWithCondition: 'with' tagValues stringBasedCondition; condition: numberBasedCondition @@ -101,13 +103,10 @@ condition: | durationBasedCondition; withThresholdCondition: 'with' 'threshold' numberBasedCondition; -withHashAlgorithmCondition: 'with' 'hashAlgorithm' stringBasedCondition; -withDataFrameCondition: 'with' 'dataFrame'; -withCondition: withThresholdCondition | withHashAlgorithmCondition | withDataFrameCondition; whereClause: 'where' quotedString; -dqRule: ruleType parameterWithConnectorWord* condition? whereClause? withCondition? withCondition?; +dqRule: ruleType parameterWithConnectorWord* condition? whereClause? withThresholdCondition? tagWithCondition*; dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index bebfdf3..18a995d 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -35,12 +35,11 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final Map parameterValueMap; private final Condition condition; private final Condition thresholdCondition; - private Condition hashAlgoCondition; private final DQRuleLogicalOperator operator; private final List nestedRules; private final String whereClause; private Boolean isExcludedAtRowLevelInCompositeRules = false; - private Boolean dataFrameCondition; + private Map tags; // Adding this constructor so as to not break the Data Quality ETL package. 
public DQRule(final String ruleType, @@ -93,7 +92,7 @@ public DQRule(final String ruleType, public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition) { - return createFromParameterValueMap(ruleType, parameters, condition, null, null, null, null); + return createFromParameterValueMap(ruleType, parameters, condition, null, null, null); } public DQRule(final String ruleType, @@ -115,9 +114,8 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition, final Condition thresholdCondition, - final Condition hashAlgoCondition, final String whereClause, - final Boolean dataFrameCondition) { + final Map tags) { DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); @@ -127,12 +125,11 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, parameters, condition, thresholdCondition, - hashAlgoCondition, operator, nestedRules, whereClause, ruleType.isExcludedAtRowLevelInCompositeRules(), - dataFrameCondition + tags ); } @@ -170,14 +167,17 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } - if (hashAlgoCondition != null) { - String formattedCondition = hashAlgoCondition.getFormattedCondition(); - if (!isBlank(formattedCondition)) sb.append(" with hashAlgorithm ").append(formattedCondition); + if (tags != null && !tags.isEmpty()) { + sb.append(" "); + for (Map.Entry entry : tags.entrySet()) { + sb.append("with \"") + .append(entry.getKey()) + .append("\" = \"") + .append(entry.getValue()) + .append("\" "); + } } - if (dataFrameCondition != null) { - if (dataFrameCondition) sb.append(" with dataFrame "); - } return sb.toString().trim(); } else { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 2d5bfb1..7240ec7 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -10,10 +10,14 @@ package com.amazonaws.glue.ml.dataquality.dqdl.parser; +import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; +import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQAnalyzer; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleParameterValue; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedConditionOperator; @@ -30,17 +34,14 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; -import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; -import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; import 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; -import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; -import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; +import org.antlr.v4.runtime.tree.ParseTree; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; @@ -52,7 +53,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -274,76 +274,29 @@ private Either getDQRule( } } - Condition thresholdCondition = null, hashAlgoCondition = null; - Boolean dataFrameCondition = null; - List conditionContexts = - dqRuleContext.withCondition() != null - ? 
dqRuleContext.withCondition().stream().filter(Objects::nonNull).collect(Collectors.toList()) - : Collections.emptyList(); - - //TODO - reduce complexity - for (DataQualityDefinitionLanguageParser.WithConditionContext conditionContext : conditionContexts) { - if (conditionContext.withThresholdCondition() != null) { - if (dqRuleType.isThresholdSupported()) { - DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = - conditionContext.withThresholdCondition().numberBasedCondition(); - - if (ctx == null) { - return Either.fromLeft( - String.format("Empty threshold condition provided for rule type: %s", ruleType)); - } else { - Optional possibleCond = - parseNumberBasedCondition(conditionContext - .withThresholdCondition().numberBasedCondition()); - if (possibleCond.isPresent()) { - thresholdCondition = possibleCond.get(); - } else { - return Either.fromLeft( - String.format("Unable to parse threshold condition " + - "provided for rule type: %s", ruleType)); - } - } + Condition thresholdCondition = null; + if (dqRuleContext.withThresholdCondition() != null) { + if (dqRuleType.isThresholdSupported()) { + DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = + dqRuleContext.withThresholdCondition().numberBasedCondition(); + if (ctx == null) { + return Either.fromLeft( + String.format("Empty threshold condition provided for rule type: %s", ruleType)); } else { - return Either.fromLeft(String.format("Threshold condition not supported " + - "for rule type: %s", ruleType)); - } - } - - if (conditionContext.withHashAlgorithmCondition() != null) { - if (dqRuleType.isHashAlgoSupported()) { - DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx = - conditionContext.withHashAlgorithmCondition().stringBasedCondition(); - - if (ctx == null) { - return Either.fromLeft( - String.format("Empty algorithm condition provided for rule type: %s", ruleType)); + Optional possibleCond = + 
parseNumberBasedCondition(dqRuleContext.withThresholdCondition().numberBasedCondition()); + if (possibleCond.isPresent()) { + thresholdCondition = possibleCond.get(); } else { - Optional possibleCond = - parseStringBasedCondition(conditionContext - .withHashAlgorithmCondition().stringBasedCondition()); - if (possibleCond.isPresent()) { - hashAlgoCondition = possibleCond.get(); - } else { - return Either.fromLeft( - String.format("Unable to parse algorithm condition provided for rule type: %s", - ruleType)); - } + return Either.fromLeft( + String.format("Unable to parse threshold condition provided for rule type: %s", + ruleType)); } - } else { - return Either.fromLeft(String.format("Algorithm condition " + - "not supported for rule type: %s", ruleType)); } - } - if (conditionContext.withDataFrameCondition() != null) { - //all hashAlgo rules also support inferring from dataframes - if (dqRuleType.isHashAlgoSupported()) { - dataFrameCondition = true; - } else { - return Either.fromLeft(String.format("DataFrame condition not supported " + - "for rule type: %s", ruleType)); - } + } else { + return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); } } @@ -362,13 +315,42 @@ private Either getDQRule( } } + Map tags = new HashMap<>(); + List tagContexts = + dqRuleContext.tagWithCondition(); + if (tagContexts != null && !tagContexts.isEmpty()) { + for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) { + if (!isTagValid(tagContext.stringBasedCondition())) { + return Either.fromLeft("Only EQUAL_TO condition is supported for tags."); + } + String tagKey = getKeyFromTag(tagContext.tagValues()); + Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition()); + if (valueCondition.isPresent()) { + StringBasedCondition stringCondition = (StringBasedCondition) valueCondition.get(); + String tagValue = stringCondition.getOperands().get(0).getOperand(); + tags.put(tagKey, 
tagValue); + } else { + return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey)); + } + } + } + return Either.fromRight( - DQRule.createFromParameterValueMap( - dqRuleType, parameterMap, condition, thresholdCondition, hashAlgoCondition, - whereClause, dataFrameCondition) + DQRule.createFromParameterValueMap( + dqRuleType, parameterMap, condition, thresholdCondition, whereClause, tags) ); } + private boolean isTagValid(DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx) { + return ctx.EQUAL_TO() != null; + } + + private String getKeyFromTag(DataQualityDefinitionLanguageParser.TagValuesContext tagValuesContext) { + Optional identifierKey = Optional.ofNullable(tagValuesContext.IDENTIFIER()).map(ParseTree::getText); + Optional stringKey = Optional.ofNullable(tagValuesContext.quotedString()).map(ParseTree::getText); + return removeQuotes(identifierKey.orElseGet(stringKey::get)); + } + private Either getDQAnalyzer( DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { String analyzerType = dqAnalyzerContext.analyzerType().getText(); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 4d5bd4f..5be8d35 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -42,9 +42,7 @@ import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotSame; -import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static 
org.junit.jupiter.api.Assertions.fail; @@ -233,7 +231,9 @@ private static Stream provideRawRules() { Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\"]"), Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"]"), Arguments.of("FileMatch in [\"hashList\",\"hashList\"]"), - Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with \"hashAlgorithm\" = \"MD5\""), + Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with \"randomTagThing\" = \"@sampom\""), + Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with \"tag1\" = \"sampom\" with \"tag2\" = \"pomsam\""), Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9") ); @@ -242,10 +242,10 @@ private static Stream provideRawRules() { @Test void test_checksumRuleParsing() throws Exception { String fileRules = "Rules = [ " + - "FileMatch in [\"exampleHash\"] with hashAlgorithm = \"MD5\" with dataFrame ," + - "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with hashAlgorithm = \"SHA-256\" ," + + "FileMatch in [\"exampleHash\"] with \"hashAlgorithm\" = \"MD5\" with \"dataFrame\" = \"true\" ," + + "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with \"hashAlgorithm\" = \"SHA-256\" ," + "FileMatch \"s3://sampom-bucket3/\" in [\"exampleHash3\"] ," + - "FileMatch in [\"exampleHash4\"] with dataFrame" + + "FileMatch in [\"exampleHash4\"] with \"dataFrame\" = \"true\"" + "]"; DQRuleset dqRuleset = parser.parse(fileRules); List ruleList = dqRuleset.getRules(); @@ -253,26 +253,23 @@ void test_checksumRuleParsing() throws Exception { DQRule rule0 = ruleList.get(0); assertEquals("FileMatch", rule0.getRuleType()); assertEquals("exampleHash", ((StringBasedCondition) rule0.getCondition()).getOperands().get(0).getOperand()); - assertEquals("MD5", ((StringBasedCondition) 
rule0.getHashAlgoCondition()).getOperands().get(0).getOperand()); - assertEquals(true, rule0.getDataFrameCondition()); + assertEquals("MD5", rule0.getTags().get("hashAlgorithm")); + assertEquals("true", rule0.getTags().get("dataFrame")); DQRule rule1 = ruleList.get(1); assertEquals("FileMatch", rule1.getRuleType()); assertEquals("s3://sampom-bucket2/", rule1.getParameters().get("DataPath")); assertEquals("exampleHash2", ((StringBasedCondition) rule1.getCondition()).getOperands().get(0).getOperand()); - assertEquals("SHA-256", ((StringBasedCondition) rule1.getHashAlgoCondition()).getOperands().get(0).getOperand()); - assertEquals(null, rule1.getDataFrameCondition()); + assertEquals("SHA-256", rule1.getTags().get("hashAlgorithm")); DQRule rule2 = ruleList.get(2); assertEquals("FileMatch", rule2.getRuleType()); assertEquals("s3://sampom-bucket3/", rule2.getParameters().get("DataPath")); assertEquals("exampleHash3", ((StringBasedCondition) rule2.getCondition()).getOperands().get(0).getOperand()); - assertEquals(null, rule2.getDataFrameCondition()); DQRule rule3 = ruleList.get(3); assertEquals("FileMatch", rule3.getRuleType()); assertEquals("exampleHash4", ((StringBasedCondition) rule3.getCondition()).getOperands().get(0).getOperand()); - assertEquals(true, rule3.getDataFrameCondition()); } @Test From b5384ef3aa76dcce633c3718c8daf4359f5d2ab2 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Wed, 9 Oct 2024 11:44:45 -0400 Subject: [PATCH 36/50] Introduces FileFreshness & Parsing --- configuration/rules/rules-config.json | 30 +++++++++-- .../ml/dataquality/dqdl/model/DQRuleType.java | 3 -- .../ml/dataquality/dqdl/model/DQRuleTest.java | 52 ++++++++++++++++++- .../dqdl/parser/InvalidDQRulesetTest.java | 6 ++- 4 files changed, 82 insertions(+), 9 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 0d0f548..f46f5ee 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ 
-440,7 +440,6 @@ "return_type": "BOOLEAN", "is_threshold_supported": false, "is_where_clause_supported": false, - "is_hash_algo_supported": true, "scope": "file", "experimental": true }, @@ -457,7 +456,6 @@ "return_type": "STRING", "is_threshold_supported": false, "is_where_clause_supported": false, - "is_hash_algo_supported": true, "scope": "file", "experimental": true }, @@ -468,7 +466,6 @@ "return_type": "STRING", "is_threshold_supported": false, "is_where_clause_supported": false, - "is_hash_algo_supported": true, "scope": "file", "experimental": true }, @@ -485,7 +482,32 @@ "return_type": "NUMBER", "is_threshold_supported": false, "is_where_clause_supported": false, - "is_hash_algo_supported": true, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileFreshness", + "description": "Checks the age of a filepath against a specified date.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data Path for FileFreshness." + } + ], + "return_type": "DATE", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileFreshness", + "description": "Checks the age of an inferred file against a specified date.", + "parameters": [], + "return_type": "DATE", + "is_threshold_supported": true, + "is_where_clause_supported": false, "scope": "file", "experimental": true } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 7a7061e..4728ff3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -32,7 +32,6 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; - private final boolean isHashAlgoSupported; private final boolean isExcludedAtRowLevelInCompositeRules; 
private final boolean isWhereClauseSupported; private final boolean isAnalyzerOnly; @@ -46,7 +45,6 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "return_type") String returnType, // boolean defaults to false if not present @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, - @JsonProperty(value = "is_hash_algo_supported") boolean isHashAlgoSupported, @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") boolean isExcludedAtRowLevelInCompositeRules, @JsonProperty(value = "is_where_clause_supported") @@ -59,7 +57,6 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; - this.isHashAlgoSupported = isHashAlgoSupported; this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules; this.isWhereClauseSupported = isWhereClauseSupported; this.isAnalyzerOnly = isAnalyzerOnly; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 5be8d35..352d1bc 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -235,10 +235,52 @@ private static Stream provideRawRules() { Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with \"randomTagThing\" = \"@sampom\""), Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with \"tag1\" = \"sampom\" with \"tag2\" = \"pomsam\""), Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), - Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9") + Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9"), + Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""), + Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)") ); } + @Test + void test_fileFileFreshnessParsing() throws 
Exception { + String fileRules = "Rules = [ " + + "FileFreshness \"S3://path\" between \"2023-02-07\" and \"2024-07-15\", " + + "FileFreshness \"S3://path\" > (now() - 3 days), " + + "FileFreshness \"S3://path\" < (now() - 4 days), " + + "FileFreshness between \"2023-02-07\" and \"2024-07-15\" " + + "]"; + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + DQRule rule0 = ruleList.get(0); + + DateBasedCondition c0 = (DateBasedCondition) rule0.getCondition(); + assertEquals("FileFreshness", rule0.getRuleType()); + assertEquals("S3://path", rule0.getParameters().get("DataPath")); + assertEquals("2023-02-07", removeQuotes(c0.getOperands().get(0).getFormattedExpression())); + assertEquals("2024-07-15", removeQuotes(c0.getOperands().get(1).getFormattedExpression())); + + DQRule rule1 = ruleList.get(1); + DateBasedCondition c1 = (DateBasedCondition) rule1.getCondition(); + assertEquals("FileFreshness", rule1.getRuleType()); + assertEquals("S3://path", rule1.getParameters().get("DataPath")); + assertEquals("GREATER_THAN", c1.getOperator().toString()); + assertEquals("(now() - 3 days)", c1.getOperands().get(0).getFormattedExpression()); + + DQRule rule2 = ruleList.get(2); + DateBasedCondition c2 = (DateBasedCondition) rule2.getCondition(); + assertEquals("FileFreshness", rule2.getRuleType()); + assertEquals("S3://path", rule2.getParameters().get("DataPath")); + assertEquals("LESS_THAN", c2.getOperator().toString()); + assertEquals("(now() - 4 days)", c2.getOperands().get(0).getFormattedExpression()); + + DQRule rule3 = ruleList.get(3); + DateBasedCondition c3 = (DateBasedCondition) rule3.getCondition(); + assertEquals("FileFreshness", rule3.getRuleType()); + assertFalse(rule3.getParameters().containsKey("DataPath")); + assertEquals("2023-02-07", removeQuotes(c3.getOperands().get(0).getFormattedExpression())); + assertEquals("2024-07-15", removeQuotes(c3.getOperands().get(1).getFormattedExpression())); + } + @Test void 
test_checksumRuleParsing() throws Exception { String fileRules = "Rules = [ " + @@ -705,4 +747,12 @@ private T deserialize(byte[] b, Class cls) throws IO Object o = objectStream.readObject(); return cls.cast(o); } + + private String removeQuotes(String quotedString) { + if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { + quotedString = quotedString.substring(1); + quotedString = quotedString.substring(0, quotedString.length() - 1); + } + return quotedString; + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 0fb1883..357d1ba 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -92,7 +92,11 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileMatch s3Path with noHashAlgorithm ]"), Arguments.of("FileMatch \"S3://PATH\" \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), - Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]") + Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]"), + Arguments.of("FileFreshness between \"2024-07-15\""), + Arguments.of("FileFreshness \"S3://PATH\" between and \"2024-07-15\""), + Arguments.of("FileFreshness \"S3://PATH\" \"S3://PATH\""), + Arguments.of("FileFreshness > (now() 3 days)") ); } From 0ad9af8467cb4fb7bad84e0858566cb167c77c2c Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Thu, 10 Oct 2024 18:11:04 -0400 Subject: [PATCH 37/50] [TDQ] FileFreshness Rule Translation --- .../dataquality/dqdl/model/condition/date/DateExpression.java | 4 +++- .../dqdl/model/condition/date/DateExpressionTest.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java index 1353ffb..57ea08e 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java @@ -14,6 +14,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; +import lombok.Getter; import java.io.Serializable; import java.time.LocalDateTime; @@ -55,6 +56,7 @@ public enum DateExpressionOperator { PLUS } + @Getter @AllArgsConstructor public static class CurrentDateExpression extends DateExpression { private final DateExpressionOperator operator; @@ -78,7 +80,7 @@ public LocalDateTime getEvaluatedExpression() { ? duration.getAmount() * 24 : duration.getAmount(); - LocalDateTime dt = LocalDateTime.now(); + LocalDateTime dt = LocalDateTime.now().withMinute(0).withSecond(0).withNano(0); switch (operator) { case MINUS: return dt.minusHours(hours); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java index 71e6c30..432d818 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java @@ -79,7 +79,7 @@ public void test_currentDateExpressionEvaluatedExpressionForHours() { DateExpression.DateExpressionOperator operator = DateExpression.DateExpressionOperator.PLUS; - LocalDateTime currentDate = LocalDateTime.now(); + LocalDateTime currentDate = LocalDateTime.now().withMinute(0).withSecond(0).withNano(0); DateExpression.CurrentDateExpression currentDateExpression = new DateExpression.CurrentDateExpression(operator, duration); From 22dee2ab558beefedd4132610097238d108912bb Mon Sep 17 00:00:00 2001 From: Sam 
Pomerantz Date: Wed, 16 Oct 2024 15:29:52 -0400 Subject: [PATCH 38/50] [TDQ] FileUniqueness Rule Translation and Execution --- configuration/rules/rules-config.json | 10 ++++++++++ .../glue/ml/dataquality/dqdl/model/DQRuleTest.java | 4 +++- .../dataquality/dqdl/parser/InvalidDQRulesetTest.java | 10 ++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index f46f5ee..2ee94e9 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -485,6 +485,16 @@ "scope": "file", "experimental": true }, + { + "rule_type_name": "FileUniqueness", + "description": "Checks the contents of an inferred folder and the uniqueness of each file within.", + "parameters": [], + "return_type": "NUMBER", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, { "rule_type_name": "FileFreshness", "description": "Checks the age of a filepath against a specified date.", diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 352d1bc..811fc99 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -237,7 +237,9 @@ private static Stream provideRawRules() { Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9"), Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""), - Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)") + Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)"), + Arguments.of("FileUniqueness \"S3://PATH\" > 0.9"), + Arguments.of("FileUniqueness > 0.5") ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 357d1ba..b93a2df 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -93,10 +93,12 @@ private static Stream provideInvalidRulesets() { Arguments.of("FileMatch \"S3://PATH\" \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]"), - Arguments.of("FileFreshness between \"2024-07-15\""), - Arguments.of("FileFreshness \"S3://PATH\" between and \"2024-07-15\""), - Arguments.of("FileFreshness \"S3://PATH\" \"S3://PATH\""), - Arguments.of("FileFreshness > (now() 3 days)") + Arguments.of("Rules = [ FileFreshness between \"2024-07-15\" ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" between and \"2024-07-15\" ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" \"S3://PATH\" ]"), + Arguments.of("Rules = [ FileFreshness > (now() 3 days) ]"), + Arguments.of("Rules = [ FileUniqueness \"PATH\" ]"), + Arguments.of("Rules = [ FileUniqueness ]") ); } From 39d9b176b6d2cfc835f701602b815fa1e0f85530 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Mon, 21 Oct 2024 11:26:22 -0400 Subject: [PATCH 39/50] [TDQ] - FileSize Spec & Parser Tests --- configuration/rules/rules-config.json | 26 +++++++++++++++++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 6 ++++- .../dqdl/parser/InvalidDQRulesetTest.java | 3 ++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 2ee94e9..69baf33 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -520,6 +520,32 @@ "is_where_clause_supported": false, "scope": "file", "experimental": true + }, + { + "rule_type_name": "FileSize", + "description": "Checks the size of 
a filepath.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data Path for FileSize." + } + ], + "return_type": "NUMBER", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileSize", + "description": "Checks the size of an inferred file.", + "parameters": [], + "return_type": "NUMBER", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true } ] } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 811fc99..8845da0 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -239,7 +239,11 @@ private static Stream provideRawRules() { Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""), Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)"), Arguments.of("FileUniqueness \"S3://PATH\" > 0.9"), - Arguments.of("FileUniqueness > 0.5") + Arguments.of("FileUniqueness > 0.5"), + Arguments.of("FileSize > 5 with \"unit\" = \"B\""), + Arguments.of("FileSize < 5 with \"unit\" = \"KB\""), + Arguments.of("FileSize = 5 with \"unit\" = \"MB\""), + Arguments.of("FileSize >= 5 with \"unit\" = \"GB\"") ); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index b93a2df..ff6243a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -98,7 +98,8 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileFreshness \"S3://PATH\" \"S3://PATH\" ]"), Arguments.of("Rules = [ FileFreshness > (now() 3 days) ]"), 
Arguments.of("Rules = [ FileUniqueness \"PATH\" ]"), - Arguments.of("Rules = [ FileUniqueness ]") + Arguments.of("Rules = [ FileUniqueness ]"), + Arguments.of("Rules = [ FileSize ]") ); } From aa075d398b237931c9f04b79a3b78dabf24110e1 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Mon, 26 Aug 2024 22:10:01 -0400 Subject: [PATCH 40/50] Updated parser logic to work with composite rule related grammar changes - The model already has full support for composite rules. - The parser was not utilizing the support completely. With the grammar now updated, the parser is updated as part of this CR to add full support for composite rules. - Added unit tests to verify. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 7 +- .../ml/dataquality/dqdl/model/DQRule.java | 57 ++++++++++-- .../dqdl/parser/DQDLParserListener.java | 67 +++++++++----- .../ml/dataquality/dqdl/model/DQRuleTest.java | 90 ++++++++++++++++++- .../dqdl/parser/InvalidDQRulesetTest.java | 4 +- 5 files changed, 191 insertions(+), 34 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 85d7752..1958d55 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -110,9 +110,10 @@ dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
withThresho dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: - dqRule - | '(' dqRule ')' (AND '(' dqRule ')')* - | '(' dqRule ')' (OR '(' dqRule ')')*; + LPAREN topLevelRule RPAREN + | topLevelRule AND topLevelRule + | topLevelRule OR topLevelRule + | dqRule; // Rules Definition dqRules: topLevelRule (COMMA topLevelRule)*; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 18a995d..12d76e0 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -159,7 +159,7 @@ public String toString() { // where clause syntax should go before threshold if (whereClause != null) { - if (!isBlank(whereClause)) sb.append(" where ").append("\"" + whereClause + "\""); + if (!isBlank(whereClause)) sb.append(" where ").append("\"").append(whereClause).append("\""); } if (thresholdCondition != null) { @@ -178,17 +178,62 @@ public String toString() { } } - return sb.toString().trim(); } else { - for (int i = 0; i < nestedRules.size(); i++) { - sb.append("(").append(nestedRules.get(i).toString()).append(")"); - if (i != nestedRules.size() - 1) { - sb.append(" ").append(operator.toString()).append(" "); + boolean canBeFlattened = usesSameOperator(operator); + + if (canBeFlattened) { + List flattenedListOfRules = getNestedRulesAsFlattenedList(); + for (int i = 0; i < flattenedListOfRules.size(); i++) { + sb.append("(").append(flattenedListOfRules.get(i).toString()).append(")"); + if (i != flattenedListOfRules.size() - 1) { + sb.append(" ").append(operator.toString()).append(" "); + } + } + } else { + for (int i = 0; i < nestedRules.size(); i++) { + sb.append("(").append(nestedRules.get(i).toString()).append(")"); + if (i != nestedRules.size() - 1) { + sb.append(" ").append(operator.toString()).append(" "); + } } } } return sb.toString(); } + + /* + * This function checks if the same operator is 
used across all the nested rules. + * Example: (RuleA) or (RuleB) or (RuleC) / (RuleA) and (RuleB) and (RuleC) + * + * If that is the case, in order to maintain backwards compatibility, we will update + * toString() method so that we do not add additional parentheses. + */ + private boolean usesSameOperator(DQRuleLogicalOperator op) { + if (nestedRules.isEmpty()) return true; + if (operator != op) return false; + + for (DQRule nestedRule : nestedRules) { + if (!nestedRule.usesSameOperator(op)) { + return false; + } + } + + return true; + } + + // Package private, in order to make it accessible to the tests + List getNestedRulesAsFlattenedList() { + List ret = new ArrayList<>(); + if (nestedRules.isEmpty()) { + ret.add(this); + } else { + for (DQRule nestedRule: nestedRules) { + List nestedRet = nestedRule.getNestedRulesAsFlattenedList(); + ret.addAll(nestedRet); + } + } + return ret; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 7240ec7..79561b4 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -74,6 +74,8 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private static final String ADDITIONAL_SOURCES_KEY = "AdditionalDataSources"; private static final Set ALLOWED_SOURCES_KEYS; + private static final int COMPOSITE_RULE_MAX_NESTING_DEPTH = 5; + static { ALLOWED_METADATA_KEYS = new HashSet<>(); ALLOWED_METADATA_KEYS.add(METADATA_VERSION_KEY); @@ -174,35 +176,54 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu return; } - for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc - : dqRulesContext.topLevelRule()) { - if (tlc.AND().size() > 0 || tlc.OR().size() > 0) { - DQRuleLogicalOperator op = tlc.AND().size() > 0 ? 
DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR; - List nestedRules = new ArrayList<>(); + for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc: dqRulesContext.topLevelRule()) { + Either dqRuleEither = parseTopLevelRule(tlc, 0); + if (dqRuleEither.isLeft()) { + errorMessages.add(dqRuleEither.getLeft()); + return; + } else { + dqRules.add(dqRuleEither.getRight()); + } + } + } - for (DataQualityDefinitionLanguageParser.DqRuleContext rc : tlc.dqRule()) { - Either dqRuleEither = getDQRule(rc); - if (dqRuleEither.isLeft()) { - errorMessages.add(dqRuleEither.getLeft()); - return; - } else { - nestedRules.add(dqRuleEither.getRight()); - } - } + private Either parseTopLevelRule(DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc, + int depth) { + if (tlc.LPAREN() != null && tlc.RPAREN() != null) { + return parseTopLevelRule(tlc.topLevelRule(0), depth); + } else if (tlc.AND() != null || tlc.OR() != null) { + DQRuleLogicalOperator op = tlc.AND() != null ? DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR; + List> nestedRuleEitherList = + tlc.topLevelRule().stream().map(r -> parseTopLevelRule(r, depth + 1)).collect(Collectors.toList()); - dqRules.add(new DQRule("Composite", null, null, null, op, nestedRules)); - } else if (tlc.dqRule(0) != null) { - Either dqRuleEither = getDQRule(tlc.dqRule(0)); - if (dqRuleEither.isLeft()) { - errorMessages.add(dqRuleEither.getLeft()); - return; + List allErrorMessages = new ArrayList<>(); + List allRules = new ArrayList<>(); + + nestedRuleEitherList.forEach(arg -> { + if (arg.isLeft()) { + allErrorMessages.add(arg.getLeft()); } else { - dqRules.add(dqRuleEither.getRight()); + allRules.add(arg.getRight()); } + }); + + if (allErrorMessages.isEmpty()) { + return Either.fromRight( + new DQRule("Composite", null, null, null, op, allRules) + ); } else { - errorMessages.add("No valid rule found"); - return; + return Either.fromLeft(allErrorMessages.get(0)); } + } else if (tlc.dqRule() != null) { + if (depth > 
COMPOSITE_RULE_MAX_NESTING_DEPTH) { + return Either.fromLeft( + String.format("Maximum nested expression depth of %s reached for composite rule", + COMPOSITE_RULE_MAX_NESTING_DEPTH)); + } else { + return getDQRule(tlc.dqRule()); + } + } else { + return Either.fromLeft("No valid rule found"); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 8845da0..80ef3ed 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -243,7 +243,10 @@ private static Stream provideRawRules() { Arguments.of("FileSize > 5 with \"unit\" = \"B\""), Arguments.of("FileSize < 5 with \"unit\" = \"KB\""), Arguments.of("FileSize = 5 with \"unit\" = \"MB\""), - Arguments.of("FileSize >= 5 with \"unit\" = \"GB\"") + Arguments.of("FileSize >= 5 with \"unit\" = \"GB\""), + Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") OR (IsUnique \"colA\")"), + Arguments.of("(RowCount > 0) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))") ); } @@ -719,6 +722,91 @@ void test_withCondition() throws InvalidDataQualityRulesetException { assertEquals(modified.toString(), "RowCount > 20"); } + @Test + void test_withCompositeRuleThatReachesMaxDepth() { + Map ruleIdToRuleMap = getStringStringMap(); + + /* + AND + Rule1 OR + Rule2 AND + Rule3 OR + Rule4 AND + Rule5 Rule6 <----- Depth = 5 which is OK + */ + String compositeRule = "(Rule1) AND ((Rule2) OR ((Rule3) AND ((Rule4) OR ((Rule5) AND (Rule6)))))"; // Template + int ruleCount = 6; + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + compositeRule = compositeRule.replace(ruleId, ruleIdToRuleMap.get(ruleId)); + } + + String rulesetString = String.format("Rules = [ %s ]", compositeRule); + + try { + DQRuleset ruleset 
= parser.parse(rulesetString); + assertEquals(1, ruleset.getRules().size()); + + DQRule actualCompositeRule = ruleset.getRules().get(0); + List nestedRules = actualCompositeRule.getNestedRulesAsFlattenedList(); + + assertEquals(compositeRule, actualCompositeRule.toString()); + assertEquals(6, nestedRules.size()); + + List nestedRulesAsStrings = nestedRules.stream().map(DQRule::toString).collect(Collectors.toList()); + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + assertTrue(nestedRulesAsStrings.contains(ruleIdToRuleMap.get(ruleId))); + } + } catch (InvalidDataQualityRulesetException e) { + fail("This rule that reaches max depth should have been successfully parsed"); + } + } + + @Test + void test_withCompositeRuleThatBreachesMaxDepth() { + Map ruleIdToRuleMap = getStringStringMap(); + /* + AND + Rule1 OR + Rule2 AND + Rule3 OR + Rule4 AND + Rule5 OR + Rule6 Rule 7 <----- Depth = 6 which is not OK + */ + String compositeRule = "(Rule1) AND ((Rule2) OR ((Rule3) OR ((Rule4) OR ((Rule5) AND ((Rule6) OR (Rule7))))))"; + int ruleCount = 7; + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + compositeRule = compositeRule.replace(ruleId, ruleIdToRuleMap.get(ruleId)); + } + + String rulesetString = String.format("Rules = [ %s ]", compositeRule); + + try { + parser.parse(rulesetString); + fail("This rule that breaches max depth should have failed to parse"); + } catch (InvalidDataQualityRulesetException e) { + assertTrue(e.getMessage().contains("Maximum nested expression depth")); + } + } + + private static Map getStringStringMap() { + Map ruleIdToRuleMap = new HashMap<>(); + ruleIdToRuleMap.put("Rule1", "RowCount > 1"); + ruleIdToRuleMap.put("Rule2", "Completeness of \"colA\" between 0.4 and 1.0"); + ruleIdToRuleMap.put("Rule3", "CustomSql \"select count(*) from primary\" = 10"); + ruleIdToRuleMap.put("Rule4", "ReferentialIntegrity of \"primary.colA\" and \"ref.colA\" = 0.9"); + 
ruleIdToRuleMap.put("Rule5", "IsUnique \"id\""); + ruleIdToRuleMap.put("Rule6", "ColumnNamesMatchPattern \"[a-zA-Z]*\""); + ruleIdToRuleMap.put("Rule7", "SchemaMatch \"ref\" between 0.8 and 0.9"); + return ruleIdToRuleMap; + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index ff6243a..02d5884 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -99,7 +99,9 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileFreshness > (now() 3 days) ]"), Arguments.of("Rules = [ FileUniqueness \"PATH\" ]"), Arguments.of("Rules = [ FileUniqueness ]"), - Arguments.of("Rules = [ FileSize ]") + Arguments.of("Rules = [ FileSize ]"), + Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("((RowCount > 0) AND IsComplete") ); } From 35eb024ea9ebba0ded773a28e8912c9fa8ed04eb Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Mon, 28 Oct 2024 10:53:02 -0400 Subject: [PATCH 41/50] DQDL - SizeBasedCondition --- .../dqdl/DataQualityDefinitionLanguage.g4 | 22 ++- configuration/rules/rules-config.json | 4 +- .../dqdl/model/condition/size/Size.java | 29 ++++ .../condition/size/SizeBasedCondition.java | 96 ++++++++++++ .../size/SizeBasedConditionOperator.java | 24 +++ .../dqdl/model/condition/size/SizeUnit.java | 19 +++ .../dqdl/parser/DQDLParserListener.java | 141 ++++++++++++++++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 41 ++++- .../dqdl/parser/InvalidDQRulesetTest.java | 3 + 9 files changed, 372 insertions(+), 7 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java create mode 100644 
src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 1958d55..7c643c3 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -14,6 +14,15 @@ durationUnit: 'days' | 'hours'; durationExpression: (DIGIT | INT) durationUnit; +sizeUnit: + 'B' + | 'KB' + | 'MB' + | 'GB' + | 'TB'; + +sizeExpression: (DIGIT | INT) sizeUnit; + dateExpressionOp: ('-' | '+'); dateExpression: DATE @@ -88,6 +97,16 @@ durationBasedCondition: | NEGATION? EQUAL_TO durationExpression | NOT? IN durationExpressionArray; +sizeExpressionArray: LBRAC sizeExpression (COMMA sizeExpression)* RBRAC; +sizeBasedCondition: + NOT? BETWEEN sizeExpression AND sizeExpression + | GREATER_THAN sizeExpression + | GREATER_THAN_EQUAL_TO sizeExpression + | LESS_THAN sizeExpression + | LESS_THAN_EQUAL_TO sizeExpression + | NEGATION? EQUAL_TO sizeExpression + | NOT? IN sizeExpressionArray; + ruleType: IDENTIFIER; analyzerType: IDENTIFIER; parameter: QUOTED_STRING @@ -100,7 +119,8 @@ condition: numberBasedCondition | stringBasedCondition | dateBasedCondition - | durationBasedCondition; + | durationBasedCondition + | sizeBasedCondition; withThresholdCondition: 'with' 'threshold' numberBasedCondition; diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index 69baf33..0c8a68e 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -531,7 +531,7 @@ "description": "Data Path for FileSize." 
} ], - "return_type": "NUMBER", + "return_type": "SIZE", "is_threshold_supported": true, "is_where_clause_supported": false, "scope": "file", @@ -541,7 +541,7 @@ "rule_type_name": "FileSize", "description": "Checks the size of an inferred file.", "parameters": [], - "return_type": "NUMBER", + "return_type": "SIZE", "is_threshold_supported": true, "is_where_clause_supported": false, "scope": "file", diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java new file mode 100644 index 0000000..98abdb4 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java @@ -0,0 +1,29 @@ +/* + * Size.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public class Size implements Serializable { + private final Integer amount; + private final SizeUnit unit; + + public String getFormattedSize() { + return String.format("%s %s", amount, unit.name().toUpperCase()); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java new file mode 100644 index 0000000..7a2a34e --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java @@ -0,0 +1,96 @@ +/* + * SizeBasedCondition.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.util.List; +import java.util.stream.Collectors; + +@Getter +@EqualsAndHashCode(callSuper = true) +public class SizeBasedCondition extends Condition { + private final SizeBasedConditionOperator operator; + private final List operands; + + public SizeBasedCondition(final String conditionAsString, + final SizeBasedConditionOperator operator, + final List operands) { + super(conditionAsString); + this.operator = operator; + this.operands = operands; + } + + @Override + public String getFormattedCondition() { + if (this.operands.isEmpty()) return ""; + + switch (operator) { + case BETWEEN: + return String.format("between %s and %s", + operands.get(0).getFormattedSize(), + operands.get(1).getFormattedSize()); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedSize(), + operands.get(1).getFormattedSize()); + case GREATER_THAN: + return String.format("> %s", operands.get(0).getFormattedSize()); + case GREATER_THAN_EQUAL_TO: + return String.format(">= %s", operands.get(0).getFormattedSize()); + case LESS_THAN: + return String.format("< %s", operands.get(0).getFormattedSize()); + case LESS_THAN_EQUAL_TO: + return String.format("<= %s", operands.get(0).getFormattedSize()); + case EQUALS: + return String.format("= %s", operands.get(0).getFormattedSize()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedSize()); + case IN: { + List formattedOperands = getFormattedOperands(); + return String.format("in [%s]", String.join(",", formattedOperands)); + } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); + } + default: + break; + 
} + + return ""; + } + + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + + private List getFormattedOperands() { + return operands.stream().map(Size::getFormattedSize).collect(Collectors.toList()); + } + + private List getSortedFormattedOperands() { + return operands.stream().map(Size::getFormattedSize).sorted().collect(Collectors.toList()); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java new file mode 100644 index 0000000..d4c9b72 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java @@ -0,0 +1,24 @@ +/* + * SizeBasedConditionOperator.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +public enum SizeBasedConditionOperator { + BETWEEN, + NOT_BETWEEN, + GREATER_THAN, + GREATER_THAN_EQUAL_TO, + LESS_THAN, + LESS_THAN_EQUAL_TO, + EQUALS, + NOT_EQUALS, + IN, + NOT_IN +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java new file mode 100644 index 0000000..076b657 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java @@ -0,0 +1,19 @@ +/* + * SizeUnit.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +public enum SizeUnit { + B, + KB, + MB, + GB, + TB +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 79561b4..672570a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -34,6 +34,10 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedConditionOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeUnit; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; @@ -486,6 +490,31 @@ private Either parseCondition( } break; } + case "SIZE": + case "SIZE_ARRAY": { + DataQualityDefinitionLanguageParser.ConditionContext cx = dqRuleContext.condition(); + if (cx == null || (cx.sizeBasedCondition() == null && cx.numberBasedCondition() == null)) { + return Either.fromLeft( + String.format("Unexpected condition for rule of type %s with size return type", + ruleType.getRuleTypeName())); + } else if (cx.sizeBasedCondition() != null) { + Optional possibleCond = + 
parseSizeBasedCondition(dqRuleContext.condition().sizeBasedCondition()); + + if (possibleCond.isPresent()) { + response = Either.fromRight(possibleCond.get()); + } + } else if (cx.numberBasedCondition() != null) { + Optional possibleCond = + convertNumberToSizeCondition( + parseNumberBasedCondition(dqRuleContext.condition().numberBasedCondition())); + + if (possibleCond.isPresent()) { + response = Either.fromRight(possibleCond.get()); + } + } + break; + } default: break; } @@ -493,6 +522,24 @@ private Either parseCondition( return response; } + private Optional convertNumberToSizeCondition(Optional in) { + if (!in.isPresent() || !(in.get() instanceof NumberBasedCondition)) { + return Optional.empty(); + } + NumberBasedCondition input = (NumberBasedCondition) in.get(); + final String conditionAsString = input.getConditionAsString(); + final SizeBasedConditionOperator operator = SizeBasedConditionOperator.valueOf(input.getOperator().name()); + final List operands = input.getOperands().stream() + .filter(x -> x instanceof AtomicNumberOperand) + .filter(x -> Double.parseDouble(x.getOperand()) % 1 == 0) // filter only integer + .map(x -> new Size(Integer.parseInt(x.getOperand()), SizeUnit.B)) + .collect(Collectors.toList()); + if (operands.size() != input.getOperands().size()) { + return Optional.empty(); + } + return Optional.of(new SizeBasedCondition(conditionAsString, operator, operands)); + } + private Optional parseNumberBasedCondition( DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx) { @@ -849,6 +896,89 @@ private Optional parseDurationBasedCondition( return Optional.ofNullable(condition); } + private Optional parseSizeBasedCondition( + DataQualityDefinitionLanguageParser.SizeBasedConditionContext ctx + ) { + + String exprStr = ctx.getText(); + Condition condition = null; + + if (ctx.BETWEEN() != null && ctx.sizeExpression().size() == 2) { + Optional lower = parseSize(ctx.sizeExpression(0)); + Optional upper = parseSize(ctx.sizeExpression(1)); 
+ if (lower.isPresent() && upper.isPresent()) { + SizeBasedConditionOperator op = (ctx.NOT() != null) ? + SizeBasedConditionOperator.NOT_BETWEEN + : SizeBasedConditionOperator.BETWEEN; + condition = new SizeBasedCondition( + exprStr, op, Arrays.asList(lower.get(), upper.get()) + ); + } + } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.GREATER_THAN_EQUAL_TO, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.GREATER_THAN() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.GREATER_THAN, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.LESS_THAN() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.LESS_THAN, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.LESS_THAN_EQUAL_TO, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + SizeBasedConditionOperator op = (ctx.NEGATION() != null) ? 
+ SizeBasedConditionOperator.NOT_EQUALS + : SizeBasedConditionOperator.EQUALS; + condition = new SizeBasedCondition( + exprStr, op, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.IN() != null && + ctx.sizeExpressionArray() != null && + ctx.sizeExpressionArray().sizeExpression().size() > 0) { + + List> sizes = ctx.sizeExpressionArray().sizeExpression().stream() + .map(this::parseSize) + .collect(Collectors.toList()); + + if (sizes.stream().allMatch(Optional::isPresent)) { + SizeBasedConditionOperator op = (ctx.NOT() != null) ? + SizeBasedConditionOperator.NOT_IN + : SizeBasedConditionOperator.IN; + condition = new SizeBasedCondition( + exprStr, op, + sizes.stream().map(Optional::get).collect(Collectors.toList()) + ); + } + } + + return Optional.ofNullable(condition); + } + private Optional parseDateExpression( DataQualityDefinitionLanguageParser.DateExpressionContext ctx) { if (ctx.durationExpression() != null) { @@ -879,6 +1009,17 @@ private Optional parseDuration( } } + private Optional parseSize( + DataQualityDefinitionLanguageParser.SizeExpressionContext ctx) { + int amount = Integer.parseInt(ctx.INT() != null ? 
ctx.INT().getText() : ctx.DIGIT().getText()); + if (ctx.sizeUnit().exception != null) { + return Optional.empty(); + } else { + SizeUnit unit = SizeUnit.valueOf(ctx.sizeUnit().getText().toUpperCase()); + return Optional.of(new Size(amount, unit)); + } + } + private String removeQuotes(String quotedString) { if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { quotedString = quotedString.substring(1); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 80ef3ed..7fbf8f6 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -15,6 +15,8 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; @@ -240,16 +242,47 @@ private static Stream provideRawRules() { Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)"), Arguments.of("FileUniqueness \"S3://PATH\" > 0.9"), Arguments.of("FileUniqueness > 0.5"), - Arguments.of("FileSize > 5 with \"unit\" = \"B\""), - Arguments.of("FileSize < 5 with \"unit\" = \"KB\""), - Arguments.of("FileSize = 5 with \"unit\" = \"MB\""), - Arguments.of("FileSize >= 5 with \"unit\" = \"GB\""), + Arguments.of("FileSize between 1 B and 1 GB"), + Arguments.of("FileSize not between 50 GB and 1 TB"), + Arguments.of("FileSize > 5 B"), + 
Arguments.of("FileSize >= 5 KB"), + Arguments.of("FileSize < 5 MB"), + Arguments.of("FileSize <= 5 GB"), + Arguments.of("FileSize = 5 TB"), + Arguments.of("FileSize != 5 B"), + Arguments.of("FileSize in [5 B]"), + Arguments.of("FileSize not in [500 KB,150 GB]"), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") OR (IsUnique \"colA\")"), Arguments.of("(RowCount > 0) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))") ); } + @Test + void test_sizeConditionParsing() throws Exception { + List unitList = Arrays.asList("B", "KB", "MB", "GB", "TB"); + for (String unit : unitList) { + String rule = String.format("Rules = [ FileSize = 2 %s ]", unit); + DQRule parsedRule = parser.parse(rule).getRules().get(0); + assertEquals("FileSize", parsedRule.getRuleType()); + SizeBasedCondition c = (SizeBasedCondition) parsedRule.getCondition(); + assertEquals(unit, c.getOperands().get(0).getUnit().name()); + } + + String defaultByte = "Rules = [ FileSize > 2, FileSize in [3,4,5,6] ]"; + List rules = parser.parse(defaultByte).getRules(); + DQRule parsedRuleNoUnit0 = rules.get(0); + DQRule parsedRuleNoUnit1 = rules.get(1); + assertEquals("FileSize", parsedRuleNoUnit0.getRuleType()); + assertEquals("FileSize", parsedRuleNoUnit1.getRuleType()); + SizeBasedCondition c0 = (SizeBasedCondition) parsedRuleNoUnit0.getCondition(); + SizeBasedCondition c1 = (SizeBasedCondition) parsedRuleNoUnit1.getCondition(); + assertEquals("B", c0.getOperands().get(0).getUnit().name()); + for (Size unit : c1.getOperands()) { + assertEquals("B", unit.getUnit().name()); + } + } + @Test void test_fileFileFreshnessParsing() throws Exception { String fileRules = "Rules = [ " + diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 02d5884..8db8e1a 100644 --- 
a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -100,6 +100,9 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileUniqueness \"PATH\" ]"), Arguments.of("Rules = [ FileUniqueness ]"), Arguments.of("Rules = [ FileSize ]"), + Arguments.of("Rules = [ FileSize > 1 SAM]"), + Arguments.of("Rules = [ FileSize 1 GB]"), + Arguments.of("Rules = [ FileSize <= 1 ZB ]"), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("((RowCount > 0) AND IsComplete") ); From 38bfc2174e8efe553f1a170b3dd35ca4d6f90a80 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Tue, 29 Oct 2024 05:42:42 -0400 Subject: [PATCH 42/50] Adapt ETL FileFreshness to Parsing Changes + Fix Bug with IN / NOT_IN, Tweak Size --- .../dqdl/model/condition/size/Size.java | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java index 98abdb4..7b6e65b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java @@ -10,20 +10,45 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; -import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.Getter; import java.io.Serializable; -@AllArgsConstructor @Getter @EqualsAndHashCode -public class Size implements Serializable { +public class Size implements Serializable, Comparable { private final Integer amount; private final SizeUnit unit; + private final Long bytes; + + public Size(final Integer amount, final SizeUnit unit) { + this.amount = amount; + this.unit = unit; + this.bytes = convertBytes(amount, unit); + } public String getFormattedSize() { return String.format("%s 
%s", amount, unit.name().toUpperCase()); } + + private Long convertBytes(Integer bytes, SizeUnit unit) { + switch (unit) { + case KB: + return bytes * 1024L; + case MB: + return bytes * 1024L * 1024L; + case GB: + return bytes * 1024L * 1024L * 1024L; + case TB: + return bytes * 1024L * 1024L * 1024L * 1024L; + default: + return Long.valueOf(bytes); + } + } + + @Override + public int compareTo(Size other) { + return Long.compare(this.getBytes(), other.getBytes()); + } } From df9dab8bebc9b5fa8964e820a4ffcc80019e9d30 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Wed, 30 Oct 2024 22:56:51 +0900 Subject: [PATCH 43/50] DQDL - Refactor Threshold to use Tags --- .../dqdl/DataQualityDefinitionLanguage.g4 | 6 +- .../dqdl/parser/DQDLParserListener.java | 94 +++++++++++-------- .../dqdl/parser/InvalidDQRulesetTest.java | 3 + 3 files changed, 62 insertions(+), 41 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 7c643c3..e88b0d6 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -113,7 +113,7 @@ parameter: QUOTED_STRING | IDENTIFIER; connectorWord: OF | AND; parameterWithConnectorWord: connectorWord? parameter; -tagWithCondition: 'with' tagValues stringBasedCondition; +tagWithCondition: 'with' tagValues (stringBasedCondition | numberBasedCondition); condition: numberBasedCondition @@ -122,11 +122,9 @@ condition: | durationBasedCondition | sizeBasedCondition; -withThresholdCondition: 'with' 'threshold' numberBasedCondition; - whereClause: 'where' quotedString; -dqRule: ruleType parameterWithConnectorWord* condition? whereClause? withThresholdCondition? tagWithCondition*; +dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
tagWithCondition*; dqAnalyzer: analyzerType parameterWithConnectorWord*; topLevelRule: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 672570a..ff3dce9 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -45,6 +45,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; +import org.antlr.v4.runtime.misc.Pair; import org.antlr.v4.runtime.tree.ParseTree; import java.lang.reflect.InvocationTargetException; @@ -299,32 +300,6 @@ private Either getDQRule( } } - Condition thresholdCondition = null; - if (dqRuleContext.withThresholdCondition() != null) { - if (dqRuleType.isThresholdSupported()) { - DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = - dqRuleContext.withThresholdCondition().numberBasedCondition(); - - if (ctx == null) { - return Either.fromLeft( - String.format("Empty threshold condition provided for rule type: %s", ruleType)); - } else { - Optional possibleCond = - parseNumberBasedCondition(dqRuleContext.withThresholdCondition().numberBasedCondition()); - if (possibleCond.isPresent()) { - thresholdCondition = possibleCond.get(); - } else { - return Either.fromLeft( - String.format("Unable to parse threshold condition provided for rule type: %s", - ruleType)); - } - } - - } else { - return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); - } - } - String whereClause = null; if (dqRuleContext.whereClause() != null) { if (dqRuleType.isWhereClauseSupported()) { @@ -340,22 +315,38 @@ private Either getDQRule( } } + Condition thresholdCondition = null; Map tags = new 
HashMap<>(); List tagContexts = dqRuleContext.tagWithCondition(); if (tagContexts != null && !tagContexts.isEmpty()) { for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) { - if (!isTagValid(tagContext.stringBasedCondition())) { - return Either.fromLeft("Only EQUAL_TO condition is supported for tags."); - } - String tagKey = getKeyFromTag(tagContext.tagValues()); - Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition()); - if (valueCondition.isPresent()) { - StringBasedCondition stringCondition = (StringBasedCondition) valueCondition.get(); - String tagValue = stringCondition.getOperands().get(0).getOperand(); - tags.put(tagKey, tagValue); + if (tagContext.numberBasedCondition() != null) { + if (dqRuleType.isThresholdSupported()) { + if (thresholdCondition != null) { + return Either.fromLeft("Only one threshold condition at a time is supported."); + } + Either outcome = processThresholdTag(tagContext, ruleType); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + thresholdCondition = outcome.getRight(); + } + } else { + return Either.fromLeft(String.format( + "Threshold condition not supported for rule type: %s", ruleType)); + } + } else if (tagContext.stringBasedCondition() != null) { + Either> outcome = processStringTag(tagContext); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + Pair pair = outcome.getRight(); + tags.put(pair.a, pair.b); + } } else { - return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey)); + return Either.fromLeft(String.format( + "Invalid tag provided for rule type: %s", ruleType)); } } } @@ -366,8 +357,37 @@ private Either getDQRule( ); } + private Either> processStringTag( + DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext) { + if (!isTagValid(tagContext.stringBasedCondition())) { + return Either.fromLeft("Only EQUAL_TO condition is supported for String tags."); 
+ } + String tagKey = getKeyFromTag(tagContext.tagValues()); + Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition()); + if (valueCondition.isPresent()) { + StringBasedCondition stringCondition = (StringBasedCondition) valueCondition.get(); + String tagValue = stringCondition.getOperands().get(0).getOperand(); + return Either.fromRight(new Pair<>(tagKey, tagValue)); + } else { + return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey)); + } + } + + private Either processThresholdTag( + DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext, String ruleType) { + DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = + tagContext.numberBasedCondition(); + Optional possibleCond = parseNumberBasedCondition(ctx); + if (possibleCond.isPresent()) { + return Either.fromRight(possibleCond.get()); + } else { + return Either.fromLeft(String.format( + "Unable to parse threshold condition provided for rule type: %s", ruleType)); + } + } + private boolean isTagValid(DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx) { - return ctx.EQUAL_TO() != null; + return ctx.EQUAL_TO() != null && ctx.NEGATION() == null; } private String getKeyFromTag(DataQualityDefinitionLanguageParser.TagValuesContext tagValuesContext) { diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 8db8e1a..1a2c0de 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -50,6 +50,7 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ ColumnDataType \"col-A\" ]"), Arguments.of("Rules = [ ColumnDataType \"col-A\" with threshold > 0.7 ]"), Arguments.of("Rules = [ ColumnDataType \"col-A\" \"col-B\" ]"), + Arguments.of("Rules = [ ColumnDataType \"col_1\" in 
[\"Date\",\"String\"] with threshold > 0.9 with threshold > 0.7 ]"), Arguments.of("Rules = [ ColumnValues \"col-A\" matches ]"), Arguments.of("Rules = [ ColumnValues \"col-A\" now() ]"), Arguments.of("Rules = [ ColumnValues \"col-A\" > now() + 1 hours ]"), @@ -101,6 +102,8 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileUniqueness ]"), Arguments.of("Rules = [ FileSize ]"), Arguments.of("Rules = [ FileSize > 1 SAM]"), + Arguments.of("Rules = [ FileSize > 1 KB with exampleTag in [\"SAM\"] ]"), + Arguments.of("Rules = [ FileSize > 1 KB with exampleTag != \"SAM\"]"), Arguments.of("Rules = [ FileSize 1 GB]"), Arguments.of("Rules = [ FileSize <= 1 ZB ]"), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), From 603878ccc1d95a947764e80a00c0df7a4773fcf3 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 5 Nov 2024 12:26:48 -0500 Subject: [PATCH 44/50] Changed visibility of a method in DQRule - This method provides all the nested rules of a composite rule, as a flattened list. - The Backfill Lambda needed to use this method, so we changed the visibility to public. 
--- src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 12d76e0..83bd493 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -223,8 +223,7 @@ private boolean usesSameOperator(DQRuleLogicalOperator op) { return true; } - // Package private, in order to make it accessible to the tests - List getNestedRulesAsFlattenedList() { + public List getNestedRulesAsFlattenedList() { List ret = new ArrayList<>(); if (nestedRules.isEmpty()) { ret.add(this); From 4c8aa578edc2d35b4945cefe88102c50760d46c1 Mon Sep 17 00:00:00 2001 From: Dongying Song Date: Fri, 1 Nov 2024 00:40:17 -0400 Subject: [PATCH 45/50] Support variable for string array to workaround parameter limits in the PublishDataQualityResult API --- .../dqdl/DataQualityDefinitionLanguage.g4 | 19 +++- .../ml/dataquality/dqdl/model/DQRule.java | 46 +++++++- .../ml/dataquality/dqdl/model/DQVariable.java | 60 ++++++++++ .../string/StringBasedCondition.java | 68 +++++++---- .../variable/VariableReferenceOperand.java | 31 +++++ .../dqdl/parser/DQDLParserListener.java | 107 ++++++++++++++---- .../dqdl/parser/DQDLVariableResolver.java | 59 ++++++++++ .../dataquality/dqdl/model/DQRulesetTest.java | 103 +++++++++++++++++ .../dqdl/model/DQVariableTest.java | 86 ++++++++++++++ .../dqdl/parser/InvalidDQRulesetTest.java | 27 ++++- 10 files changed, 560 insertions(+), 46 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java create mode 100644 
tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index e88b0d6..02f2bae 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -69,11 +69,20 @@ numberBasedCondition: | NEGATION? EQUAL_TO number | NOT? IN numberArray; -stringValues : quotedString | NULL | EMPTY | WHITESPACES_ONLY; +variableDereference: '$' IDENTIFIER; + +stringValues: + quotedString + | variableDereference + | NULL + | EMPTY + | WHITESPACES_ONLY; + stringValuesArray: LBRAC stringValues (COMMA stringValues)* RBRAC; stringBasedCondition: NEGATION? EQUAL_TO stringValues | NOT? IN stringValuesArray + | NOT? IN variableDereference | NOT? matchesRegexCondition; tagValues: quotedString | IDENTIFIER; @@ -127,6 +136,12 @@ whereClause: 'where' quotedString; dqRule: ruleType parameterWithConnectorWord* condition? whereClause? tagWithCondition*; dqAnalyzer: analyzerType parameterWithConnectorWord*; +// Variable Declarations +expression: stringValuesArray; +variableDeclaration: + IDENTIFIER EQUAL_TO expression; +variableDeclarations: variableDeclaration*; + topLevelRule: LPAREN topLevelRule RPAREN | topLevelRule AND topLevelRule @@ -156,4 +171,4 @@ metadata: metadataSectionStart EQUAL_TO dictionary; dataSources: dataSourcesSectionStart EQUAL_TO dictionary; rulesOrAnalyzers: rules | analyzers | rules analyzers; -document: metadata? dataSources? rulesOrAnalyzers; +document: metadata? dataSources? variableDeclarations? 
rulesOrAnalyzers; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 83bd493..c7e61c0 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -19,10 +19,12 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import static com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLVariableResolver.resolveVariablesInCondition; import static com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils.isBlank; @AllArgsConstructor @@ -92,7 +94,8 @@ public DQRule(final String ruleType, public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final LinkedHashMap parameters, final Condition condition) { - return createFromParameterValueMap(ruleType, parameters, condition, null, null, null); + return createFromParameterValueMap(ruleType, parameters, condition, + null, null, null); } public DQRule(final String ruleType, @@ -133,6 +136,47 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, ); } + // Add a new method for creating with variable resolution + public static DQRule createFromParameterValueMapWithVariables(final DQRuleType ruleType, + final LinkedHashMap + parameters, + final Condition condition, + final Condition thresholdCondition, + final String whereClause, + final Map tags, + final Map variables) { + // Create the unresolved rule first + DQRule unresolvedRule = createFromParameterValueMap(ruleType, parameters, condition, + thresholdCondition, whereClause, tags); + + // If there are no variables to resolve, return the unresolved rule + if (variables == null || variables.isEmpty()) { + return unresolvedRule; + } + + Map usedVars = new HashMap<>(); + + // Resolve variables in conditions + Condition resolvedCondition = condition != null + ? 
resolveVariablesInCondition(condition, variables, usedVars) : null; + Condition resolvedThresholdCondition = thresholdCondition != null + ? resolveVariablesInCondition(thresholdCondition, variables, usedVars) : null; + + // Create the resolved rule + return new DQRule( + ruleType.getRuleTypeName(), + DQRuleParameterValue.createParameterMap(parameters), + parameters, + resolvedCondition, + resolvedThresholdCondition, + DQRuleLogicalOperator.AND, + new ArrayList<>(), + whereClause, + ruleType.isExcludedAtRowLevelInCompositeRules(), + tags + ); + } + public DQRule withNestedRules(final List nestedRules) { return this.toBuilder().nestedRules(nestedRules).build(); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java new file mode 100644 index 0000000..b9774d7 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java @@ -0,0 +1,60 @@ +/* + * DQVariable.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public class DQVariable implements Serializable { + + public enum VariableType { + NUMBER, + STRING, + DATE, + DURATION, + NUMBER_ARRAY, + STRING_ARRAY, + DATE_ARRAY, + DURATION_ARRAY + } + + private final String name; + private final VariableType type; + private final T value; + + @Override + public String toString() { + if (value instanceof List) { + return String.format("%s = %s", name, formatArray((List) value)); + } + return String.format("%s = %s", name, formatValue(value)); + } + + private String formatValue(T val) { + if (val == null) return "null"; + if (type == VariableType.STRING) return "\"" + val + "\""; + return val.toString(); + } + + private String formatArray(List list) { + return "[" + list.stream() + .map(Object::toString) + .collect(Collectors.joining(", ")) + "]"; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index ca62c59..b76c91c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -23,36 +24,44 @@ public class StringBasedCondition extends Condition { private final 
StringBasedConditionOperator operator; private final List operands; + private final List unresolvedOperands; public StringBasedCondition(final String conditionAsString, final StringBasedConditionOperator operator, final List operands) { + this(conditionAsString, operator, operands, null); + } + + public StringBasedCondition(final String conditionAsString, + final StringBasedConditionOperator operator, + final List operands, + final List unresolvedOperands) { super(conditionAsString); this.operator = operator; this.operands = operands; + this.unresolvedOperands = unresolvedOperands; } + @Override public String getFormattedCondition() { if (StringUtils.isBlank(conditionAsString)) return ""; + List effectiveOperands = getEffectiveOperands(); + switch (operator) { case MATCHES: - return String.format("matches %s", operands.get(0).formatOperand()); + return String.format("matches %s", effectiveOperands.get(0).formatOperand()); case NOT_MATCHES: - return String.format("not matches %s", operands.get(0).formatOperand()); + return String.format("not matches %s", effectiveOperands.get(0).formatOperand()); case EQUALS: - return String.format("= %s", operands.get(0).formatOperand()); + return String.format("= %s", effectiveOperands.get(0).formatOperand()); case NOT_EQUALS: - return String.format("!= %s", operands.get(0).formatOperand()); - case IN: { - List formattedOperands = getFormattedOperands(); - return String.format("in [%s]", String.join(",", formattedOperands)); - } - case NOT_IN: { - List formattedOperands = getFormattedOperands(); - return String.format("not in [%s]", String.join(",", formattedOperands)); - } + return String.format("!= %s", effectiveOperands.get(0).formatOperand()); + case IN: + return formatInCondition(false, false); + case NOT_IN: + return formatInCondition(true, false); default: break; } @@ -66,26 +75,41 @@ public String getSortedFormattedCondition() { switch (operator) { case IN: - return String.format("in [%s]", String.join(",", 
getSortedFormattedOperands())); + return formatInCondition(false, true); case NOT_IN: - return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + return formatInCondition(true, true); default: return getFormattedCondition(); } } - private List getFormattedOperands() { - List formattedOperands = operands.stream() - .map(StringOperand::formatOperand) - .collect(Collectors.toList()); - return formattedOperands; + private String formatInCondition(boolean isNot, boolean sorted) { + List effectiveOperands = getEffectiveOperands(); + List formattedOperands = sorted + ? getSortedFormattedOperands(effectiveOperands) : getFormattedOperands(effectiveOperands); + String operandStr; + if (formattedOperands.size() == 1 && effectiveOperands.get(0) instanceof VariableReferenceOperand) { + operandStr = formattedOperands.get(0); + } else { + operandStr = "[" + String.join(",", formattedOperands) + "]"; + } + return String.format("%sin %s", isNot ? "not " : "", operandStr); } - private List getSortedFormattedOperands() { - List formattedOperands = operands.stream() + private List getFormattedOperands(List operands) { + return operands.stream() + .map(StringOperand::formatOperand) + .collect(Collectors.toList()); + } + + private List getSortedFormattedOperands(List operands) { + return operands.stream() .map(StringOperand::formatOperand) .sorted() .collect(Collectors.toList()); - return formattedOperands; + } + + private List getEffectiveOperands() { + return unresolvedOperands != null ? 
unresolvedOperands : operands; } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java new file mode 100644 index 0000000..0bc31e8 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java @@ -0,0 +1,31 @@ +/* + * VariableReferenceOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@Getter +@EqualsAndHashCode(callSuper = true) +public class VariableReferenceOperand extends StringOperand { + private final String variableName; + + public VariableReferenceOperand(String variableName) { + super(variableName); + this.variableName = variableName; + } + + @Override + public String formatOperand() { + return "$" + variableName; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index ff3dce9..e4622ab 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -18,6 +18,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleParameterValue; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; 
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedConditionOperator; @@ -44,6 +45,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; import org.antlr.v4.runtime.misc.Pair; import org.antlr.v4.runtime.tree.ParseTree; @@ -71,6 +73,7 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private List additionalSources; private final List dqRules = new ArrayList<>(); private final List dqAnalyzers = new ArrayList<>(); + private final Map dqVariables = new HashMap<>(); private static final String METADATA_VERSION_KEY = "Version"; private static final Set ALLOWED_METADATA_KEYS; @@ -249,6 +252,50 @@ public void enterDqAnalyzers(DataQualityDefinitionLanguageParser.DqAnalyzersCont } } + @Override + public void enterVariableDeclaration(DataQualityDefinitionLanguageParser.VariableDeclarationContext ctx) { + if (!errorMessages.isEmpty()) { + return; + } + + String variableName = ctx.IDENTIFIER().getText(); + + if (variableName.startsWith(".") || variableName.startsWith("_")) { + errorMessages.add(String.format("Variable name '%s' cannot start with '.' 
or '_'", variableName)); + return; + } + + if (dqVariables.containsKey(variableName)) { + errorMessages.add("Variable '" + variableName + "' is already defined"); + return; + } + + DQVariable variable = null; + DataQualityDefinitionLanguageParser.ExpressionContext expr = ctx.expression(); + if (expr == null) { + errorMessages.add(String.format("Missing value for variable '%s'", variableName)); + return; + } + + if (expr.stringValuesArray() != null) { + List values = expr.stringValuesArray().stringValues().stream() + .map(sv -> { + if (sv.quotedString() != null) { + return removeQuotes(sv.quotedString().getText()); + } + return sv.getText(); + }) + .collect(Collectors.toList()); + variable = new DQVariable(variableName, DQVariable.VariableType.STRING_ARRAY, values); + } + + if (variable != null) { + dqVariables.put(variableName, variable); + } else { + errorMessages.add(String.format("Failed to parse variable '%s'", variableName)); + } + } + private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); @@ -352,8 +399,8 @@ private Either getDQRule( } return Either.fromRight( - DQRule.createFromParameterValueMap( - dqRuleType, parameterMap, condition, thresholdCondition, whereClause, tags) + DQRule.createFromParameterValueMapWithVariables( + dqRuleType, parameterMap, condition, thresholdCondition, whereClause, tags, dqVariables) ); } @@ -689,28 +736,48 @@ private Optional parseStringBasedCondition( String exprStr = ctx.getText(); Condition condition = null; - if (ctx.EQUAL_TO() != null && ctx.stringValues() != null) { + if (ctx.EQUAL_TO() != null) { StringBasedConditionOperator op = (ctx.NEGATION() != null) ? 
- StringBasedConditionOperator.NOT_EQUALS - : StringBasedConditionOperator.EQUALS; - Optional operand = parseStringOperand(ctx, Optional.of(ctx.stringValues()), op); - if (operand.isPresent()) { - condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get())); + StringBasedConditionOperator.NOT_EQUALS + : StringBasedConditionOperator.EQUALS; + + StringOperand operand; + if (ctx.variableDereference() != null) { + operand = new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText()); + } else if (ctx.stringValues() != null) { + Optional parsedOperand = parseStringOperand(ctx, Optional.of(ctx.stringValues()), op); + if (!parsedOperand.isPresent()) { + return Optional.empty(); + } + operand = parsedOperand.get(); + } else { + return Optional.empty(); } - } else if (ctx.IN() != null && - ctx.stringValuesArray() != null && - ctx.stringValuesArray().stringValues().size() > 0) { + + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand)); + } else if (ctx.IN() != null) { StringBasedConditionOperator op = (ctx.NOT() != null) ? 
- StringBasedConditionOperator.NOT_IN - : StringBasedConditionOperator.IN; - List> operands = ctx.stringValuesArray().stringValues() - .stream() - .map(s -> parseStringOperand(ctx, Optional.of(s), op)) - .collect(Collectors.toList()); + StringBasedConditionOperator.NOT_IN + : StringBasedConditionOperator.IN; + + List operands; + if (ctx.variableDereference() != null) { + operands = Collections.singletonList( + new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText())); + } else if (ctx.stringValuesArray() != null && ctx.stringValuesArray().stringValues().size() > 0) { + operands = ctx.stringValuesArray().stringValues() + .stream() + .map(s -> parseStringOperand(ctx, Optional.of(s), op)) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList()); + } else { + return Optional.empty(); + } - condition = new StringBasedCondition(exprStr, op, - operands.stream().map(Optional::get).collect(Collectors.toList()) - ); + if (!operands.isEmpty()) { + condition = new StringBasedCondition(exprStr, op, operands); + } } else if (ctx.matchesRegexCondition() != null) { StringBasedConditionOperator op = (ctx.NOT() != null) ? 
StringBasedConditionOperator.NOT_MATCHES diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java new file mode 100644 index 0000000..16a7178 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java @@ -0,0 +1,59 @@ +package com.amazonaws.glue.ml.dataquality.dqdl.parser; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public final class DQDLVariableResolver { + + // Private constructor to prevent instantiation + private DQDLVariableResolver() { + throw new AssertionError("Utility class should not be instantiated"); + } + + public static Condition resolveVariablesInCondition(Condition condition, Map variables, + Map usedVars) { + if (!(condition instanceof StringBasedCondition)) { + return condition; + } + + StringBasedCondition stringCondition = (StringBasedCondition) condition; + List resolvedOperands = new ArrayList<>(); + + for (StringOperand operand : stringCondition.getOperands()) { + if (operand instanceof VariableReferenceOperand) { + String varName = operand.getOperand(); + DQVariable variable = variables.get(varName); + if (variable != null) { + usedVars.put(varName, variable); + Object value = variable.getValue(); + if (value instanceof List) { + for (Object listItem : (List) value) { + resolvedOperands.add(new QuotedStringOperand(listItem.toString())); + } + } else { + 
resolvedOperands.add(new QuotedStringOperand(value.toString())); + } + } else { + resolvedOperands.add(operand); + } + } else { + resolvedOperands.add(operand); + } + } + + return new StringBasedCondition( + stringCondition.getConditionAsString(), + stringCondition.getOperator(), + resolvedOperands, + stringCondition.getOperands() + ); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 6d37718..cabf267 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -452,6 +452,109 @@ void test_invalidRulesetThrowsException() { } } + @Test + public void testStringVariableResolvedCorrectly() { + String dqdlWithVariable = + "locationVariable = [\"YYZ14\", \"b\", \"c\"]\n" + + "Rules = [ ColumnValues \"Location-id\" in $locationVariable ]"; + String dqdlWithoutVariable = "Rules = [ ColumnValues \"Location-id\" in [\"YYZ14\", \"b\", \"c\"] ]"; + String ruleWithVariable = "ColumnValues \"Location-id\" in $locationVariable"; + String ruleWithoutVariable = "ColumnValues \"Location-id\" in [\"YYZ14\",\"b\",\"c\"]"; + String rulesWithVariable = "Rules = [\n ColumnValues \"Location-id\" in $locationVariable\n]"; + String rulesWithoutVariable = "Rules = [\n ColumnValues \"Location-id\" in [\"YYZ14\",\"b\",\"c\"]\n]"; + + DQRuleset dqRulesetWithVariable = parseDQDL(dqdlWithVariable); + DQRuleset dqRulesetWithoutVariable = parseDQDL(dqdlWithoutVariable); + assertEquals(rulesWithVariable, dqRulesetWithVariable.toString()); + assertEquals(rulesWithoutVariable, dqRulesetWithoutVariable.toString()); + assertEquals(dqRulesetWithoutVariable.getRules().size(), dqRulesetWithVariable.getRules().size()); + assertEquals(ruleWithVariable, + dqRulesetWithVariable.getRules().get(0).toString()); + assertEquals(ruleWithoutVariable, + dqRulesetWithoutVariable.getRules().get(0).toString()); + 
} + + @Test + public void testStringArrayVariable() { + String dqdl = + "str_arr = [\"a\", \"b\", \"c\"]\n" + + "Rules = [ ColumnValues \"order-id\" in $str_arr ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"order-id\" in $str_arr", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testMultipleRulesWithStringArrayVariable() { + String dqdl = + "codes = [\"A1\", \"B2\", \"C3\"]\n" + + "statuses = [\"active\", \"pending\", \"inactive\"]\n" + + "Rules = [\n" + + " ColumnValues \"product_code\" in $codes,\n" + + " ColumnValues \"status\" in $statuses\n" + + "]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(2, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" in $codes", + dqRuleset.getRules().get(0).toString()); + assertEquals("ColumnValues \"status\" in $statuses", + dqRuleset.getRules().get(1).toString()); + } + + @Test + public void testStringArrayVariableWithNotIn() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in $invalid_codes ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in $invalid_codes", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testUnusedVariable() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in [\"A1\", \"B2\", \"C3\"] ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in [\"A1\",\"B2\",\"C3\"]", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testMultipleVariableDefinitionsOnlyOneUsed() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "invalid_codes1 = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" 
not in [\"A1\", \"B2\", \"C3\"] ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in [\"A1\",\"B2\",\"C3\"]", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testVariableDefinitionMissing() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "invalid_codes1 = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in $invalid_codes2 ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in $invalid_codes2", + dqRuleset.getRules().get(0).toString()); + } + + @Test void test_multipleRules() { String dqdl = diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java new file mode 100644 index 0000000..db85392 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java @@ -0,0 +1,86 @@ +/* + * DQVariableTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import org.junit.jupiter.api.Test; +import java.time.Duration; +import java.time.LocalDate; +import java.util.Arrays; +import java.util.List; +import static org.junit.jupiter.api.Assertions.*; + +class DQVariableTest { + + @Test + void testConstructorAndGetters() { + DQVariable intVar = new DQVariable<>("age", DQVariable.VariableType.NUMBER, 30); + assertEquals("age", intVar.getName()); + assertEquals(DQVariable.VariableType.NUMBER, intVar.getType()); + assertEquals(30, intVar.getValue()); + } + + @Test + void testEqualsAndHashCode() { + DQVariable var1 = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + DQVariable var2 = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + DQVariable var3 = new DQVariable<>("name", DQVariable.VariableType.STRING, "Jane"); + + assertEquals(var1, var2); + assertNotEquals(var1, var3); + assertEquals(var1.hashCode(), var2.hashCode()); + } + + @Test + void testToStringForNumber() { + DQVariable intVar = new DQVariable<>("age", DQVariable.VariableType.NUMBER, 30); + assertEquals("age = 30", intVar.toString()); + } + + @Test + void testToStringForString() { + DQVariable stringVar = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + assertEquals("name = \"John\"", stringVar.toString()); + } + + @Test + void testToStringForDate() { + LocalDate date = LocalDate.of(2023, 5, 15); + DQVariable dateVar = new DQVariable<>("birthdate", DQVariable.VariableType.DATE, date); + assertEquals("birthdate = 2023-05-15", dateVar.toString()); + } + + @Test + void testToStringForDuration() { + Duration duration = Duration.ofHours(2); + DQVariable durationVar = new DQVariable<>("timeSpent", DQVariable.VariableType.DURATION, duration); + assertEquals("timeSpent = PT2H", durationVar.toString()); + } + + @Test + void testToStringForNumberArray() { + List numbers = Arrays.asList(1, 2, 3); + DQVariable> arrayVar = new DQVariable<>("numbers", 
DQVariable.VariableType.NUMBER_ARRAY, numbers); + assertEquals("numbers = [1, 2, 3]", arrayVar.toString()); + } + + @Test + void testToStringForStringArray() { + List names = Arrays.asList("John", "Jane", "Doe"); + DQVariable> arrayVar = new DQVariable<>("names", DQVariable.VariableType.STRING_ARRAY, names); + assertEquals("names = [John, Jane, Doe]", arrayVar.toString()); + } + + @Test + void testToStringForNullValue() { + DQVariable nullVar = new DQVariable<>("nullValue", DQVariable.VariableType.STRING, null); + assertEquals("nullValue = null", nullVar.toString()); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 1a2c0de..087ca11 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -107,7 +107,32 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileSize 1 GB]"), Arguments.of("Rules = [ FileSize <= 1 ZB ]"), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), - Arguments.of("((RowCount > 0) AND IsComplete") + Arguments.of("((RowCount > 0) AND IsComplete"), + Arguments.of("variable > 1"), + Arguments.of("Rules = [ RowCount > $min_count ]"), + Arguments.of("max_size = 1 ZB\nRules = [ FileSize <= $max_size ]"), + Arguments.of("min_count = 100\nRules = [ RowCount > $min_count ]"), + Arguments.of("min count = 100\nRules = [ RowCount > 100 ]"), + Arguments.of("min count = \nRules = [ RowCount > 100 ]"), + Arguments.of("Rules = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ RowCount > 100 ]"), + Arguments.of("Rules = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ RowCount > 100 ]"), + Arguments.of("allowed_counts = [5, 10, 15, 20]\n" + + "Rules = [ ColumnValues \"product_code\" not in $allowed_counts ]"), + Arguments.of(".allowed_counts = [5, 10, 15, 
20]\n" + + "Rules = [ ColumnValues \"product_code\" not in $.allowed_counts ]"), + Arguments.of("allowed_counts = [5, 10, 15, 20]\n" + + "Rules = [ ColumnValues \"product_code\" not in random$allowed_counts ]"), + Arguments.of("phone_pattern = '^\\\\d{3}-\\\\d{3}-\\\\d{4}$'\n" + + "Rules = [ ColumnValues \"colA\" matches $phone_pattern ]"), + Arguments.of("allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ ColumnValues \"colA\" in allowed_statuses ]"), + Arguments.of("allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ ColumnValues \"colA\" in $allowed_statuses ]"), + Arguments.of("max$value = 1000\nRules = [ RowCount > 1000 ]"), + Arguments.of("base = 10\nfactor = 5\nRules = [ RowCount > $base * $factor ]") ); } From ed66d4ce987bdcd9c4be7dbe3d24fbcb20c17549 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Wed, 27 Nov 2024 13:04:34 -0500 Subject: [PATCH 46/50] [DQDL] Introduce Minutes --- .../dqdl/DataQualityDefinitionLanguage.g4 | 2 +- .../model/condition/date/DateExpression.java | 37 ++++++++++++++---- .../condition/duration/DurationUnit.java | 1 + .../ml/dataquality/dqdl/model/DQRuleTest.java | 22 ++++++++++- .../condition/date/DateExpressionTest.java | 39 +++++++++++++++++-- 5 files changed, 87 insertions(+), 14 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 02f2bae..71bb836 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -10,7 +10,7 @@ analyzersSectionStart: 'Analyzers'; // Expressions dateNow: 'now()'; -durationUnit: 'days' | 'hours'; +durationUnit: 'days' | 'hours' | 'minutes'; durationExpression: (DIGIT | INT) durationUnit; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java index 57ea08e..02d1d0b 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java @@ -11,13 +11,13 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; -import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.Getter; import java.io.Serializable; import java.time.LocalDateTime; +import java.time.ZoneOffset; @EqualsAndHashCode public abstract class DateExpression implements Serializable { @@ -47,7 +47,7 @@ public String getFormattedExpression() { @Override public LocalDateTime getEvaluatedExpression() { - return LocalDateTime.now(); + return LocalDateTime.now(ZoneOffset.UTC); } } @@ -76,16 +76,37 @@ public String getFormattedExpression() { @Override public LocalDateTime getEvaluatedExpression() { - int hours = duration.getUnit().equals(DurationUnit.DAYS) - ? 
duration.getAmount() * 24 - : duration.getAmount(); + switch (duration.getUnit()) { + case MINUTES: + return evaluateMinutes( + operator, + duration.getAmount(), + LocalDateTime.now(ZoneOffset.UTC) + ); + case HOURS: + return evaluateMinutes( + operator, + duration.getAmount() * 60, + LocalDateTime.now(ZoneOffset.UTC).withMinute(0) + ); + case DAYS: + return evaluateMinutes( + operator, + duration.getAmount() * 60 * 24, + LocalDateTime.now(ZoneOffset.UTC).withMinute(0) + ); + default: + throw new RuntimeException("Unsupported duration unit: " + duration.getUnit()); + } + } - LocalDateTime dt = LocalDateTime.now().withMinute(0).withSecond(0).withNano(0); + private LocalDateTime evaluateMinutes(DateExpressionOperator operator, int minutes, LocalDateTime dt) { + dt = dt.withSecond(0).withNano(0); switch (operator) { case MINUS: - return dt.minusHours(hours); + return dt.minusMinutes(minutes); case PLUS: - return dt.plusHours(hours); + return dt.plusMinutes(minutes); default: return dt; } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java index f48f209..04577f9 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration; public enum DurationUnit { + MINUTES, HOURS, DAYS } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 7fbf8f6..728e42a 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -254,7 +254,10 @@ private static Stream provideRawRules() { Arguments.of("FileSize not in [500 KB,150 GB]"), Arguments.of("(RowCount > 
0) OR (IsComplete \"colA\") OR (IsUnique \"colA\")"), Arguments.of("(RowCount > 0) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), - Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))") + Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("FileFreshness \"S3://PATH\" > (now() - 30 minutes)"), + Arguments.of("FileFreshness \"S3://PATH\" > (now() + 45 minutes)"), + Arguments.of("ColumnValues \"col-A\" < (now() + 4 minutes)") ); } @@ -289,7 +292,10 @@ void test_fileFileFreshnessParsing() throws Exception { "FileFreshness \"S3://path\" between \"2023-02-07\" and \"2024-07-15\", " + "FileFreshness \"S3://path\" > (now() - 3 days), " + "FileFreshness \"S3://path\" < (now() - 4 days), " + - "FileFreshness between \"2023-02-07\" and \"2024-07-15\" " + + "FileFreshness between \"2023-02-07\" and \"2024-07-15\", " + + "FileFreshness > (now() + 35 minutes), " + + "FileFreshness <= (now() - 35 minutes), " + + "FileFreshness = (now() + 70 minutes) " + "]"; DQRuleset dqRuleset = parser.parse(fileRules); List ruleList = dqRuleset.getRules(); @@ -321,6 +327,18 @@ void test_fileFileFreshnessParsing() throws Exception { assertFalse(rule3.getParameters().containsKey("DataPath")); assertEquals("2023-02-07", removeQuotes(c3.getOperands().get(0).getFormattedExpression())); assertEquals("2024-07-15", removeQuotes(c3.getOperands().get(1).getFormattedExpression())); + + DQRule rule4 = ruleList.get(4); + DateBasedCondition c4 = (DateBasedCondition) rule4.getCondition(); + assertEquals("(now() + 35 minutes)", c4.getOperands().get(0).getFormattedExpression()); + + DQRule rule5 = ruleList.get(5); + DateBasedCondition c5 = (DateBasedCondition) rule5.getCondition(); + assertEquals("(now() - 35 minutes)", c5.getOperands().get(0).getFormattedExpression()); + + DQRule rule6 = ruleList.get(6); + DateBasedCondition c6 = (DateBasedCondition) rule6.getCondition(); + 
assertEquals("(now() + 70 minutes)", c6.getOperands().get(0).getFormattedExpression()); } @Test diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java index 432d818..aa8a1b1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java @@ -15,6 +15,7 @@ import org.junit.jupiter.api.Test; import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.time.temporal.ChronoUnit; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -44,7 +45,7 @@ public void test_currentDateFormattedExpression() { @Test public void test_currentDateEvaluatedExpression() { DateExpression.CurrentDate currentDate = new DateExpression.CurrentDate(); - LocalDateTime dt = LocalDateTime.now(); + LocalDateTime dt = LocalDateTime.now(ZoneOffset.UTC); assertEquals( dt.toString().substring(0, 10), currentDate.getEvaluatedExpression().toString().substring(0, 10) @@ -70,6 +71,26 @@ public void test_currentDateExpressionFormattedExpression() { ); } + @Test + public void test_currentDateExpressionEvaluatedExpressionForMinutes() { + DurationUnit unit = DurationUnit.MINUTES; + int amount = 24; + Duration duration = new Duration(amount, unit); + + DateExpression.DateExpressionOperator operator = + DateExpression.DateExpressionOperator.PLUS; + + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC).withSecond(0).withNano(0); + DateExpression.CurrentDateExpression currentDateExpression = + new DateExpression.CurrentDateExpression(operator, duration); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertEquals(amount, minutesDiff); + } + @Test public void test_currentDateExpressionEvaluatedExpressionForHours() { DurationUnit unit = 
DurationUnit.HOURS; @@ -79,7 +100,7 @@ public void test_currentDateExpressionEvaluatedExpressionForHours() { DateExpression.DateExpressionOperator operator = DateExpression.DateExpressionOperator.PLUS; - LocalDateTime currentDate = LocalDateTime.now().withMinute(0).withSecond(0).withNano(0); + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC).withMinute(0).withSecond(0).withNano(0); DateExpression.CurrentDateExpression currentDateExpression = new DateExpression.CurrentDateExpression(operator, duration); @@ -88,6 +109,12 @@ public void test_currentDateExpressionEvaluatedExpressionForHours() { ); assertEquals(amount, hoursDiff); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertEquals(amount * 60, minutesDiff); } @Test @@ -99,7 +126,7 @@ public void test_currentDateExpressionEvaluatedExpressionForDays() { DateExpression.DateExpressionOperator operator = DateExpression.DateExpressionOperator.MINUS; - LocalDateTime currentDate = LocalDateTime.now(); + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC); DateExpression.CurrentDateExpression currentDateExpression = new DateExpression.CurrentDateExpression(operator, duration); @@ -108,5 +135,11 @@ public void test_currentDateExpressionEvaluatedExpressionForDays() { ); assertTrue(amount * 24 + hoursDiff <= 1); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertTrue(amount * 24 * 60 + minutesDiff <= 1); } } From 43b91abb4ccd4935b8301c6e777df79a23419fc3 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Tue, 3 Dec 2024 13:13:26 -0500 Subject: [PATCH 47/50] DQDL - AM/PM Parsing --- configuration/dqdl/CommonLexerRules.g4 | 4 ++ .../dqdl/DataQualityDefinitionLanguage.g4 | 3 ++ .../model/condition/date/DateExpression.java | 16 +++++++ .../dqdl/parser/DQDLParserListener.java | 28 ++++++++++++ .../ml/dataquality/dqdl/model/DQRuleTest.java | 44 
++++++++++++++++++- .../dqdl/parser/InvalidDQRulesetTest.java | 9 ++++ 6 files changed, 102 insertions(+), 2 deletions(-) diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index f4ee958..194de8e 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -30,6 +30,10 @@ NEGATION: '!'; DIGIT: [0-9]; DATE: QUOTE DIGIT DIGIT DIGIT DIGIT '-' DIGIT DIGIT '-' DIGIT DIGIT QUOTE; +TIME: + QUOTE (DIGIT | DIGIT DIGIT) ':' DIGIT DIGIT (' AM' | ' PM') QUOTE; +MIL_TIME: + QUOTE DIGIT DIGIT ':' DIGIT DIGIT QUOTE; INT: DIGIT+; DECIMAL: INT '.' INT; QUOTED_STRING: QUOTE (ESC | .)*? QUOTE; diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 71bb836..6e5418a 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -23,11 +23,14 @@ sizeUnit: sizeExpression: (DIGIT | INT) sizeUnit; +timeExpression: TIME | MIL_TIME; + dateExpressionOp: ('-' | '+'); dateExpression: DATE | dateNow | LPAREN dateNow dateExpressionOp durationExpression RPAREN + | timeExpression | NULL; atomicNumber: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java index 02d1d0b..b14c3d1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java @@ -51,6 +51,22 @@ public LocalDateTime getEvaluatedExpression() { } } + @AllArgsConstructor + public static class StaticDateTime extends DateExpression { + private final LocalDateTime dateTime; + private final String dateTimeString; + + @Override + public String getFormattedExpression() { + return "\"" + dateTimeString + "\""; + } + + @Override + public LocalDateTime getEvaluatedExpression() { + return 
dateTime; + } + } + public enum DateExpressionOperator { MINUS, PLUS diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index e4622ab..3e9c851 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -52,6 +52,11 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -82,6 +87,9 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private static final String ADDITIONAL_SOURCES_KEY = "AdditionalDataSources"; private static final Set ALLOWED_SOURCES_KEYS; + private static final String MILITARY_TIME_FORMAT = "HH:mm"; + private static final String AMPM_TIME_FORMAT = "h:mm a"; + private static final int COMPOSITE_RULE_MAX_NESTING_DEPTH = 5; static { @@ -1080,11 +1088,31 @@ private Optional parseDateExpression( return Optional.of(new DateExpression.CurrentDate()); } else if (ctx.NULL() != null) { return Optional.of(new NullDateExpression()); + } else if (ctx.timeExpression() != null) { + final String time = removeQuotes(ctx.timeExpression().MIL_TIME() != null + ? ctx.timeExpression().MIL_TIME().getText() + : ctx.timeExpression().TIME().getText()); + final String pattern = ctx.timeExpression().MIL_TIME() != null + ? 
MILITARY_TIME_FORMAT + : AMPM_TIME_FORMAT; + return parseTime(time, pattern); } else { return Optional.of(new DateExpression.StaticDate(removeQuotes(ctx.DATE().getText()))); } } + private Optional parseTime(final String in, final String pattern) { + try { + final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); + final LocalTime time = LocalTime.parse(in, formatter); + final LocalDateTime dateTime = time.atDate(LocalDateTime.now(ZoneOffset.UTC).toLocalDate()); + return Optional.of(new DateExpression.StaticDateTime(dateTime, in)); + } catch (final DateTimeParseException e) { + errorMessages.add(String.format("Error Parsing Date: %s. %s.", in, e.getMessage())); + return Optional.empty(); + } + } + private Optional parseDuration( DataQualityDefinitionLanguageParser.DurationExpressionContext ctx) { int amount = Integer.parseInt(ctx.INT() != null ? ctx.INT().getText() : ctx.DIGIT().getText()); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 728e42a..3975454 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -32,9 +32,12 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; +import java.text.SimpleDateFormat; +import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -142,6 +145,12 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"order-id\" not in [1,2,3,4]"), Arguments.of("ColumnValues \"order-id\" in [\"1\",\"2\",\"3\",\"4\"]"), Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"), + Arguments.of("ColumnValues \"col-A\" < (now() + 4 minutes)"), + Arguments.of("ColumnValues \"col-A\" < (now() - 25 
minutes)"), + Arguments.of("ColumnValues \"col-A\" > \"9:30 AM\""), + Arguments.of("ColumnValues \"col-A\" > \"9:30 PM\""), + Arguments.of("ColumnValues \"col-A\" > \"19:30\""), + Arguments.of("ColumnValues \"col-A\" between \"9:00 AM\" and \"21:50\""), Arguments.of("Sum \"col_A-B.C\" > 100.0"), Arguments.of("Sum \"col_A-B.C\" > -100.0"), Arguments.of("Sum \"col_A-B.C\" > -100.0 where \"col-A > 100\""), @@ -206,8 +215,10 @@ private static Stream provideRawRules() { Arguments.of("ColumnValues \"col-A\" = 1 with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"2022-01-01\" with threshold > 0.98"), Arguments.of("DataFreshness \"col-A\" <= 3 days"), - Arguments.of("DataFreshness \"col-A\" > 30 hours"), + Arguments.of("DataFreshness \"col-A\" > 3 minutes"), + Arguments.of("DataFreshness \"col-A\" > 90 minutes"), Arguments.of("DataFreshness \"col-A\" between 2 days and 4 days"), + Arguments.of("DataFreshness \"col-A\" between 2 minutes and 4 minutes"), Arguments.of("DataFreshness \"col-A\" <= 3 days where \"col-A > 100\""), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" between 0.4 and 0.6"), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" > 0.98"), @@ -257,10 +268,39 @@ private static Stream provideRawRules() { Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("FileFreshness \"S3://PATH\" > (now() - 30 minutes)"), Arguments.of("FileFreshness \"S3://PATH\" > (now() + 45 minutes)"), - Arguments.of("ColumnValues \"col-A\" < (now() + 4 minutes)") + Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 AM\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 PM\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"09:30\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"13:30\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 PM\""), + Arguments.of("FileFreshness 
\"S3://PATH\" between \"9:30 AM\" and \"9:30 AM\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"09:30\" and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between (now() - 2 hours) and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between (now() + 5 minutes) and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and (now() + 10 minutes)") ); } + @Test + void test_AMPM_Parsing() throws Exception { + String rule = "Rules = [ FileFreshness \"S3://PATH\" between \"9:15 AM\" and \"21:45\" ]"; + DQRule parsedRule = parser.parse(rule).getRules().get(0); + DateBasedCondition c1 = (DateBasedCondition) parsedRule.getCondition(); + DateExpression d1 = c1.getOperands().get(0); + DateExpression d2 = c1.getOperands().get(1); + Date today = new Date(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + String todayStr = sdf.format(today); + assertEquals("FileFreshness", parsedRule.getRuleType()); + assertEquals("between \"9:15 AM\" and \"21:45\"", c1.getFormattedCondition()); + assertEquals("\"9:15 AM\"" ,d1.getFormattedExpression()); + assertEquals("\"21:45\"" ,d2.getFormattedExpression()); + assertEquals(todayStr + "T09:15", d1.getEvaluatedExpression().toString()); + assertEquals(todayStr + "T21:45", d2.getEvaluatedExpression().toString()); + } + @Test void test_sizeConditionParsing() throws Exception { List unitList = Arrays.asList("B", "KB", "MB", "GB", "TB"); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 087ca11..278edc4 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -62,6 +62,9 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ ColumnValues 
\"col-A\" in [1,\"2\"] ]"), Arguments.of("Rules = [ DataFreshness \"col-A\" <= 3 ]"), Arguments.of("Rules = [ DataFreshness \"col-A\" > 30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 9:30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 9:30 SM ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 25:30 ]"), Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), @@ -106,6 +109,12 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileSize > 1 KB with exampleTag != \"SAM\"]"), Arguments.of("Rules = [ FileSize 1 GB]"), Arguments.of("Rules = [ FileSize <= 1 ZB ]"), + Arguments.of("Rules = [ FileFreshness > 13:50 AM ]"), + Arguments.of("Rules = [ FileFreshness > 13:50 PM ]"), + Arguments.of("Rules = [ FileFreshness > 25:00 ]"), + Arguments.of("Rules = [ FileFreshness > 9:30 ]"), + Arguments.of("Rules = [ FileFreshness > 9:30 SM ]"), + Arguments.of("Rules = [ FileFreshness > 22:1s ]"), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("((RowCount > 0) AND IsComplete"), Arguments.of("variable > 1"), From 8226a2308f0fbb9bc997a1e2c6a775edffd66825 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Mon, 9 Dec 2024 17:18:17 -0500 Subject: [PATCH 48/50] TimeZone Override for DateBasedCondition --- .../dqdl/parser/DQDLParserListener.java | 94 +++++++++++-------- .../ml/dataquality/dqdl/model/DQRuleTest.java | 17 +++- .../dqdl/parser/InvalidDQRulesetTest.java | 1 + 3 files changed, 70 insertions(+), 42 deletions(-) diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 3e9c851..06d6ff4 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -52,11 +52,15 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; +import java.time.ZoneId; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; +import java.time.zone.ZoneRulesException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -331,30 +335,6 @@ private Either getDQRule( LinkedHashMap parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); - Condition condition; - - List> conditions = Arrays.stream(dqRuleType.getReturnType().split("\\|")) - .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext)) - .collect(Collectors.toList()); - - Optional> optionalCondition = conditions.stream().filter(Either::isRight).findFirst(); - if (optionalCondition.isPresent()) { - if (optionalCondition.get().isRight()) { - condition = optionalCondition.get().getRight(); - } else { - return Either.fromLeft(optionalCondition.get().getLeft()); - } - } else { - Optional> optionalFailedCondition = - conditions.stream().filter(Either::isLeft).findFirst(); - if (optionalFailedCondition.isPresent()) { - return Either.fromLeft(optionalFailedCondition.get().getLeft()); - } else { - return Either.fromLeft( - String.format("Error while parsing condition for rule with rule type: %s", ruleType)); - } - } - String whereClause = null; if (dqRuleContext.whereClause() != null) { if (dqRuleType.isWhereClauseSupported()) { @@ -406,6 +386,30 @@ private Either getDQRule( } } + Condition condition; + + List> conditions = Arrays.stream(dqRuleType.getReturnType().split("\\|")) + .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext, tags)) + .collect(Collectors.toList()); + + Optional> optionalCondition = 
conditions.stream().filter(Either::isRight).findFirst(); + if (optionalCondition.isPresent()) { + if (optionalCondition.get().isRight()) { + condition = optionalCondition.get().getRight(); + } else { + return Either.fromLeft(optionalCondition.get().getLeft()); + } + } else { + Optional> optionalFailedCondition = + conditions.stream().filter(Either::isLeft).findFirst(); + if (optionalFailedCondition.isPresent()) { + return Either.fromLeft(optionalFailedCondition.get().getLeft()); + } else { + return Either.fromLeft( + String.format("Error while parsing condition for rule with rule type: %s", ruleType)); + } + } + return Either.fromRight( DQRule.createFromParameterValueMapWithVariables( dqRuleType, parameterMap, condition, thresholdCondition, whereClause, tags, dqVariables) @@ -486,7 +490,8 @@ private Either getDQAnalyzer( private Either parseCondition( DQRuleType ruleType, String returnType, - DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { + DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext, + Map tags) { Either response = Either.fromLeft(String.format("Error parsing condition for return type: %s", returnType)); @@ -541,7 +546,7 @@ private Either parseCondition( ruleType.getRuleTypeName())); } else { Optional possibleCond = - parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition()); + parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition(), tags); if (possibleCond.isPresent()) { response = Either.fromRight(possibleCond.get()); @@ -833,14 +838,14 @@ private Optional parseStringOperand( } private Optional parseDateBasedCondition( - DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx) { + DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx, Map tags) { String exprStr = ctx.getText(); Condition condition = null; if (ctx.BETWEEN() != null && ctx.dateExpression().size() == 2) { - Optional lower = parseDateExpression(ctx.dateExpression(0)); - Optional upper = 
parseDateExpression(ctx.dateExpression(1)); + Optional lower = parseDateExpression(ctx.dateExpression(0), tags); + Optional upper = parseDateExpression(ctx.dateExpression(1), tags); if (lower.isPresent() && upper.isPresent()) { DateBasedConditionOperator op = (ctx.NOT() != null) ? DateBasedConditionOperator.NOT_BETWEEN @@ -850,35 +855,35 @@ private Optional parseDateBasedCondition( ); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(operand.get()) ); } } else if (ctx.GREATER_THAN() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(operand.get()) ); } } else if (ctx.LESS_THAN() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.LESS_THAN, Collections.singletonList(operand.get()) ); } } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(operand.get()) ); } } else if (ctx.EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = 
parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { DateBasedConditionOperator op = (ctx.NEGATION() != null) ? DateBasedConditionOperator.NOT_EQUALS @@ -891,9 +896,8 @@ private Optional parseDateBasedCondition( ctx.dateExpressionArray() != null && ctx.dateExpressionArray().dateExpression().size() > 0) { List> expressions = ctx.dateExpressionArray().dateExpression().stream() - .map(this::parseDateExpression) + .map(x -> parseDateExpression(x, tags)) .collect(Collectors.toList()); - if (expressions.stream().allMatch(Optional::isPresent)) { DateBasedConditionOperator op = (ctx.NOT() != null) ? DateBasedConditionOperator.NOT_IN @@ -1075,7 +1079,7 @@ private Optional parseSizeBasedCondition( } private Optional parseDateExpression( - DataQualityDefinitionLanguageParser.DateExpressionContext ctx) { + DataQualityDefinitionLanguageParser.DateExpressionContext ctx, Map tags) { if (ctx.durationExpression() != null) { Optional duration = parseDuration(ctx.durationExpression()); return duration.map(value -> new DateExpression.CurrentDateExpression( @@ -1095,21 +1099,29 @@ private Optional parseDateExpression( final String pattern = ctx.timeExpression().MIL_TIME() != null ? 
MILITARY_TIME_FORMAT : AMPM_TIME_FORMAT; - return parseTime(time, pattern); + final String timeZone = tags.getOrDefault("timeZone", "UTC"); + return parseTime(time, pattern, timeZone); } else { return Optional.of(new DateExpression.StaticDate(removeQuotes(ctx.DATE().getText()))); } } - private Optional parseTime(final String in, final String pattern) { + private Optional parseTime(final String in, final String pattern, final String timeZone) { try { + final ZoneId zoneId = ZoneId.of(timeZone); // https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); final LocalTime time = LocalTime.parse(in, formatter); - final LocalDateTime dateTime = time.atDate(LocalDateTime.now(ZoneOffset.UTC).toLocalDate()); - return Optional.of(new DateExpression.StaticDateTime(dateTime, in)); + final LocalDate today = LocalDate.now(); + final LocalDateTime localDateTime = LocalDateTime.of(today, time); + final ZonedDateTime zonedDateTime = localDateTime.atZone(zoneId); + final ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC); + return Optional.of(new DateExpression.StaticDateTime(utcTime.toLocalDateTime(), in)); } catch (final DateTimeParseException e) { errorMessages.add(String.format("Error Parsing Date: %s. %s.", in, e.getMessage())); return Optional.empty(); + } catch (final ZoneRulesException e) { + errorMessages.add(String.format("Error Parsing Time Zone: %s. 
%s.", timeZone, e.getMessage())); + return Optional.empty(); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 3975454..9a8b150 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -33,7 +33,6 @@ import java.io.ObjectOutputStream; import java.io.Serializable; import java.text.SimpleDateFormat; -import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -273,6 +272,7 @@ private static Stream provideRawRules() { Arguments.of("FileFreshness \"S3://PATH\" > \"09:30\""), Arguments.of("FileFreshness \"S3://PATH\" > \"13:30\""), Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with \"timeZone\" = \"America/New_York\""), Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 PM\""), Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 AM\""), Arguments.of("FileFreshness \"S3://PATH\" between \"09:30\" and \"21:45\""), @@ -283,6 +283,21 @@ private static Stream provideRawRules() { ); } + @Test + void test_Timezone() throws Exception { + String rule = "Rules = [ FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\", FileFreshness > \"19:30\" with timeZone = \"Asia/Dubai\", FileFreshness > \"9:30 AM\" ]"; + List rules = parser.parse(rule).getRules(); + DateBasedCondition c1 = (DateBasedCondition) rules.get(0).getCondition(); + DateBasedCondition c2 = (DateBasedCondition) rules.get(1).getCondition(); + DateBasedCondition c3 = (DateBasedCondition) rules.get(2).getCondition(); + Date today = new Date(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + String todayStr = sdf.format(today); + assertEquals(todayStr + "T14:30", c1.getOperands().get(0).getEvaluatedExpression().toString()); + 
assertEquals(todayStr + "T15:30", c2.getOperands().get(0).getEvaluatedExpression().toString()); + assertEquals(todayStr + "T09:30", c3.getOperands().get(0).getEvaluatedExpression().toString()); + } + @Test void test_AMPM_Parsing() throws Exception { String rule = "Rules = [ FileFreshness \"S3://PATH\" between \"9:15 AM\" and \"21:45\" ]"; diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 278edc4..e09f2a6 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -115,6 +115,7 @@ private static Stream provideInvalidRulesets() { Arguments.of("Rules = [ FileFreshness > 9:30 ]"), Arguments.of("Rules = [ FileFreshness > 9:30 SM ]"), Arguments.of("Rules = [ FileFreshness > 22:1s ]"), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with \"timeZone\" = \"America/Dubai\""), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("((RowCount > 0) AND IsComplete"), Arguments.of("variable > 1"), From 77dd8d4006c72e6b0805a97c1603873257aee157 Mon Sep 17 00:00:00 2001 From: Dongying Song Date: Tue, 10 Dec 2024 13:37:53 -0500 Subject: [PATCH 49/50] Support string variable --- .../dqdl/DataQualityDefinitionLanguage.g4 | 5 ++++- .../dqdl/parser/DQDLParserListener.java | 17 +++++++++++------ .../dataquality/dqdl/model/DQRulesetTest.java | 11 +++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 6e5418a..2ce274c 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -140,7 +140,10 @@ dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
tagWithCond dqAnalyzer: analyzerType parameterWithConnectorWord*; // Variable Declarations -expression: stringValuesArray; +expression: + stringValues + | stringValuesArray; + variableDeclaration: IDENTIFIER EQUAL_TO expression; variableDeclarations: variableDeclaration*; diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 06d6ff4..1bce4c6 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -291,14 +291,12 @@ public void enterVariableDeclaration(DataQualityDefinitionLanguageParser.Variabl if (expr.stringValuesArray() != null) { List values = expr.stringValuesArray().stringValues().stream() - .map(sv -> { - if (sv.quotedString() != null) { - return removeQuotes(sv.quotedString().getText()); - } - return sv.getText(); - }) + .map(this::processStringValues) .collect(Collectors.toList()); variable = new DQVariable(variableName, DQVariable.VariableType.STRING_ARRAY, values); + } else if (expr.stringValues() != null) { + String value = processStringValues(expr.stringValues()); + variable = new DQVariable(variableName, DQVariable.VariableType.STRING, value); } if (variable != null) { @@ -1217,4 +1215,11 @@ private boolean isValidEnumValue(String value) { } } + private String processStringValues(DataQualityDefinitionLanguageParser.StringValuesContext sv) { + if (sv.quotedString() != null) { + return removeQuotes(sv.quotedString().getText()); + } + return sv.getText(); + } + } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index cabf267..73df39d 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -554,6 +554,17 @@ public void 
testVariableDefinitionMissing() { dqRuleset.getRules().get(0).toString()); } + @Test + public void testStringVariable() { + String dqdl = + "sqlString = \"select id from primary where age < 100\"\n" + + "Rules = [ CustomSql \"select id from primary where age < 100\" ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("CustomSql \"select id from primary where age < 100\"", + dqRuleset.getRules().get(0).toString()); + } @Test void test_multipleRules() { From 97bd8852567d74e108aadb7848e74179dd345572 Mon Sep 17 00:00:00 2001 From: Sam Pomerantz Date: Mon, 16 Dec 2024 12:20:02 -0500 Subject: [PATCH 50/50] Updated Tag Parsing. Fix URI Bug. --- .../dqdl/DataQualityDefinitionLanguage.g4 | 2 +- .../ml/dataquality/dqdl/model/DQRule.java | 23 ++-- .../dqdl/model/condition/string/Tag.java | 46 +++++++ .../dqdl/parser/DQDLParserListener.java | 122 ++++++++++++------ .../ml/dataquality/dqdl/model/DQRuleTest.java | 29 ++++- .../dqdl/parser/InvalidDQRulesetTest.java | 2 + 6 files changed, 165 insertions(+), 59 deletions(-) create mode 100644 src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index 2ce274c..26b4d6d 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -87,7 +87,7 @@ stringBasedCondition: | NOT? IN stringValuesArray | NOT? IN variableDereference | NOT? 
matchesRegexCondition; -tagValues: quotedString | IDENTIFIER; +tagValues: IDENTIFIER; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; dateBasedCondition: diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index c7e61c0..b115a0e 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag; import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; @@ -24,6 +25,7 @@ import java.util.List; import java.util.Map; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap; import static com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLVariableResolver.resolveVariablesInCondition; import static com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils.isBlank; @@ -41,7 +43,7 @@ public class DQRule implements Serializable, HasRuleTypeAndParameters { private final List nestedRules; private final String whereClause; private Boolean isExcludedAtRowLevelInCompositeRules = false; - private Map tags; + private Map tags; // Adding this constructor so as to not break the Data Quality ETL package. 
public DQRule(final String ruleType, @@ -118,7 +120,7 @@ public static DQRule createFromParameterValueMap(final DQRuleType ruleType, final Condition condition, final Condition thresholdCondition, final String whereClause, - final Map tags) { + final Map tags) { DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; List nestedRules = new ArrayList<>(); @@ -143,7 +145,7 @@ public static DQRule createFromParameterValueMapWithVariables(final DQRuleType r final Condition condition, final Condition thresholdCondition, final String whereClause, - final Map tags, + final Map tags, final Map variables) { // Create the unresolved rule first DQRule unresolvedRule = createFromParameterValueMap(ruleType, parameters, condition, @@ -185,6 +187,10 @@ public DQRule withCondition(final Condition condition) { return this.toBuilder().condition(condition).build(); } + public Map getTags() { + return convertToStringMap(tags); + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -211,14 +217,9 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } - if (tags != null && !tags.isEmpty()) { - sb.append(" "); - for (Map.Entry entry : tags.entrySet()) { - sb.append("with \"") - .append(entry.getKey()) - .append("\" = \"") - .append(entry.getValue()) - .append("\" "); + if (tags != null) { + for (Map.Entry entry : tags.entrySet()) { + sb.append(entry.getValue()); } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java new file mode 100644 index 0000000..5b03058 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java @@ -0,0 +1,46 @@ +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.AllArgsConstructor; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Map; +import 
java.util.stream.Collectors; + +@AllArgsConstructor +public class Tag implements Serializable { + private final String key; + private final String value; + + public static Map convertToStringMap(Map tags) { + if (tags == null) { + return Collections.emptyMap(); + } + return tags.entrySet().stream() + .collect(Collectors.toMap( + entry -> entry.getValue().getKey(), + entry -> entry.getValue().getValue() + )); + } + + public String getKey() { + return removeQuotes(this.key); + } + + public String getValue() { + return removeQuotes(this.value); + } + + @Override + public String toString() { + return String.format(" with %s = %s", key, value); + } + + private String removeQuotes(String quotedString) { + if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { + quotedString = quotedString.substring(1); + quotedString = quotedString.substring(0, quotedString.length() - 1); + } + return quotedString; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 1bce4c6..3fa355d 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -45,10 +45,10 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; -import org.antlr.v4.runtime.misc.Pair; -import org.antlr.v4.runtime.tree.ParseTree; +import org.antlr.v4.runtime.ParserRuleContext; import java.lang.reflect.InvocationTargetException; import 
java.lang.reflect.Method; @@ -73,6 +73,8 @@ import java.util.Set; import java.util.stream.Collectors; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap; + public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListener { private final DQDLErrorListener errorListener; private final List errorMessages = new ArrayList<>(); @@ -90,6 +92,7 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private static final String PRIMARY_SOURCE_KEY = "Primary"; private static final String ADDITIONAL_SOURCES_KEY = "AdditionalDataSources"; private static final Set ALLOWED_SOURCES_KEYS; + private static final String THRESHOLD_KEY = "threshold"; private static final String MILITARY_TIME_FORMAT = "HH:mm"; private static final String AMPM_TIME_FORMAT = "h:mm a"; @@ -349,45 +352,49 @@ private Either getDQRule( } Condition thresholdCondition = null; - Map tags = new HashMap<>(); + Map tags = new HashMap<>(); List tagContexts = - dqRuleContext.tagWithCondition(); - if (tagContexts != null && !tagContexts.isEmpty()) { - for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) { - if (tagContext.numberBasedCondition() != null) { - if (dqRuleType.isThresholdSupported()) { - if (thresholdCondition != null) { - return Either.fromLeft("Only one threshold condition at a time is supported."); - } - Either outcome = processThresholdTag(tagContext, ruleType); - if (outcome.isLeft()) { - return Either.fromLeft(outcome.getLeft()); - } else { - thresholdCondition = outcome.getRight(); - } + (dqRuleContext.tagWithCondition() == null) ? 
new ArrayList<>() : dqRuleContext.tagWithCondition(); + for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) { + if (tagContext.stringBasedCondition() != null) { + //process plain string tag + final Either outcome = processStringTag(tagContext); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + final Tag tag = outcome.getRight(); + tags.put(tag.getKey(), tag); + } + } else if (tagContext.numberBasedCondition() != null) { + final String tagName = tagContext.tagValues().getText(); + if (tagName.equalsIgnoreCase(THRESHOLD_KEY)) { + //process threshold tag + final Either outcome = + processThresholdTag(dqRuleType, thresholdCondition, tagContext, ruleType); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); } else { - return Either.fromLeft(String.format( - "Threshold condition not supported for rule type: %s", ruleType)); + thresholdCondition = outcome.getRight(); } - } else if (tagContext.stringBasedCondition() != null) { - Either> outcome = processStringTag(tagContext); + } else { + //convert number tag into string tag + final Either outcome = processNumberTag(tagContext, tagName); if (outcome.isLeft()) { return Either.fromLeft(outcome.getLeft()); } else { - Pair pair = outcome.getRight(); - tags.put(pair.a, pair.b); + final Tag tag = outcome.getRight(); + tags.put(tag.getKey(), tag); } - } else { - return Either.fromLeft(String.format( - "Invalid tag provided for rule type: %s", ruleType)); } + } else { + return Either.fromLeft(String.format("Invalid tag provided for rule type: %s", ruleType)); } } Condition condition; List> conditions = Arrays.stream(dqRuleType.getReturnType().split("\\|")) - .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext, tags)) + .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext, convertToStringMap(tags))) .collect(Collectors.toList()); Optional> optionalCondition = conditions.stream().filter(Either::isRight).findFirst(); @@ -414,17 +421,48 
@@ private Either getDQRule( ); } - private Either> processStringTag( + private Either processThresholdTag(DQRuleType dqRuleType, + Condition thresholdCondition, + DataQualityDefinitionLanguageParser + .TagWithConditionContext tagContext, + String ruleType) { + if (dqRuleType.isThresholdSupported()) { + if (thresholdCondition != null) { + return Either.fromLeft("Only one threshold condition at a time is supported."); + } + return processThresholdTag(tagContext, ruleType); + } else { + return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); + } + } + + private Either processNumberTag(DataQualityDefinitionLanguageParser + .TagWithConditionContext tagContext, + String tagName) { + if (!isTagValid(tagContext.numberBasedCondition())) { + return Either.fromLeft("Number tags only support the equality operator."); + } + final List numberContexts = + tagContext.numberBasedCondition().number(); + if (numberContexts != null && !numberContexts.isEmpty()) { + final String tagValue = numberContexts.get(0).getText(); + return Either.fromRight(new Tag(tagName, tagValue)); + } else { + return Either.fromLeft(String.format("Error Parsing Tag %s", tagName)); + } + } + + private Either processStringTag( DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext) { if (!isTagValid(tagContext.stringBasedCondition())) { - return Either.fromLeft("Only EQUAL_TO condition is supported for String tags."); + return Either.fromLeft("String tags only support the equality operator."); } - String tagKey = getKeyFromTag(tagContext.tagValues()); + String tagKey = tagContext.tagValues().getText(); Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition()); if (valueCondition.isPresent()) { StringBasedCondition stringCondition = (StringBasedCondition) valueCondition.get(); - String tagValue = stringCondition.getOperands().get(0).getOperand(); - return Either.fromRight(new Pair<>(tagKey, tagValue)); + String 
tagValue = stringCondition.getOperands().get(0).formatOperand(); + return Either.fromRight(new Tag(tagKey, tagValue)); } else { return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey)); } @@ -443,14 +481,18 @@ private Either processThresholdTag( } } - private boolean isTagValid(DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx) { - return ctx.EQUAL_TO() != null && ctx.NEGATION() == null; - } - - private String getKeyFromTag(DataQualityDefinitionLanguageParser.TagValuesContext tagValuesContext) { - Optional identifierKey = Optional.ofNullable(tagValuesContext.IDENTIFIER()).map(ParseTree::getText); - Optional stringKey = Optional.ofNullable(tagValuesContext.quotedString()).map(ParseTree::getText); - return removeQuotes(identifierKey.orElseGet(stringKey::get)); + private boolean isTagValid(ParserRuleContext ctx) { + if (ctx instanceof DataQualityDefinitionLanguageParser.StringBasedConditionContext) { + final DataQualityDefinitionLanguageParser.StringBasedConditionContext stringCtx = + (DataQualityDefinitionLanguageParser.StringBasedConditionContext) ctx; + return stringCtx.EQUAL_TO() != null && stringCtx.NEGATION() == null; + } else if (ctx instanceof DataQualityDefinitionLanguageParser.NumberBasedConditionContext) { + final DataQualityDefinitionLanguageParser.NumberBasedConditionContext numberCtx = + (DataQualityDefinitionLanguageParser.NumberBasedConditionContext) ctx; + return numberCtx.EQUAL_TO() != null && numberCtx.NEGATION() == null; + } else { + return false; + } } private Either getDQAnalyzer( diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 9a8b150..3139102 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -243,9 +243,9 @@ private static Stream provideRawRules() { Arguments.of("FileMatch \"S3://PATH\" in 
[\"hashList\"]"), Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"]"), Arguments.of("FileMatch in [\"hashList\",\"hashList\"]"), - Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with \"hashAlgorithm\" = \"MD5\""), - Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with \"randomTagThing\" = \"@sampom\""), - Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with \"tag1\" = \"sampom\" with \"tag2\" = \"pomsam\""), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), + Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with randomTagThing = \"@sampom\""), + Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with tag1 = \"sampom\" with tag2 = \"pomsam\""), Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9"), Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""), @@ -272,7 +272,7 @@ private static Stream provideRawRules() { Arguments.of("FileFreshness \"S3://PATH\" > \"09:30\""), Arguments.of("FileFreshness \"S3://PATH\" > \"13:30\""), Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\""), - Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with \"timeZone\" = \"America/New_York\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with timeZone = \"America/New_York\""), Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 PM\""), Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 AM\""), Arguments.of("FileFreshness \"S3://PATH\" between \"09:30\" and \"21:45\""), @@ -283,6 +283,21 @@ private static Stream provideRawRules() { ); } + @Test + void test_TagFormatting() throws Exception { + final String rule = "Rules = [ " + + "FileFreshness > \"9:30 AM\" with recentFiles = 1, " + + "FileFreshness > \"9:30 AM\" with recentFiles = \"1\", " + + "FileFreshness > \"9:30 AM\" with matchFileName = \"True\", " + + "FileFreshness > \"9:30 AM\" 
with timeZone = \"America/New_York\" " + + "]"; + List rules = parser.parse(rule).getRules(); + assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = 1", rules.get(0).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = \"1\"", rules.get(1).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with matchFileName = \"True\"", rules.get(2).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\"", rules.get(3).toString()); + } + @Test void test_Timezone() throws Exception { String rule = "Rules = [ FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\", FileFreshness > \"19:30\" with timeZone = \"Asia/Dubai\", FileFreshness > \"9:30 AM\" ]"; @@ -399,10 +414,10 @@ void test_fileFileFreshnessParsing() throws Exception { @Test void test_checksumRuleParsing() throws Exception { String fileRules = "Rules = [ " + - "FileMatch in [\"exampleHash\"] with \"hashAlgorithm\" = \"MD5\" with \"dataFrame\" = \"true\" ," + - "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with \"hashAlgorithm\" = \"SHA-256\" ," + + "FileMatch in [\"exampleHash\"] with hashAlgorithm = \"MD5\" with dataFrame = \"true\" ," + + "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with hashAlgorithm = \"SHA-256\" ," + "FileMatch \"s3://sampom-bucket3/\" in [\"exampleHash3\"] ," + - "FileMatch in [\"exampleHash4\"] with \"dataFrame\" = \"true\"" + + "FileMatch in [\"exampleHash4\"] with dataFrame = \"true\"" + "]"; DQRuleset dqRuleset = parser.parse(fileRules); List ruleList = dqRuleset.getRules(); diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index e09f2a6..7dcc615 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -115,6 +115,8 @@ private static Stream 
provideInvalidRulesets() { Arguments.of("Rules = [ FileFreshness > 9:30 ]"), Arguments.of("Rules = [ FileFreshness > 9:30 SM ]"), Arguments.of("Rules = [ FileFreshness > 22:1s ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" > \"9:30 PM\" with threshold > 1 with threshold = 2 ]"), + Arguments.of("FileFreshness > \"9:30 AM\" with timeZone = \"America/New_Chicago\""), Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with \"timeZone\" = \"America/Dubai\""), Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), Arguments.of("((RowCount > 0) AND IsComplete"),