diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4 index d1890a6..194de8e 100644 --- a/configuration/dqdl/CommonLexerRules.g4 +++ b/configuration/dqdl/CommonLexerRules.g4 @@ -12,6 +12,10 @@ LPAREN: '('; RPAREN: ')'; AND: 'and' | 'AND'; OR: 'or' | 'OR'; +OF: 'of' | 'OF'; +NULL: 'null' | 'NULL'; +EMPTY: 'empty' | 'EMPTY'; +WHITESPACES_ONLY: 'whitespaces_only' | 'WHITESPACES_ONLY'; BETWEEN: 'between'; EQUAL_TO: '='; @@ -20,10 +24,16 @@ GREATER_THAN_EQUAL_TO: '>='; LESS_THAN: '<'; LESS_THAN_EQUAL_TO: '<='; IN: 'in'; +NOT: 'not'; +NEGATION: '!'; DIGIT: [0-9]; DATE: QUOTE DIGIT DIGIT DIGIT DIGIT '-' DIGIT DIGIT '-' DIGIT DIGIT QUOTE; +TIME: + QUOTE (DIGIT | DIGIT DIGIT) ':' DIGIT DIGIT (' AM' | ' PM') QUOTE; +MIL_TIME: + QUOTE DIGIT DIGIT ':' DIGIT DIGIT QUOTE; INT: DIGIT+; DECIMAL: INT '.' INT; QUOTED_STRING: QUOTE (ESC | .)*? QUOTE; @@ -31,7 +41,7 @@ NEGATIVE: '-'; LINE_COMMENT: '#' .*? '\r'? '\n' -> skip; // Match "#" stuff '\n' -IDENTIFIER: [a-zA-Z0-9]+; +IDENTIFIER: [a-zA-Z0-9_.]+; WS: [ \t\n]+ -> skip; diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4 index e5cf0d4..26b4d6d 100644 --- a/configuration/dqdl/DataQualityDefinitionLanguage.g4 +++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4 @@ -1,25 +1,39 @@ -grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL" +grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL" import CommonLexerRules; // Sections metadataSectionStart: 'Metadata'; dataSourcesSectionStart: 'DataSources'; rulesSectionStart: 'Rules'; +analyzersSectionStart: 'Analyzers'; // Expressions dateNow: 'now()'; -durationUnit: 'days' | 'hours'; +durationUnit: 'days' | 'hours' | 'minutes'; durationExpression: (DIGIT | INT) durationUnit; +sizeUnit: + 'B' + | 'KB' + | 'MB' + | 'GB' + | 'TB'; + +sizeExpression: (DIGIT | INT) sizeUnit; + +timeExpression: TIME | MIL_TIME; + dateExpressionOp: ('-' | '+'); dateExpression: 
DATE | dateNow - | LPAREN dateNow dateExpressionOp durationExpression RPAREN; + | LPAREN dateNow dateExpressionOp durationExpression RPAREN + | timeExpression + | NULL; -number: +atomicNumber: DIGIT | NEGATIVE DIGIT | INT @@ -27,72 +41,132 @@ number: | DECIMAL | NEGATIVE DECIMAL; +functionParameters: + number + | number (COMMA number)*; + +functionCall: + IDENTIFIER LPAREN RPAREN + | IDENTIFIER LPAREN functionParameters RPAREN; + +numberOp: '+' | '-' | '/' | '*'; + +number: + number numberOp number + | functionCall + | LPAREN number RPAREN + | atomicNumber + | NULL; + quotedString: QUOTED_STRING; matchesRegexCondition: 'matches' quotedString; numberArray: LBRAC number (COMMA number)* RBRAC; numberBasedCondition: - BETWEEN number AND number + NOT? BETWEEN number AND number | GREATER_THAN number | GREATER_THAN_EQUAL_TO number | LESS_THAN number | LESS_THAN_EQUAL_TO number - | EQUAL_TO number - | IN numberArray; + | NEGATION? EQUAL_TO number + | NOT? IN numberArray; + +variableDereference: '$' IDENTIFIER; + +stringValues: + quotedString + | variableDereference + | NULL + | EMPTY + | WHITESPACES_ONLY; -quotedStringArray: LBRAC quotedString (COMMA quotedString)* RBRAC; +stringValuesArray: LBRAC stringValues (COMMA stringValues)* RBRAC; stringBasedCondition: - EQUAL_TO quotedString - | IN quotedStringArray - | matchesRegexCondition; + NEGATION? EQUAL_TO stringValues + | NOT? IN stringValuesArray + | NOT? IN variableDereference + | NOT? matchesRegexCondition; +tagValues: IDENTIFIER; dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC; dateBasedCondition: - BETWEEN dateExpression AND dateExpression + NOT? BETWEEN dateExpression AND dateExpression | GREATER_THAN dateExpression | GREATER_THAN_EQUAL_TO dateExpression | LESS_THAN dateExpression | LESS_THAN_EQUAL_TO dateExpression - | EQUAL_TO dateExpression - | IN dateExpressionArray; + | NEGATION? EQUAL_TO dateExpression + | NOT? 
IN dateExpressionArray; durationExpressionArray: LBRAC durationExpression (COMMA durationExpression)* RBRAC; durationBasedCondition: - BETWEEN durationExpression AND durationExpression + NOT? BETWEEN durationExpression AND durationExpression | GREATER_THAN durationExpression | GREATER_THAN_EQUAL_TO durationExpression | LESS_THAN durationExpression | LESS_THAN_EQUAL_TO durationExpression - | EQUAL_TO durationExpression - | IN durationExpressionArray; + | NEGATION? EQUAL_TO durationExpression + | NOT? IN durationExpressionArray; + +sizeExpressionArray: LBRAC sizeExpression (COMMA sizeExpression)* RBRAC; +sizeBasedCondition: + NOT? BETWEEN sizeExpression AND sizeExpression + | GREATER_THAN sizeExpression + | GREATER_THAN_EQUAL_TO sizeExpression + | LESS_THAN sizeExpression + | LESS_THAN_EQUAL_TO sizeExpression + | NEGATION? EQUAL_TO sizeExpression + | NOT? IN sizeExpressionArray; ruleType: IDENTIFIER; -parameter: (QUOTED_STRING | INT | DIGIT); +analyzerType: IDENTIFIER; +parameter: QUOTED_STRING + | IDENTIFIER; +connectorWord: OF | AND; +parameterWithConnectorWord: connectorWord? parameter; +tagWithCondition: 'with' tagValues (stringBasedCondition | numberBasedCondition); condition: numberBasedCondition | stringBasedCondition | dateBasedCondition - | durationBasedCondition; + | durationBasedCondition + | sizeBasedCondition; -withThresholdCondition: 'with' 'threshold' numberBasedCondition; +whereClause: 'where' quotedString; -dqRule: ruleType parameter* condition? withThresholdCondition?; +dqRule: ruleType parameterWithConnectorWord* condition? whereClause? 
tagWithCondition*; +dqAnalyzer: analyzerType parameterWithConnectorWord*; + +// Variable Declarations +expression: + stringValues + | stringValuesArray; + +variableDeclaration: + IDENTIFIER EQUAL_TO expression; +variableDeclarations: variableDeclaration*; topLevelRule: - dqRule - | '(' dqRule ')' (AND '(' dqRule ')')* - | '(' dqRule ')' (OR '(' dqRule ')')*; + LPAREN topLevelRule RPAREN + | topLevelRule AND topLevelRule + | topLevelRule OR topLevelRule + | dqRule; // Rules Definition dqRules: topLevelRule (COMMA topLevelRule)*; +dqAnalyzers: dqAnalyzer (COMMA dqAnalyzer)*; // Top Level Document rules: rulesSectionStart EQUAL_TO LBRAC dqRules RBRAC | rulesSectionStart EQUAL_TO LBRAC RBRAC; // empty array +analyzers: + analyzersSectionStart EQUAL_TO LBRAC dqAnalyzers RBRAC + | analyzersSectionStart EQUAL_TO LBRAC RBRAC; // empty array + // This dictionary does not support nested dictionaries. Just strings and arrays. dictionary: LCURL pair (COMMA pair)* RCURL; pair: QUOTED_STRING COLON pairValue; @@ -101,5 +175,6 @@ array: LBRAC QUOTED_STRING (COMMA QUOTED_STRING)* RBRAC; metadata: metadataSectionStart EQUAL_TO dictionary; dataSources: dataSourcesSectionStart EQUAL_TO dictionary; +rulesOrAnalyzers: rules | analyzers | rules analyzers; -document: metadata? dataSources? rules; +document: metadata? dataSources? variableDeclarations? 
rulesOrAnalyzers; diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json index bb4dea8..0c8a68e 100644 --- a/configuration/rules/rules-config.json +++ b/configuration/rules/rules-config.json @@ -4,7 +4,9 @@ "rule_type_name": "RowCount", "description": "Check the number of rows in the dataset", "parameters": [], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "table" }, { "rule_type_name": "RowCountMatch", @@ -16,13 +18,15 @@ "description": "Alias of reference dataset" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" }, { "rule_type_name": "ColumnCount", "description": "Checks the number of columns in the dataset", "parameters": [], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "Completeness", @@ -34,7 +38,10 @@ "description": "Name of column to check completeness of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_excluded_at_row_level_in_composite_rules": true, + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "IsComplete", @@ -46,11 +53,13 @@ "description": "Name of column to check completeness of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "ColumnDataType", - "description": "Check the data type of the given column", + "description": "Check the data type of the given column. 
Supported values: Boolean, Date, Timestamp, Integer, Double, Float, Long", "parameters": [ { "type": "String", @@ -59,7 +68,9 @@ } ], "return_type": "STRING", - "is_threshold_supported": true + "is_threshold_supported": true, + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "ColumnNamesMatchPattern", @@ -71,7 +82,8 @@ "description": "Pattern to match against the names of the columns" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "table" }, { "rule_type_name": "ColumnExists", @@ -83,7 +95,8 @@ "description": "Name of column to check existence of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "scope": "column" }, { "rule_type_name": "ColumnCorrelation", @@ -100,7 +113,9 @@ "description": "Name of second column" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "Uniqueness", @@ -112,7 +127,10 @@ "description": "Name of column to check uniqueness of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_excluded_at_row_level_in_composite_rules": true, + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "IsUnique", @@ -124,7 +142,9 @@ "description": "Name of column to check uniqueness of" } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "Mean", @@ -136,7 +156,9 @@ "description": "Name of column to check mean of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "Sum", @@ -148,7 +170,9 @@ "description": "Name of column to check sum of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "StandardDeviation", @@ -160,7 +184,9 @@ "description": "Name of column to check standard deviation of" } ], - "return_type": 
"NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "Entropy", @@ -172,7 +198,9 @@ "description": "Name of column to check entropy of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "DistinctValuesCount", @@ -184,7 +212,9 @@ "description": "Name of column to check distinct values count of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "UniqueValueRatio", @@ -196,7 +226,9 @@ "description": "Name of column to check unique value ratio of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "ColumnLength", @@ -208,7 +240,9 @@ "description": "Name of column to check the length of the values of" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "IsPrimaryKey", @@ -221,7 +255,9 @@ "is_var_arg": true } ], - "return_type": "BOOLEAN" + "return_type": "BOOLEAN", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "ColumnValues", @@ -234,7 +270,9 @@ } ], "return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY", - "is_threshold_supported": true + "is_threshold_supported": true, + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "DataFreshness", @@ -246,7 +284,9 @@ "description": "Name of column to check the freshness of" } ], - "return_type": "DURATION_ARRAY" + "return_type": "DURATION_ARRAY", + "is_where_clause_supported": true, + "scope": "column" }, { "rule_type_name": "CustomSql", @@ -259,7 +299,8 @@ } ], "return_type": "NUMBER|BOOLEAN", - "is_threshold_supported": true + "is_threshold_supported": true, + "scope": "table" }, { "rule_type_name": "ReferentialIntegrity", @@ -276,7 +317,9 @@ "description": 
"Alias of reference dataset and comma separated names of columns from reference dataset. The alias and the names should be separated by a period. The names should be enclosed in curly brackets." } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_excluded_at_row_level_in_composite_rules": true, + "scope": "table" }, { "rule_type_name": "DatasetMatch", @@ -293,7 +336,9 @@ "description": "Mappings of key columns used for joining the two datasets" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "is_excluded_at_row_level_in_composite_rules": true, + "scope": "table" }, { "rule_type_name": "DatasetMatch", @@ -315,7 +360,8 @@ "description": "Mappings of columns used for matching" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "SchemaMatch", @@ -327,7 +373,8 @@ "description": "Alias of reference dataset" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "table" }, { "rule_type_name": "AggregateMatch", @@ -344,7 +391,161 @@ "description": "The second aggregate expression" } ], - "return_type": "NUMBER" + "return_type": "NUMBER", + "scope": "column" + }, + { + "rule_type_name": "DetectAnomalies", + "description": "Checks if the current value of the metric is anomalous with respect to the historical values", + "parameters": [ + { + "type": "String", + "name": "MetricParameter", + "description": "The parameters required to evaluate the metric. The first parameter must be the metric name.", + "is_var_arg": true + } + ], + "return_type": "BOOLEAN", + "scope": "column" + }, + { + "rule_type_name": "AllStatistics", + "is_analyzer_only": true, + "description": "Analyzer Only. 
Produces a collection of statistics.", + "parameters": [ + { + "type": "String", + "name": "TargetColumn", + "description": "Name of the column to analyze" + } + ], + "return_type": "NUMBER", + "scope": "column" + }, + { + "rule_type_name": "FileMatch", + "description": "Match Files/Directories against Files/Directories.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "File/Directory for comparison" + }, + { + "type": "String", + "name": "CompareDataPath", + "description": "Other File/Directory for comparison" + } + ], + "return_type": "BOOLEAN", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileMatch", + "description": "Match Files/Directories inferred from DataFrames against a list of checksum values.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "File/Directory for comparison" + } + ], + "return_type": "STRING", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileMatch", + "description": "Match Files/Directories inferred from DataFrames against a list of checksum values.", + "parameters": [], + "return_type": "STRING", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileUniqueness", + "description": "Checks the contents of a folder and the uniqueness of each file within.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data Path for FileUniqueness." 
+ } + ], + "return_type": "NUMBER", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileUniqueness", + "description": "Checks the contents of an inferred folder and the uniqueness of each file within.", + "parameters": [], + "return_type": "NUMBER", + "is_threshold_supported": false, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileFreshness", + "description": "Checks the age of a filepath against a specified date.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data Path for FileFreshness." + } + ], + "return_type": "DATE", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileFreshness", + "description": "Checks the age of an inferred file against a specified date.", + "parameters": [], + "return_type": "DATE", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileSize", + "description": "Checks the size of a filepath.", + "parameters": [ + { + "type": "String", + "name": "DataPath", + "description": "Data Path for FileSize." 
+ } + ], + "return_type": "SIZE", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true + }, + { + "rule_type_name": "FileSize", + "description": "Checks the size of an inferred file.", + "parameters": [], + "return_type": "SIZE", + "is_threshold_supported": true, + "is_where_clause_supported": false, + "scope": "file", + "experimental": true } ] -} \ No newline at end of file +} diff --git a/pom.xml b/pom.xml index 7021831..417a331 100644 --- a/pom.xml +++ b/pom.xml @@ -15,6 +15,7 @@ 2.12.7.1 5.9.1 1.18.28 + 2.0.16 3.11.0 1.8 1.8 @@ -56,6 +57,14 @@ provided + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + provided + + org.antlr diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java new file mode 100644 index 0000000..9bf56ea --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java @@ -0,0 +1,49 @@ +/* + * DQAnalyzer.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.util.LinkedHashMap; +import java.util.Map; + +@AllArgsConstructor +@Getter +public class DQAnalyzer implements HasRuleTypeAndParameters { + private final String ruleType; + private final Map parameters; + private final Map parameterValueMap; + + public DQAnalyzer(final String ruleType, + final Map parameters) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(this.parameters); + } + + public static DQAnalyzer createFromValueMap(final String ruleType, + final LinkedHashMap parameters) { + return new DQAnalyzer(ruleType, DQRuleParameterValue.createParameterMap(parameters), parameters); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(ruleType); + + if (parameterValueMap != null) { + parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString())); + } + + return sb.toString(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java index 57e35ae..b115a0e 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java @@ -11,37 +11,93 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag; +import lombok.AccessLevel; import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import static 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap; +import static com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLVariableResolver.resolveVariablesInCondition; import static com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils.isBlank; @AllArgsConstructor @Getter @EqualsAndHashCode -public class DQRule implements Serializable { +@Builder(toBuilder = true, access = AccessLevel.PRIVATE) +public class DQRule implements Serializable, HasRuleTypeAndParameters { private final String ruleType; private final Map parameters; + private final Map parameterValueMap; private final Condition condition; private final Condition thresholdCondition; private final DQRuleLogicalOperator operator; private final List nestedRules; + private final String whereClause; + private Boolean isExcludedAtRowLevelInCompositeRules = false; + private Map tags; + + // Adding this constructor so as to not break the Data Quality ETL package. + public DQRule(final String ruleType, + final Map parameters, + final Condition condition, + final Condition thresholdCondition, + final DQRuleLogicalOperator operator, + final List nestedRules, + final String whereClause) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); + this.condition = condition; + this.thresholdCondition = thresholdCondition; + this.operator = operator; + this.nestedRules = nestedRules; + this.whereClause = whereClause; + } + + public DQRule(final String ruleType, + final Map parameters, + final Condition condition, + final Condition thresholdCondition, + final DQRuleLogicalOperator operator, + final List nestedRules) { + this.ruleType = ruleType; + this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); + this.condition = condition; + this.thresholdCondition = thresholdCondition; + this.operator = operator; + this.nestedRules = nestedRules; + 
this.whereClause = null; + } public DQRule(final String ruleType, final Map parameters, final Condition condition) { this.ruleType = ruleType; this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = null; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); + this.whereClause = null; + } + + // Can't overload the constructor above, due to type erasure + public static DQRule createFromParameterValueMap(final DQRuleType ruleType, + final LinkedHashMap parameters, + final Condition condition) { + return createFromParameterValueMap(ruleType, parameters, condition, + null, null, null); } public DQRule(final String ruleType, @@ -50,10 +106,89 @@ public DQRule(final String ruleType, final Condition thresholdCondition) { this.ruleType = ruleType; this.parameters = parameters; + this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters); this.condition = condition; this.thresholdCondition = thresholdCondition; this.operator = DQRuleLogicalOperator.AND; this.nestedRules = new ArrayList<>(); + this.whereClause = null; + } + + // Can't overload the constructor above, due to type erasure + public static DQRule createFromParameterValueMap(final DQRuleType ruleType, + final LinkedHashMap parameters, + final Condition condition, + final Condition thresholdCondition, + final String whereClause, + final Map tags) { + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + return new DQRule( + ruleType.getRuleTypeName(), + DQRuleParameterValue.createParameterMap(parameters), + parameters, + condition, + thresholdCondition, + operator, + nestedRules, + whereClause, + ruleType.isExcludedAtRowLevelInCompositeRules(), + tags + ); + } + + // Add a new method for creating with variable resolution + public static DQRule createFromParameterValueMapWithVariables(final DQRuleType 
ruleType, + final LinkedHashMap + parameters, + final Condition condition, + final Condition thresholdCondition, + final String whereClause, + final Map tags, + final Map variables) { + // Create the unresolved rule first + DQRule unresolvedRule = createFromParameterValueMap(ruleType, parameters, condition, + thresholdCondition, whereClause, tags); + + // If there are no variables to resolve, return the unresolved rule + if (variables == null || variables.isEmpty()) { + return unresolvedRule; + } + + Map usedVars = new HashMap<>(); + + // Resolve variables in conditions + Condition resolvedCondition = condition != null + ? resolveVariablesInCondition(condition, variables, usedVars) : null; + Condition resolvedThresholdCondition = thresholdCondition != null + ? resolveVariablesInCondition(thresholdCondition, variables, usedVars) : null; + + // Create the resolved rule + return new DQRule( + ruleType.getRuleTypeName(), + DQRuleParameterValue.createParameterMap(parameters), + parameters, + resolvedCondition, + resolvedThresholdCondition, + DQRuleLogicalOperator.AND, + new ArrayList<>(), + whereClause, + ruleType.isExcludedAtRowLevelInCompositeRules(), + tags + ); + } + + public DQRule withNestedRules(final List nestedRules) { + return this.toBuilder().nestedRules(nestedRules).build(); + } + + public DQRule withCondition(final Condition condition) { + return this.toBuilder().condition(condition).build(); + } + + public Map getTags() { + return convertToStringMap(tags); } @Override @@ -63,8 +198,8 @@ public String toString() { if (nestedRules == null || nestedRules.isEmpty()) { sb.append(ruleType); - if (parameters != null) { - parameters.values().forEach(p -> sb.append(" ").append("\"").append(p).append("\"")); + if (parameterValueMap != null) { + parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString())); } if (condition != null) { @@ -72,21 +207,77 @@ public String toString() { if (!isBlank(formattedCondition)) sb.append(" 
").append(condition.getFormattedCondition()); } + // where clause syntax should go before threshold + if (whereClause != null) { + if (!isBlank(whereClause)) sb.append(" where ").append("\"").append(whereClause).append("\""); + } + if (thresholdCondition != null) { String formattedCondition = thresholdCondition.getFormattedCondition(); if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition); } - return sb.toString(); + if (tags != null) { + for (Map.Entry entry : tags.entrySet()) { + sb.append(entry.getValue()); + } + } + + return sb.toString().trim(); } else { - for (int i = 0; i < nestedRules.size(); i++) { - sb.append("(").append(nestedRules.get(i).toString()).append(")"); - if (i != nestedRules.size() - 1) { - sb.append(" ").append(operator.toString()).append(" "); + boolean canBeFlattened = usesSameOperator(operator); + + if (canBeFlattened) { + List flattenedListOfRules = getNestedRulesAsFlattenedList(); + for (int i = 0; i < flattenedListOfRules.size(); i++) { + sb.append("(").append(flattenedListOfRules.get(i).toString()).append(")"); + if (i != flattenedListOfRules.size() - 1) { + sb.append(" ").append(operator.toString()).append(" "); + } + } + } else { + for (int i = 0; i < nestedRules.size(); i++) { + sb.append("(").append(nestedRules.get(i).toString()).append(")"); + if (i != nestedRules.size() - 1) { + sb.append(" ").append(operator.toString()).append(" "); + } } } } return sb.toString(); } + + /* + * This function checks if the same operator is used across all the nested rules. + * Example: (RuleA) or (RuleB) or (RuleC) / (RuleA) and (RuleB) and (RuleC) + * + * If that is the case, in order to maintain backwards compatibility, we will update + * toString() method so that we do not add additional parentheses. 
+ */ + private boolean usesSameOperator(DQRuleLogicalOperator op) { + if (nestedRules.isEmpty()) return true; + if (operator != op) return false; + + for (DQRule nestedRule : nestedRules) { + if (!nestedRule.usesSameOperator(op)) { + return false; + } + } + + return true; + } + + public List getNestedRulesAsFlattenedList() { + List ret = new ArrayList<>(); + if (nestedRules.isEmpty()) { + ret.add(this); + } else { + for (DQRule nestedRule: nestedRules) { + List nestedRet = nestedRule.getNestedRulesAsFlattenedList(); + ret.addAll(nestedRet); + } + } + return ret; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java new file mode 100644 index 0000000..c61643a --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java @@ -0,0 +1,74 @@ +/* + * DQRuleParameterValue.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public class DQRuleParameterValue implements Serializable { + private static final String EMPTY_CONNECTOR = ""; + + private final String value; + private final boolean isQuoted; + + // We could use an Optional here, instead of resorting to an empty string. + // But this needs to be serializable for Spark. + // Optional has presented problems in that regard. 
+ private final String connectorWord; + + public DQRuleParameterValue(final String value) { + this.value = value; + this.isQuoted = false; + this.connectorWord = EMPTY_CONNECTOR; + } + + public DQRuleParameterValue(final String value, final boolean isQuoted) { + this.value = value; + this.isQuoted = isQuoted; + this.connectorWord = EMPTY_CONNECTOR; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (!EMPTY_CONNECTOR.equals(connectorWord)) sb.append(connectorWord).append(" "); + String surroundBy = isQuoted ? "\"" : ""; + sb.append(surroundBy).append(value).append(surroundBy); + return sb.toString(); + } + + public static Map createParameterValueMap(Map parameters) { + Map map = new HashMap<>(); + if (parameters == null) return map; + + // Add quotes when converting from the map of string values, and do not use connector word. + // This is to maintain backwards compatibility. + boolean isQuoted = true; + parameters.forEach((k, v) -> map.put(k, new DQRuleParameterValue(v, isQuoted))); + + return map; + } + + public static Map createParameterMap(Map parameters) { + Map paramValuesAsStringsMap = new LinkedHashMap<>(); + parameters.forEach((k, v) -> paramValuesAsStringsMap.put(k, v.getValue())); + return paramValuesAsStringsMap; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java index 59f33ea..4728ff3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java @@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets; import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; @@ -33,17 +32,36 @@ public class DQRuleType { private final List parameters; private final String returnType; private final boolean isThresholdSupported; + private final boolean 
isExcludedAtRowLevelInCompositeRules; + private final boolean isWhereClauseSupported; + private final boolean isAnalyzerOnly; + private final String scope; + private final boolean isExperimental; + @SuppressWarnings("checkstyle:parameternumber") public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, @JsonProperty(value = "description") String description, @JsonProperty(value = "parameters") List parameters, @JsonProperty(value = "return_type") String returnType, - @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported) { + // boolean defaults to false if not present + @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported, + @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules") + boolean isExcludedAtRowLevelInCompositeRules, + @JsonProperty(value = "is_where_clause_supported") + boolean isWhereClauseSupported, + @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly, + @JsonProperty(value = "scope") String scope, + @JsonProperty(value = "experimental") boolean isExperimental) { this.ruleTypeName = ruleTypeName; this.description = description; this.parameters = parameters; this.returnType = returnType; this.isThresholdSupported = isThresholdSupported; + this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules; + this.isWhereClauseSupported = isWhereClauseSupported; + this.isAnalyzerOnly = isAnalyzerOnly; + this.scope = scope; + this.isExperimental = isExperimental; if (parameters.isEmpty()) { return; @@ -60,11 +78,9 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName, } public Optional verifyParameters(List expectedParameters, - List actualParameters) { + List actualParameters) { if (!expectedParameters.isEmpty()) { - - boolean isVarArg = expectedParameters.get( - expectedParameters.size() - 1).isVarArg(); + boolean isVarArg = expectedParameters.get(expectedParameters.size() - 1).isVarArg(); if (isVarArg) { if 
(expectedParameters.size() > actualParameters.size()) { @@ -82,9 +98,9 @@ public Optional verifyParameters(List expectedParameter return Optional.empty(); } - public Map createParameterMap(List dqRuleTypeParameters, - List actualParameters) { - Map parameterMap = new LinkedHashMap<>(); + public LinkedHashMap createParameterMap(List dqRuleTypeParameters, + List actualParameters) { + LinkedHashMap parameterMap = new LinkedHashMap<>(); for (int i = 0; i < dqRuleTypeParameters.size(); i++) { String dqRuleTypeParameterName = dqRuleTypeParameters.get(i).getName(); @@ -98,7 +114,7 @@ public Map createParameterMap(List dqRuleTypePa for (int j = counter; j < actualParameters.size(); j++) { String newDqRuleTypeParameterName = dqRuleTypeParameterName + (j + 1); - String actualParameterName = actualParameters.get(j); + DQRuleParameterValue actualParameterName = actualParameters.get(j); parameterMap.put(newDqRuleTypeParameterName, actualParameterName); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java index d836b1e..dec8804 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java @@ -30,14 +30,20 @@ public class DQRuleset { private final String primarySourceName; private final List additionalDataSourcesNames; private final List rules; + private final List analyzers; private static final String LINE_SEP = System.lineSeparator(); public DQRuleset(final List rules) { + this(rules, new ArrayList<>()); + } + + public DQRuleset(final List rules, final List analyzers) { this.metadata = new HashMap<>(); this.primarySourceName = null; this.additionalDataSourcesNames = new ArrayList<>(); this.rules = rules; + this.analyzers = analyzers; } @Override @@ -75,12 +81,23 @@ public String toString() { "}"; } - String rulesStr = "Rules = [" + LINE_SEP + - rules.stream() - .map(i -> " " + i) - 
.collect(Collectors.joining("," + LINE_SEP)) + - LINE_SEP + "]"; + String rulesStr = ""; + if (!rules.isEmpty()) { + rulesStr = "Rules = [" + LINE_SEP + + rules.stream() + .map(i -> " " + i) + .collect(Collectors.joining("," + LINE_SEP)) + + LINE_SEP + "]"; + } + String analyzersStr = ""; + if (!analyzers.isEmpty()) { + analyzersStr = "Analyzers = [" + LINE_SEP + + analyzers.stream() + .map(i -> " " + i) + .collect(Collectors.joining("," + LINE_SEP)) + + LINE_SEP + "]"; + } StringBuilder sb = new StringBuilder(); if (!metadataStr.isEmpty()) { @@ -91,7 +108,16 @@ public String toString() { sb.append(sourcesStr).append(LINE_SEP).append(LINE_SEP); } - sb.append(rulesStr); + if (!rulesStr.isEmpty()) { + sb.append(rulesStr); + } + + if (!analyzersStr.isEmpty()) { + if (!rulesStr.isEmpty()) { + sb.append(LINE_SEP).append(LINE_SEP); + } + sb.append(analyzersStr); + } return sb.toString(); } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java new file mode 100644 index 0000000..b9774d7 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java @@ -0,0 +1,60 @@ +/* + * DQVariable.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public class DQVariable implements Serializable { + + public enum VariableType { + NUMBER, + STRING, + DATE, + DURATION, + NUMBER_ARRAY, + STRING_ARRAY, + DATE_ARRAY, + DURATION_ARRAY + } + + private final String name; + private final VariableType type; + private final T value; + + @Override + public String toString() { + if (value instanceof List) { + return String.format("%s = %s", name, formatArray((List) value)); + } + return String.format("%s = %s", name, formatValue(value)); + } + + private String formatValue(T val) { + if (val == null) return "null"; + if (type == VariableType.STRING) return "\"" + val + "\""; + return val.toString(); + } + + private String formatArray(List list) { + return "[" + list.stream() + .map(Object::toString) + .collect(Collectors.joining(", ")) + "]"; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java new file mode 100644 index 0000000..36d6bfc --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java @@ -0,0 +1,20 @@ +/* + * HasRuleTypeAndParameters.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import java.util.Map; + +public interface HasRuleTypeAndParameters { + + String getRuleType(); + + Map getParameters(); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java index dc59445..900ba88 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java @@ -10,6 +10,8 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.OperandEvaluator; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -27,4 +29,11 @@ public Condition(final String conditionAsString) { public String getFormattedCondition() { return this.conditionAsString; } + + public String getSortedFormattedCondition() { + return this.conditionAsString; + } + public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { + throw new UnsupportedOperationException(); + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java index 0439f01..0458f98 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java @@ -42,6 +42,11 @@ public String getFormattedCondition() { operands.get(0).getFormattedExpression(), operands.get(1).getFormattedExpression() ); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedExpression(), + operands.get(1).getFormattedExpression() + ); case GREATER_THAN: return String.format("> %s", operands.get(0).getFormattedExpression()); 
case GREATER_THAN_EQUAL_TO: @@ -52,16 +57,49 @@ public String getFormattedCondition() { return String.format("<= %s", operands.get(0).getFormattedExpression()); case EQUALS: return String.format("= %s", operands.get(0).getFormattedExpression()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedExpression()); case IN: { - List formattedOperands = operands.stream() - .map(DateExpression::getFormattedExpression) - .collect(Collectors.toList()); + List formattedOperands = getFormattedOperands(); return String.format("in [%s]", String.join(",", formattedOperands)); } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); + } default: break; } return ""; } + + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + + private List getFormattedOperands() { + List formattedOperands = operands.stream() + .map(DateExpression::getFormattedExpression) + .collect(Collectors.toList()); + return formattedOperands; + } + + private List getSortedFormattedOperands() { + List formattedOperands = operands.stream() + .map(DateExpression::getFormattedExpression) + .sorted() + .collect(Collectors.toList()); + return formattedOperands; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java index 09bab98..565f771 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java +++ 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum DateBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java index 1353ffb..b14c3d1 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java @@ -11,12 +11,13 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; -import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; +import lombok.Getter; import java.io.Serializable; import java.time.LocalDateTime; +import java.time.ZoneOffset; @EqualsAndHashCode public abstract class DateExpression implements Serializable { @@ -46,7 +47,23 @@ public String getFormattedExpression() { @Override public LocalDateTime getEvaluatedExpression() { - return LocalDateTime.now(); + return LocalDateTime.now(ZoneOffset.UTC); + } + } + + @AllArgsConstructor + public static class StaticDateTime extends DateExpression { + private final LocalDateTime dateTime; + private final String dateTimeString; + + @Override + public String getFormattedExpression() { + return "\"" + dateTimeString + "\""; + } + + @Override + public LocalDateTime getEvaluatedExpression() { + return dateTime; } } @@ -55,6 +72,7 @@ public enum DateExpressionOperator { PLUS } + @Getter @AllArgsConstructor public static class CurrentDateExpression extends DateExpression { private final DateExpressionOperator operator; @@ -74,16 +92,37 @@ public String 
getFormattedExpression() { @Override public LocalDateTime getEvaluatedExpression() { - int hours = duration.getUnit().equals(DurationUnit.DAYS) - ? duration.getAmount() * 24 - : duration.getAmount(); + switch (duration.getUnit()) { + case MINUTES: + return evaluateMinutes( + operator, + duration.getAmount(), + LocalDateTime.now(ZoneOffset.UTC) + ); + case HOURS: + return evaluateMinutes( + operator, + duration.getAmount() * 60, + LocalDateTime.now(ZoneOffset.UTC).withMinute(0) + ); + case DAYS: + return evaluateMinutes( + operator, + duration.getAmount() * 60 * 24, + LocalDateTime.now(ZoneOffset.UTC).withMinute(0) + ); + default: + throw new RuntimeException("Unsupported duration unit: " + duration.getUnit()); + } + } - LocalDateTime dt = LocalDateTime.now(); + private LocalDateTime evaluateMinutes(DateExpressionOperator operator, int minutes, LocalDateTime dt) { + dt = dt.withSecond(0).withNano(0); switch (operator) { case MINUS: - return dt.minusHours(hours); + return dt.minusMinutes(minutes); case PLUS: - return dt.plusHours(hours); + return dt.plusMinutes(minutes); default: return dt; } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java new file mode 100644 index 0000000..0ce87ac --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java @@ -0,0 +1,26 @@ +/* + * NullDateExpression.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date; + +import java.time.LocalDateTime; + +public class NullDateExpression extends DateExpression { + + @Override + public String getFormattedExpression() { + return "NULL"; + } + + @Override + public LocalDateTime getEvaluatedExpression() { + return null; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java index 8629c4a..1f6f80a 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -40,6 +41,10 @@ public String getFormattedCondition() { return String.format("between %s and %s", operands.get(0).getFormattedDuration(), operands.get(1).getFormattedDuration()); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedDuration(), + operands.get(1).getFormattedDuration()); case GREATER_THAN: return String.format("> %s", operands.get(0).getFormattedDuration()); case GREATER_THAN_EQUAL_TO: @@ -50,11 +55,15 @@ public String getFormattedCondition() { return String.format("<= %s", operands.get(0).getFormattedDuration()); case EQUALS: return String.format("= %s", operands.get(0).getFormattedDuration()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedDuration()); case IN: { - List formattedOperands = operands.stream() - .map(Duration::getFormattedDuration) - .collect(Collectors.toList()); - return String.format("in [%s]", String.join(", ", 
formattedOperands)); + List formattedOperands = getFormattedOperands(); + return String.format("in [%s]", String.join(",", formattedOperands)); + } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); } default: break; @@ -62,4 +71,26 @@ public String getFormattedCondition() { return ""; } + + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + + private List getFormattedOperands() { + return operands.stream().map(Duration::getFormattedDuration).collect(Collectors.toList()); + } + + private List getSortedFormattedOperands() { + return operands.stream().map(Duration::getFormattedDuration).sorted().collect(Collectors.toList()); + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java index 099d410..966b432 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum DurationBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java index f48f209..04577f9 100644 --- 
a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration; public enum DurationUnit { + MINUTES, HOURS, DAYS } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java new file mode 100644 index 0000000..9945b3b --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java @@ -0,0 +1,30 @@ +/* + * AtomicNumberOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +/* + * Atomic number operands are decimal numbers like 1.0, 3.14 etc that can be used in number based conditions. + * They are used for defining static thresholds on rules. + */ +public class AtomicNumberOperand extends NumericOperand { + public AtomicNumberOperand(final String operand) { + super(operand); + } + + @Override + public String toString() { + if (this.isParenthesized()) { + return String.format("(%s)", getOperand()); + } else { + return getOperand(); + } + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java new file mode 100644 index 0000000..3088131 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java @@ -0,0 +1,50 @@ +/* + * BinaryExpressionOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import lombok.EqualsAndHashCode; +import lombok.Getter; + +/* + * A BinaryExpressionOperand is a numerical expression that consists of two operands and an operator. + * The operands can themselves be binary expression operands or atomic number operands or function call operands. + * The operator can be one of: +, -, /, * + * The purpose of this operand is for combining with a dynamic function call operand to create dynamic rule thresholds. + */ +@Getter +@EqualsAndHashCode(callSuper = true) +public class BinaryExpressionOperand extends NumericOperand { + private final String operator; + private final NumericOperand operand1; + private final NumericOperand operand2; + + public BinaryExpressionOperand(final String operand, + final String operator, + final NumericOperand operand1, + final NumericOperand operand2, + final boolean isParenthesized) { + super(operand, isParenthesized); + this.operator = operator; + this.operand1 = operand1; + this.operand2 = operand2; + } + + @Override + public String toString() { + String formatted = String.format("%s %s %s", + this.operand1.toString(), this.operator, this.operand2.toString()); + if (this.isParenthesized()) { + return String.format("(%s)", formatted); + } else { + return formatted; + } + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java new file mode 100644 index 0000000..1cf2183 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java @@ -0,0 +1,49 @@ +/* + * FunctionCallOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.util.List; +import java.util.stream.Collectors; + +/* + * A Function Call operand is a special operand that takes operands as parameters returns a number. + * The parameters can themselves be function call operands, or atomic number operands or binary expression operands. + * Each function must be implemented by an instance of "OperandEvaluator", provided at the time of evaluation. + * Through the use of function call operands, we introduce the concept of dynamic rules in DQDL. + */ +@Getter +@EqualsAndHashCode(callSuper = true) +public class FunctionCallOperand extends NumericOperand { + private final String functionName; + private final List operands; + + public FunctionCallOperand(final String operand, + final String functionName, + final List operands) { + super(operand); + this.functionName = functionName; + this.operands = operands; + } + + @Override + public String toString() { + String params = this.operands.stream().map(NumericOperand::toString).collect(Collectors.joining(",")); + String formatted = String.format("%s(%s)", this.functionName, params); + if (this.isParenthesized()) { + return String.format("(%s)", formatted); + } else { + return formatted; + } + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java new file mode 100644 index 0000000..edad45d --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java @@ -0,0 +1,23 @@ +/* + * NullNumericOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +public class NullNumericOperand extends NumericOperand { + + public NullNumericOperand(final String operand) { + super(operand.toUpperCase()); + } + + @Override + public String toString() { + return getOperand(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java index 7ec0ad9..6bd5f58 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java @@ -10,55 +10,118 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; +import static java.lang.Math.abs; import lombok.EqualsAndHashCode; import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import java.text.DecimalFormat; import java.util.List; import java.util.stream.Collectors; @Getter @EqualsAndHashCode(callSuper = true) +@Slf4j public class NumberBasedCondition extends Condition { private final NumberBasedConditionOperator operator; - private final List operands; + private final List operands; + + private static final DecimalFormat OP_FORMAT = new DecimalFormat("#.###"); public NumberBasedCondition(final String conditionAsString, final NumberBasedConditionOperator operator, - final List operands) { + final List operands) { super(conditionAsString); this.operator = operator; this.operands = operands; } - public Boolean evaluate(Double metric) { + @Override + public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) { if (operands == null) return false; - List operandsAsDouble = 
operands.stream().map(Double::parseDouble).collect(Collectors.toList()); + List operandsAsDouble = operands.stream() + .map(operand -> evaluator.evaluate(dqRule, operand)).collect(Collectors.toList()); + + + log.info(String.format("Evaluating condition for rule: %s", dqRule)); + List formatOps = operandsAsDouble.stream().map(OP_FORMAT::format).collect(Collectors.toList()); + String formatMetric = OP_FORMAT.format(metric); switch (operator) { case BETWEEN: if (operands.size() != 2) return false; - else return metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1); + else { + boolean result = metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1); + log.info("{} between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result); + return result; + } + case NOT_BETWEEN: + if (operands.size() != 2) return false; + else { + boolean result = metric <= operandsAsDouble.get(0) || metric >= operandsAsDouble.get(1); + log.info("{} not between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result); + return result; + } case GREATER_THAN_EQUAL_TO: if (operands.size() != 1) return false; - else return metric >= operandsAsDouble.get(0); + else { + boolean result = metric >= operandsAsDouble.get(0); + log.info("{} >= {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case GREATER_THAN: if (operands.size() != 1) return false; - else return metric > operandsAsDouble.get(0); + else { + boolean result = metric > operandsAsDouble.get(0); + log.info("{} > {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case LESS_THAN_EQUAL_TO: if (operands.size() != 1) return false; - else return metric <= operandsAsDouble.get(0); + else { + boolean result = metric <= operandsAsDouble.get(0); + log.info("{} <= {}? 
{}", formatMetric, formatOps.get(0), result); + return result; + } case LESS_THAN: if (operands.size() != 1) return false; - else return metric < operandsAsDouble.get(0); + else { + boolean result = metric < operandsAsDouble.get(0); + log.info("{} < {}? {}", formatMetric, formatOps.get(0), result); + return result; + } case EQUALS: if (operands.size() != 1) return false; - else return metric.equals(operandsAsDouble.get(0)); - case IN: - return operandsAsDouble.contains(metric); + else { + boolean result = isOperandEqualToMetric(metric, operandsAsDouble.get(0)); + log.info("{} == {}? {}", formatMetric, formatOps.get(0), result); + return result; + } + case NOT_EQUALS: + if (operands.size() != 1) return false; + else { + boolean result = !isOperandEqualToMetric(metric, operandsAsDouble.get(0)); + log.info("{} != {}? {}", formatMetric, formatOps.get(0), result); + return result; + } + case IN: { + boolean result = operandsAsDouble.stream().anyMatch(operand -> + isOperandEqualToMetric(metric, operand)); + log.info("{} in {}? {}", formatMetric, formatOps, result); + return result; + } + case NOT_IN: { + boolean result = !operandsAsDouble.stream().anyMatch(operand -> + isOperandEqualToMetric(metric, operand)); + log.info("{} not in {}? 
{}", formatMetric, formatOps, result); + return result; + } default: + log.error("Unknown operator"); return false; } } @@ -69,23 +132,70 @@ public String getFormattedCondition() { switch (operator) { case BETWEEN: - return String.format("between %s and %s", operands.get(0), operands.get(1)); + return String.format("between %s and %s", operands.get(0).toString(), operands.get(1).toString()); + case NOT_BETWEEN: + return String.format("not between %s and %s", operands.get(0).toString(), operands.get(1).toString()); case GREATER_THAN: - return String.format("> %s", operands.get(0)); + return String.format("> %s", operands.get(0).toString()); case GREATER_THAN_EQUAL_TO: - return String.format(">= %s", operands.get(0)); + return String.format(">= %s", operands.get(0).toString()); case LESS_THAN: - return String.format("< %s", operands.get(0)); + return String.format("< %s", operands.get(0).toString()); case LESS_THAN_EQUAL_TO: - return String.format("<= %s", operands.get(0)); + return String.format("<= %s", operands.get(0).toString()); case EQUALS: - return String.format("= %s", operands.get(0)); + return String.format("= %s", operands.get(0).toString()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).toString()); case IN: - return String.format("in [%s]", String.join(",", operands)); + return String.format("in [%s]", getFormattedOperands()); + case NOT_IN: + return String.format("not in [%s]", getFormattedOperands()); default: break; } return ""; } + + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", getSortedFormattedOperands()); + case NOT_IN: + return String.format("not in [%s]", getSortedFormattedOperands()); + default: + return getFormattedCondition(); + } + } + + private String getFormattedOperands() { + return operands.stream() + .map(NumericOperand::toString) + .collect(Collectors.joining(",")); + } + + 
private String getSortedFormattedOperands() { + return operands.stream() + .map(NumericOperand::toString) + .sorted((s1, s2) -> { + if (s1.equalsIgnoreCase("NULL") && s2.equalsIgnoreCase("NULL")) { + return 0; // Treat both NULLs as equal + } else if (s1.equalsIgnoreCase("NULL")) { + return 1; // Treat NULL as greater than any other value + } else if (s2.equalsIgnoreCase("NULL")) { + return -1; // Treat NULL as greater than any other value + } else { + return Double.compare(Double.parseDouble(s1), Double.parseDouble(s2)); + } + }) + .collect(Collectors.joining(",")); + } + + protected boolean isOperandEqualToMetric(Double metric, Double operand) { + return abs(metric - operand) <= 0.00001; + } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java index 828e5f9..cd109d3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java @@ -12,10 +12,13 @@ public enum NumberBasedConditionOperator { BETWEEN, + NOT_BETWEEN, GREATER_THAN, GREATER_THAN_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, EQUALS, - IN + NOT_EQUALS, + IN, + NOT_IN } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java new file mode 100644 index 0000000..9623996 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java @@ -0,0 +1,30 @@ +/* + * NumericOperand.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@AllArgsConstructor +@Getter +@EqualsAndHashCode +public abstract class NumericOperand implements Serializable { + private final String operand; + private final boolean isParenthesized; + + public NumericOperand(final String operand) { + this.operand = operand; + isParenthesized = false; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java new file mode 100644 index 0000000..be2449a --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java @@ -0,0 +1,24 @@ +/* + * OperandEvaluator.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; + +import java.io.Serializable; + +/** + * Class encapsulates implementation logic for resolving NumericOperand to a number (double). + */ +public abstract class OperandEvaluator implements Serializable { + + // resolve operand to number + public abstract Double evaluate(DQRule rule, NumericOperand operand); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java new file mode 100644 index 0000000..7b6e65b --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java @@ -0,0 +1,54 @@ +/* + * Size.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@Getter +@EqualsAndHashCode +public class Size implements Serializable, Comparable { + private final Integer amount; + private final SizeUnit unit; + private final Long bytes; + + public Size(final Integer amount, final SizeUnit unit) { + this.amount = amount; + this.unit = unit; + this.bytes = convertBytes(amount, unit); + } + + public String getFormattedSize() { + return String.format("%s %s", amount, unit.name().toUpperCase()); + } + + private Long convertBytes(Integer bytes, SizeUnit unit) { + switch (unit) { + case KB: + return bytes * 1024L; + case MB: + return bytes * 1024L * 1024L; + case GB: + return bytes * 1024L * 1024L * 1024L; + case TB: + return bytes * 1024L * 1024L * 1024L * 1024L; + default: + return Long.valueOf(bytes); + } + } + + @Override + public int compareTo(Size other) { + return Long.compare(this.getBytes(), other.getBytes()); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java new file mode 100644 index 0000000..7a2a34e --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java @@ -0,0 +1,96 @@ +/* + * SizeBasedCondition.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.util.List; +import java.util.stream.Collectors; + +@Getter +@EqualsAndHashCode(callSuper = true) +public class SizeBasedCondition extends Condition { + private final SizeBasedConditionOperator operator; + private final List operands; + + public SizeBasedCondition(final String conditionAsString, + final SizeBasedConditionOperator operator, + final List operands) { + super(conditionAsString); + this.operator = operator; + this.operands = operands; + } + + @Override + public String getFormattedCondition() { + if (this.operands.isEmpty()) return ""; + + switch (operator) { + case BETWEEN: + return String.format("between %s and %s", + operands.get(0).getFormattedSize(), + operands.get(1).getFormattedSize()); + case NOT_BETWEEN: + return String.format("not between %s and %s", + operands.get(0).getFormattedSize(), + operands.get(1).getFormattedSize()); + case GREATER_THAN: + return String.format("> %s", operands.get(0).getFormattedSize()); + case GREATER_THAN_EQUAL_TO: + return String.format(">= %s", operands.get(0).getFormattedSize()); + case LESS_THAN: + return String.format("< %s", operands.get(0).getFormattedSize()); + case LESS_THAN_EQUAL_TO: + return String.format("<= %s", operands.get(0).getFormattedSize()); + case EQUALS: + return String.format("= %s", operands.get(0).getFormattedSize()); + case NOT_EQUALS: + return String.format("!= %s", operands.get(0).getFormattedSize()); + case IN: { + List formattedOperands = getFormattedOperands(); + return String.format("in [%s]", String.join(",", formattedOperands)); + } + case NOT_IN: { + List formattedOperands = getFormattedOperands(); + return String.format("not in [%s]", String.join(",", formattedOperands)); + } + default: + break; + 
} + + return ""; + } + + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return String.format("in [%s]", String.join(",", getSortedFormattedOperands())); + case NOT_IN: + return String.format("not in [%s]", String.join(",", getSortedFormattedOperands())); + default: + return getFormattedCondition(); + } + } + + private List getFormattedOperands() { + return operands.stream().map(Size::getFormattedSize).collect(Collectors.toList()); + } + + private List getSortedFormattedOperands() { + return operands.stream().map(Size::getFormattedSize).sorted().collect(Collectors.toList()); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java new file mode 100644 index 0000000..d4c9b72 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java @@ -0,0 +1,24 @@ +/* + * SizeBasedConditionOperator.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +public enum SizeBasedConditionOperator { + BETWEEN, + NOT_BETWEEN, + GREATER_THAN, + GREATER_THAN_EQUAL_TO, + LESS_THAN, + LESS_THAN_EQUAL_TO, + EQUALS, + NOT_EQUALS, + IN, + NOT_IN +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java new file mode 100644 index 0000000..076b657 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java @@ -0,0 +1,19 @@ +/* + * SizeUnit.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size; + +public enum SizeUnit { + B, + KB, + MB, + GB, + TB +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java new file mode 100644 index 0000000..1969c8e --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java @@ -0,0 +1,17 @@ +/* + * Keyword.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +public enum Keyword { + NULL, + EMPTY, + WHITESPACES_ONLY +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java new file mode 100644 index 0000000..23431ba --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java @@ -0,0 +1,28 @@ +/* + * KeywordStringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode(callSuper = true) +public class KeywordStringOperand extends StringOperand { + final Keyword operand; + + public KeywordStringOperand(final Keyword operand) { + super(operand.toString()); + this.operand = operand; + } + + @Override + public String formatOperand() { + return getOperand().toString(); + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java new file mode 100644 index 0000000..dcf74f3 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java @@ -0,0 +1,22 @@ +/* + * QuotedStringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +public class QuotedStringOperand extends StringOperand { + public QuotedStringOperand(final String operand) { + super(operand); + } + + @Override + public String formatOperand() { + return "\"" + getOperand() + "\""; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java index 2bc54d0..b76c91c 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java @@ -11,6 +11,7 @@ package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -22,31 +23,45 @@ @EqualsAndHashCode(callSuper = true) public class StringBasedCondition extends Condition { private final StringBasedConditionOperator operator; - private final List operands; + private final List operands; + private final List unresolvedOperands; public StringBasedCondition(final String conditionAsString, final StringBasedConditionOperator operator, - final List operands) { + final List operands) { + this(conditionAsString, operator, operands, null); + } + + public StringBasedCondition(final String conditionAsString, + final StringBasedConditionOperator operator, + final List operands, + final List unresolvedOperands) { super(conditionAsString); this.operator = operator; this.operands = operands; + this.unresolvedOperands = unresolvedOperands; } + @Override public String getFormattedCondition() { if (StringUtils.isBlank(conditionAsString)) return ""; + List 
effectiveOperands = getEffectiveOperands(); + switch (operator) { case MATCHES: - return String.format("matches %s", formatOperand(operands.get(0))); + return String.format("matches %s", effectiveOperands.get(0).formatOperand()); + case NOT_MATCHES: + return String.format("not matches %s", effectiveOperands.get(0).formatOperand()); case EQUALS: - return String.format("= %s", formatOperand(operands.get(0))); - case IN: { - List formattedOperands = operands.stream() - .map(this::formatOperand) - .collect(Collectors.toList()); - return String.format("in [%s]", String.join(",", formattedOperands)); - } + return String.format("= %s", effectiveOperands.get(0).formatOperand()); + case NOT_EQUALS: + return String.format("!= %s", effectiveOperands.get(0).formatOperand()); + case IN: + return formatInCondition(false, false); + case NOT_IN: + return formatInCondition(true, false); default: break; } @@ -54,7 +69,47 @@ public String getFormattedCondition() { return ""; } - private String formatOperand(String operand) { - return "\"" + operand + "\""; + @Override + public String getSortedFormattedCondition() { + if (StringUtils.isBlank(conditionAsString)) return ""; + + switch (operator) { + case IN: + return formatInCondition(false, true); + case NOT_IN: + return formatInCondition(true, true); + default: + return getFormattedCondition(); + } + } + + private String formatInCondition(boolean isNot, boolean sorted) { + List effectiveOperands = getEffectiveOperands(); + List formattedOperands = sorted + ? getSortedFormattedOperands(effectiveOperands) : getFormattedOperands(effectiveOperands); + String operandStr; + if (formattedOperands.size() == 1 && effectiveOperands.get(0) instanceof VariableReferenceOperand) { + operandStr = formattedOperands.get(0); + } else { + operandStr = "[" + String.join(",", formattedOperands) + "]"; + } + return String.format("%sin %s", isNot ? 
"not " : "", operandStr); + } + + private List getFormattedOperands(List operands) { + return operands.stream() + .map(StringOperand::formatOperand) + .collect(Collectors.toList()); + } + + private List getSortedFormattedOperands(List operands) { + return operands.stream() + .map(StringOperand::formatOperand) + .sorted() + .collect(Collectors.toList()); + } + + private List getEffectiveOperands() { + return unresolvedOperands != null ? unresolvedOperands : operands; } } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java index f3bd814..afed9f0 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java @@ -12,6 +12,9 @@ public enum StringBasedConditionOperator { EQUALS, + NOT_EQUALS, IN, - MATCHES + NOT_IN, + MATCHES, + NOT_MATCHES } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java new file mode 100644 index 0000000..e121197 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java @@ -0,0 +1,28 @@ +/* + * StringOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.EqualsAndHashCode; +import lombok.Getter; + +import java.io.Serializable; + +@EqualsAndHashCode +@Getter +public abstract class StringOperand implements Serializable { + private final String operand; + + public StringOperand(final String operand) { + this.operand = operand; + } + + public abstract String formatOperand(); +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java new file mode 100644 index 0000000..5b03058 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java @@ -0,0 +1,46 @@ +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import lombok.AllArgsConstructor; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Map; +import java.util.stream.Collectors; + +@AllArgsConstructor +public class Tag implements Serializable { + private final String key; + private final String value; + + public static Map convertToStringMap(Map tags) { + if (tags == null) { + return Collections.emptyMap(); + } + return tags.entrySet().stream() + .collect(Collectors.toMap( + entry -> entry.getValue().getKey(), + entry -> entry.getValue().getValue() + )); + } + + public String getKey() { + return removeQuotes(this.key); + } + + public String getValue() { + return removeQuotes(this.value); + } + + @Override + public String toString() { + return String.format(" with %s = %s", key, value); + } + + private String removeQuotes(String quotedString) { + if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { + quotedString = quotedString.substring(1); + quotedString = quotedString.substring(0, quotedString.length() - 1); + } + return quotedString; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java 
b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java new file mode 100644 index 0000000..0bc31e8 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java @@ -0,0 +1,31 @@ +/* + * VariableReferenceOperand.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. + */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@Getter +@EqualsAndHashCode(callSuper = true) +public class VariableReferenceOperand extends StringOperand { + private final String variableName; + + public VariableReferenceOperand(String variableName) { + super(variableName); + this.variableName = variableName; + } + + @Override + public String formatOperand() { + return "$" + variableName; + } +} diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java index d84bf8f..7ac2dc3 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java @@ -16,6 +16,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; +import lombok.extern.slf4j.Slf4j; import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.CharStreams; import org.antlr.v4.runtime.CommonTokenStream; @@ -24,10 +25,12 @@ import java.util.List; +@Slf4j public class DQDLParser { private static final String PARSING_ERROR_MESSAGE_PREFIX = "Parsing Error"; public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException { + CharStream input = CharStreams.fromString(dqdl); 
DQDLErrorListener errorListener = new DQDLErrorListener(); @@ -41,15 +44,18 @@ public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException { parser.addErrorListener(errorListener); DQDLParserListener listener = new DQDLParserListener(errorListener); - ParseTreeWalker.DEFAULT.walk(listener, parser.document()); - + try { + ParseTreeWalker.DEFAULT.walk(listener, parser.document()); + } catch (StringIndexOutOfBoundsException e) { + log.error(e.getMessage(), e); + throw new InvalidDataQualityRulesetException("Invalid DQDL."); + } Either, DQRuleset> dqRulesetEither = listener.getParsedRuleset(); - if (dqRulesetEither.isLeft()) { throw new InvalidDataQualityRulesetException(generateExceptionMessage(dqRulesetEither.getLeft())); - } else { - return dqRulesetEither.getRight(); } + return dqRulesetEither.getRight(); + } private String generateExceptionMessage(List errorMessages) { diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java index 657df11..3fa355d 100644 --- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java @@ -10,38 +10,71 @@ package com.amazonaws.glue.ml.dataquality.dqdl.parser; +import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; +import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQAnalyzer; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleParameterValue; import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable; import 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.NullDateExpression; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedConditionOperator; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.AtomicNumberOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.BinaryExpressionOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.FunctionCallOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NullNumericOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedConditionOperator; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeUnit; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; +import 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; -import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; import com.amazonaws.glue.ml.dataquality.dqdl.util.Either; -import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener; -import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser; -import org.antlr.v4.runtime.RuleContext; - +import org.antlr.v4.runtime.ParserRuleContext; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.zone.ZoneRulesException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap; + public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListener { private final DQDLErrorListener errorListener; private final List errorMessages = new ArrayList<>(); @@ -50,6 +83,8 @@ public 
class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private String primarySource; private List additionalSources; private final List dqRules = new ArrayList<>(); + private final List dqAnalyzers = new ArrayList<>(); + private final Map dqVariables = new HashMap<>(); private static final String METADATA_VERSION_KEY = "Version"; private static final Set ALLOWED_METADATA_KEYS; @@ -57,6 +92,12 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene private static final String PRIMARY_SOURCE_KEY = "Primary"; private static final String ADDITIONAL_SOURCES_KEY = "AdditionalDataSources"; private static final Set ALLOWED_SOURCES_KEYS; + private static final String THRESHOLD_KEY = "threshold"; + + private static final String MILITARY_TIME_FORMAT = "HH:mm"; + private static final String AMPM_TIME_FORMAT = "h:mm a"; + + private static final int COMPOSITE_RULE_MAX_NESTING_DEPTH = 5; static { ALLOWED_METADATA_KEYS = new HashSet<>(); @@ -72,8 +113,13 @@ public DQDLParserListener(DQDLErrorListener errorListener) { } public Either, DQRuleset> getParsedRuleset() { + // Only add this error message if we did not walk the tree due to empty rules or analyzers sections. 
+ if (errorMessages.isEmpty() && dqRules.isEmpty() && dqAnalyzers.isEmpty()) { + errorMessages.add("No rules or analyzers provided."); + } + if (errorMessages.isEmpty() && errorListener.getErrorMessages().isEmpty()) { - return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules)); + return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqAnalyzers)); } else { List allErrorMessages = new ArrayList<>(); allErrorMessages.addAll(errorMessages); @@ -85,8 +131,18 @@ public Either, DQRuleset> getParsedRuleset() { @Override public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ctx) { - for (DataQualityDefinitionLanguageParser.PairContext pairContext - : ctx.dictionary().pair()) { + // The logic below, just above the loop is a guard against an NPE caused by empty dictionaries. + // Need to investigate why dictionaryContext.pair() returns 1 element, + // which is an empty string, for an empty dictionary. + // We would not have this problem if dictionaryContext.pair() returned 0 entries in the list. 
+ DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary(); + List dictionaryErrors = validateDictionary(dictionaryContext); + if (!dictionaryErrors.isEmpty()) { + errorMessages.addAll(dictionaryErrors); + return; + } + + for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) { String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText())); if (!ALLOWED_METADATA_KEYS.contains(key)) { errorMessages.add("Unsupported key provided in Metadata section"); @@ -99,16 +155,15 @@ public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ct } @Override - public void enterRules(DataQualityDefinitionLanguageParser.RulesContext ctx) { - if (ctx.dqRules() == null) { - errorMessages.add("No rules provided."); + public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) { + DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary(); + List dictionaryErrors = validateDictionary(dictionaryContext); + if (!dictionaryErrors.isEmpty()) { + errorMessages.addAll(dictionaryErrors); + return; } - } - @Override - public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) { - for (DataQualityDefinitionLanguageParser.PairContext pairContext - : ctx.dictionary().pair()) { + for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) { String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText())); if (!ALLOWED_SOURCES_KEYS.contains(key)) { @@ -144,44 +199,121 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu return; } - for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc - : dqRulesContext.topLevelRule()) { - if (tlc.AND().size() > 0 || tlc.OR().size() > 0) { - DQRuleLogicalOperator op = tlc.AND().size() > 0 ? 
DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR; - List nestedRules = new ArrayList<>(); + for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc: dqRulesContext.topLevelRule()) { + Either dqRuleEither = parseTopLevelRule(tlc, 0); + if (dqRuleEither.isLeft()) { + errorMessages.add(dqRuleEither.getLeft()); + return; + } else { + dqRules.add(dqRuleEither.getRight()); + } + } + } - for (DataQualityDefinitionLanguageParser.DqRuleContext rc : tlc.dqRule()) { - Either dqRuleEither = getDQRule(rc); - if (dqRuleEither.isLeft()) { - errorMessages.add(dqRuleEither.getLeft()); - return; - } else { - nestedRules.add(dqRuleEither.getRight()); - } - } + private Either parseTopLevelRule(DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc, + int depth) { + if (tlc.LPAREN() != null && tlc.RPAREN() != null) { + return parseTopLevelRule(tlc.topLevelRule(0), depth); + } else if (tlc.AND() != null || tlc.OR() != null) { + DQRuleLogicalOperator op = tlc.AND() != null ? DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR; + List> nestedRuleEitherList = + tlc.topLevelRule().stream().map(r -> parseTopLevelRule(r, depth + 1)).collect(Collectors.toList()); - dqRules.add(new DQRule("Composite", null, null, null, op, nestedRules)); - } else if (tlc.dqRule(0) != null) { - Either dqRuleEither = getDQRule(tlc.dqRule(0)); - if (dqRuleEither.isLeft()) { - errorMessages.add(dqRuleEither.getLeft()); - return; + List allErrorMessages = new ArrayList<>(); + List allRules = new ArrayList<>(); + + nestedRuleEitherList.forEach(arg -> { + if (arg.isLeft()) { + allErrorMessages.add(arg.getLeft()); } else { - dqRules.add(dqRuleEither.getRight()); + allRules.add(arg.getRight()); } + }); + + if (allErrorMessages.isEmpty()) { + return Either.fromRight( + new DQRule("Composite", null, null, null, op, allRules) + ); } else { - errorMessages.add("No valid rule found"); + return Either.fromLeft(allErrorMessages.get(0)); + } + } else if (tlc.dqRule() != null) { + if (depth > 
COMPOSITE_RULE_MAX_NESTING_DEPTH) { + return Either.fromLeft( + String.format("Maximum nested expression depth of %s reached for composite rule", + COMPOSITE_RULE_MAX_NESTING_DEPTH)); + } else { + return getDQRule(tlc.dqRule()); + } + } else { + return Either.fromLeft("No valid rule found"); + } + } + + @Override + public void enterDqAnalyzers(DataQualityDefinitionLanguageParser.DqAnalyzersContext dqAnalyzersContext) { + if (!errorMessages.isEmpty()) { + return; + } + + for (DataQualityDefinitionLanguageParser.DqAnalyzerContext dac: dqAnalyzersContext.dqAnalyzer()) { + Either dqAnalyzerEither = getDQAnalyzer(dac); + if (dqAnalyzerEither.isLeft()) { + errorMessages.add(dqAnalyzerEither.getLeft()); return; + } else { + dqAnalyzers.add(dqAnalyzerEither.getRight()); } } } + @Override + public void enterVariableDeclaration(DataQualityDefinitionLanguageParser.VariableDeclarationContext ctx) { + if (!errorMessages.isEmpty()) { + return; + } + + String variableName = ctx.IDENTIFIER().getText(); + + if (variableName.startsWith(".") || variableName.startsWith("_")) { + errorMessages.add(String.format("Variable name '%s' cannot start with '.' 
or '_'", variableName)); + return; + } + + if (dqVariables.containsKey(variableName)) { + errorMessages.add("Variable '" + variableName + "' is already defined"); + return; + } + + DQVariable variable = null; + DataQualityDefinitionLanguageParser.ExpressionContext expr = ctx.expression(); + if (expr == null) { + errorMessages.add(String.format("Missing value for variable '%s'", variableName)); + return; + } + + if (expr.stringValuesArray() != null) { + List values = expr.stringValuesArray().stringValues().stream() + .map(this::processStringValues) + .collect(Collectors.toList()); + variable = new DQVariable(variableName, DQVariable.VariableType.STRING_ARRAY, values); + } else if (expr.stringValues() != null) { + String value = processStringValues(expr.stringValues()); + variable = new DQVariable(variableName, DQVariable.VariableType.STRING, value); + } + + if (variable != null) { + dqVariables.put(variableName, variable); + } else { + errorMessages.add(String.format("Failed to parse variable '%s'", variableName)); + } + } + private Either getDQRule( DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { String ruleType = dqRuleContext.ruleType().getText(); - List parameters = dqRuleContext.parameter().stream() - .map(p -> p.getText().replaceAll("\"", "")) - .collect(Collectors.toList()); + + List parameters = parseParameters(dqRuleContext.parameterWithConnectorWord()); Optional optionalDQRuleType = DQRuleType.getRuleType(ruleType, parameters.size()); @@ -191,19 +323,79 @@ private Either getDQRule( DQRuleType dqRuleType = optionalDQRuleType.get(); + if (dqRuleType.isAnalyzerOnly()) { + return Either.fromLeft(String.format("Analyzer Type: %s is not supported in rules section", ruleType)); + } + Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); if (errorMessage.isPresent()) { return Either.fromLeft(String.format(errorMessage.get() + ": %s", ruleType)); } - Map parameterMap = 
dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + LinkedHashMap parameterMap = + dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + + String whereClause = null; + if (dqRuleContext.whereClause() != null) { + if (dqRuleType.isWhereClauseSupported()) { + DataQualityDefinitionLanguageParser.WhereClauseContext ctx = dqRuleContext.whereClause(); + if (ctx.quotedString().getText().isEmpty() || ctx.quotedString().getText().equals("\"\"")) { + return Either.fromLeft( + String.format("Empty where condition provided for rule type: %s", ruleType)); + } else { + whereClause = removeQuotes(ctx.quotedString().getText()); + } + } else { + return Either.fromLeft(String.format("Where clause is not supported for rule type: %s", ruleType)); + } + } + + Condition thresholdCondition = null; + Map tags = new HashMap<>(); + List tagContexts = + (dqRuleContext.tagWithCondition() == null) ? new ArrayList<>() : dqRuleContext.tagWithCondition(); + for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) { + if (tagContext.stringBasedCondition() != null) { + //process plain string tag + final Either outcome = processStringTag(tagContext); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + final Tag tag = outcome.getRight(); + tags.put(tag.getKey(), tag); + } + } else if (tagContext.numberBasedCondition() != null) { + final String tagName = tagContext.tagValues().getText(); + if (tagName.equalsIgnoreCase(THRESHOLD_KEY)) { + //process threshold tag + final Either outcome = + processThresholdTag(dqRuleType, thresholdCondition, tagContext, ruleType); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + thresholdCondition = outcome.getRight(); + } + } else { + //convert number tag into string tag + final Either outcome = processNumberTag(tagContext, tagName); + if (outcome.isLeft()) { + return Either.fromLeft(outcome.getLeft()); + } else { + final Tag tag = 
outcome.getRight(); + tags.put(tag.getKey(), tag); + } + } + } else { + return Either.fromLeft(String.format("Invalid tag provided for rule type: %s", ruleType)); + } + } Condition condition; List> conditions = Arrays.stream(dqRuleType.getReturnType().split("\\|")) - .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext)) - .collect(Collectors.toList()); + .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext, convertToStringMap(tags))) + .collect(Collectors.toList()); Optional> optionalCondition = conditions.stream().filter(Either::isRight).findFirst(); if (optionalCondition.isPresent()) { @@ -214,50 +406,132 @@ private Either getDQRule( } } else { Optional> optionalFailedCondition = - conditions.stream().filter(Either::isLeft).findFirst(); + conditions.stream().filter(Either::isLeft).findFirst(); if (optionalFailedCondition.isPresent()) { return Either.fromLeft(optionalFailedCondition.get().getLeft()); } else { return Either.fromLeft( - String.format("Error while parsing condition for rule with rule type: %s", ruleType)); + String.format("Error while parsing condition for rule with rule type: %s", ruleType)); } } - Condition thresholdCondition = null; - if (dqRuleContext.withThresholdCondition() != null) { - if (dqRuleType.isThresholdSupported()) { - DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = - dqRuleContext.withThresholdCondition().numberBasedCondition(); - - if (ctx == null) { - return Either.fromLeft( - String.format("Empty threshold condition provided for rule type: %s", ruleType)); - } else { - Optional possibleCond = - parseNumberBasedCondition(dqRuleContext.withThresholdCondition().numberBasedCondition()); - if (possibleCond.isPresent()) { - thresholdCondition = possibleCond.get(); - } else { - return Either.fromLeft( - String.format("Unable to parse threshold condition provided for rule type: %s", ruleType)); - } - } + return Either.fromRight( + DQRule.createFromParameterValueMapWithVariables( + dqRuleType, parameterMap, 
condition, thresholdCondition, whereClause, tags, dqVariables) + ); + } - } else { - return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); + private Either processThresholdTag(DQRuleType dqRuleType, + Condition thresholdCondition, + DataQualityDefinitionLanguageParser + .TagWithConditionContext tagContext, + String ruleType) { + if (dqRuleType.isThresholdSupported()) { + if (thresholdCondition != null) { + return Either.fromLeft("Only one threshold condition at a time is supported."); } + return processThresholdTag(tagContext, ruleType); + } else { + return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType)); } + } - return Either.fromRight( - new DQRule(dqRuleType.getRuleTypeName(), parameterMap, condition, - thresholdCondition, DQRuleLogicalOperator.AND, new ArrayList<>()) - ); + private Either processNumberTag(DataQualityDefinitionLanguageParser + .TagWithConditionContext tagContext, + String tagName) { + if (!isTagValid(tagContext.numberBasedCondition())) { + return Either.fromLeft("Number tags only support the equality operator."); + } + final List numberContexts = + tagContext.numberBasedCondition().number(); + if (numberContexts != null && !numberContexts.isEmpty()) { + final String tagValue = numberContexts.get(0).getText(); + return Either.fromRight(new Tag(tagName, tagValue)); + } else { + return Either.fromLeft(String.format("Error Parsing Tag %s", tagName)); + } + } + + private Either processStringTag( + DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext) { + if (!isTagValid(tagContext.stringBasedCondition())) { + return Either.fromLeft("String tags only support the equality operator."); + } + String tagKey = tagContext.tagValues().getText(); + Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition()); + if (valueCondition.isPresent()) { + StringBasedCondition stringCondition = (StringBasedCondition) 
valueCondition.get(); + String tagValue = stringCondition.getOperands().get(0).formatOperand(); + return Either.fromRight(new Tag(tagKey, tagValue)); + } else { + return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey)); + } + } + + private Either processThresholdTag( + DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext, String ruleType) { + DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx = + tagContext.numberBasedCondition(); + Optional possibleCond = parseNumberBasedCondition(ctx); + if (possibleCond.isPresent()) { + return Either.fromRight(possibleCond.get()); + } else { + return Either.fromLeft(String.format( + "Unable to parse threshold condition provided for rule type: %s", ruleType)); + } + } + + private boolean isTagValid(ParserRuleContext ctx) { + if (ctx instanceof DataQualityDefinitionLanguageParser.StringBasedConditionContext) { + final DataQualityDefinitionLanguageParser.StringBasedConditionContext stringCtx = + (DataQualityDefinitionLanguageParser.StringBasedConditionContext) ctx; + return stringCtx.EQUAL_TO() != null && stringCtx.NEGATION() == null; + } else if (ctx instanceof DataQualityDefinitionLanguageParser.NumberBasedConditionContext) { + final DataQualityDefinitionLanguageParser.NumberBasedConditionContext numberCtx = + (DataQualityDefinitionLanguageParser.NumberBasedConditionContext) ctx; + return numberCtx.EQUAL_TO() != null && numberCtx.NEGATION() == null; + } else { + return false; + } + } + + private Either getDQAnalyzer( + DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) { + String analyzerType = dqAnalyzerContext.analyzerType().getText(); + + List parameters = parseParameters(dqAnalyzerContext.parameterWithConnectorWord()); + + // We just use the DQ Rule names to validate what analyzer names to allow. + // This might change closer to re:Invent, but keeping it simple for now. 
+ Optional optionalDQAnalyzerType = DQRuleType.getRuleType(analyzerType, parameters.size()); + + if (!optionalDQAnalyzerType.isPresent()) { + return Either.fromLeft(String.format("Analyzer Type: %s is not valid", analyzerType)); + } + + DQRuleType dqRuleType = optionalDQAnalyzerType.get(); + + if (dqRuleType.getReturnType().equals("BOOLEAN")) { + return Either.fromLeft(String.format("Analyzer Type: %s is not supported", analyzerType)); + } + + Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters); + + if (errorMessage.isPresent()) { + return Either.fromLeft(String.format(errorMessage.get() + ": %s", analyzerType)); + } + + LinkedHashMap parameterMap = + dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters); + + return Either.fromRight(DQAnalyzer.createFromValueMap(analyzerType, parameterMap)); } private Either parseCondition( DQRuleType ruleType, String returnType, - DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) { + DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext, + Map tags) { Either response = Either.fromLeft(String.format("Error parsing condition for return type: %s", returnType)); @@ -312,7 +586,7 @@ private Either parseCondition( ruleType.getRuleTypeName())); } else { Optional possibleCond = - parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition()); + parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition(), tags); if (possibleCond.isPresent()) { response = Either.fromRight(possibleCond.get()); @@ -336,6 +610,31 @@ private Either parseCondition( } break; } + case "SIZE": + case "SIZE_ARRAY": { + DataQualityDefinitionLanguageParser.ConditionContext cx = dqRuleContext.condition(); + if (cx == null || (cx.sizeBasedCondition() == null && cx.numberBasedCondition() == null)) { + return Either.fromLeft( + String.format("Unexpected condition for rule of type %s with size return type", + ruleType.getRuleTypeName())); + } else if 
(cx.sizeBasedCondition() != null) { + Optional possibleCond = + parseSizeBasedCondition(dqRuleContext.condition().sizeBasedCondition()); + + if (possibleCond.isPresent()) { + response = Either.fromRight(possibleCond.get()); + } + } else if (cx.numberBasedCondition() != null) { + Optional possibleCond = + convertNumberToSizeCondition( + parseNumberBasedCondition(dqRuleContext.condition().numberBasedCondition())); + + if (possibleCond.isPresent()) { + response = Either.fromRight(possibleCond.get()); + } + } + break; + } default: break; } @@ -343,6 +642,24 @@ private Either parseCondition( return response; } + private Optional convertNumberToSizeCondition(Optional in) { + if (!in.isPresent() || !(in.get() instanceof NumberBasedCondition)) { + return Optional.empty(); + } + NumberBasedCondition input = (NumberBasedCondition) in.get(); + final String conditionAsString = input.getConditionAsString(); + final SizeBasedConditionOperator operator = SizeBasedConditionOperator.valueOf(input.getOperator().name()); + final List operands = input.getOperands().stream() + .filter(x -> x instanceof AtomicNumberOperand) + .filter(x -> Double.parseDouble(x.getOperand()) % 1 == 0) // filter only integer + .map(x -> new Size(Integer.parseInt(x.getOperand()), SizeUnit.B)) + .collect(Collectors.toList()); + if (operands.size() != input.getOperands().size()) { + return Optional.empty(); + } + return Optional.of(new SizeBasedCondition(conditionAsString, operator, operands)); + } + private Optional parseNumberBasedCondition( DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx) { @@ -350,118 +667,283 @@ private Optional parseNumberBasedCondition( Condition condition = null; if (ctx.BETWEEN() != null && ctx.number().size() == 2) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.BETWEEN, - Arrays.asList(ctx.number(0).getText(), ctx.number(1).getText())); + Optional operand1 = parseNumericOperand(ctx.number(0), false); + Optional operand2 = 
parseNumericOperand(ctx.number(1), false); + + if (operand1.isPresent() && operand2.isPresent()) { + NumberBasedConditionOperator op = (ctx.NOT() != null) ? + NumberBasedConditionOperator.NOT_BETWEEN + : NumberBasedConditionOperator.BETWEEN; + condition = new NumberBasedCondition(exprStr, op, Arrays.asList(operand1.get(), operand2.get())); + } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO, + Collections.singletonList(operand.get())); + } } else if (ctx.GREATER_THAN() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.GREATER_THAN, + Collections.singletonList(operand.get())); + } } else if (ctx.LESS_THAN() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.LESS_THAN, + Collections.singletonList(operand.get())); + } } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), 
false); + if (operand.isPresent()) { + condition = new NumberBasedCondition( + exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO, + Collections.singletonList(operand.get())); + } } else if (ctx.EQUAL_TO() != null && ctx.number().size() == 1) { - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.EQUALS, - Collections.singletonList(ctx.number(0).getText())); + Optional operand = parseNumericOperand(ctx.number(0), false); + if (operand.isPresent()) { + NumberBasedConditionOperator op = (ctx.NEGATION() != null) ? + NumberBasedConditionOperator.NOT_EQUALS + : NumberBasedConditionOperator.EQUALS; + condition = new NumberBasedCondition( + exprStr, op, Collections.singletonList(operand.get())); + } } else if (ctx.IN() != null && ctx.numberArray() != null && ctx.numberArray().number().size() > 0) { - List numbers = ctx.numberArray().number().stream() - .map(RuleContext::getText) + List> numbers = ctx.numberArray().number() + .stream() + .map(op -> parseNumericOperand(op, false)) .collect(Collectors.toList()); - condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.IN, numbers); + if (numbers.stream().allMatch(Optional::isPresent)) { + NumberBasedConditionOperator op = (ctx.NOT() != null) ? 
+ NumberBasedConditionOperator.NOT_IN + : NumberBasedConditionOperator.IN; + condition = new NumberBasedCondition(exprStr, op, + numbers.stream().map(Optional::get).collect(Collectors.toList())); + } } return Optional.ofNullable(condition); } + private Optional parseNumericOperand( + DataQualityDefinitionLanguageParser.NumberContext numberContext, boolean isParenthesized + ) { + if (numberContext.numberOp() != null) { + Optional operand1 = parseNumericOperand(numberContext.number(0), false); + Optional operand2 = parseNumericOperand(numberContext.number(1), false); + if (operand1.isPresent() && operand2.isPresent()) { + return Optional.of( + new BinaryExpressionOperand( + numberContext.getText(), + numberContext.numberOp().getText(), + operand1.get(), operand2.get(), + isParenthesized + ) + ); + } else { + return Optional.empty(); + } + } else if (numberContext.functionCall() != null) { + DataQualityDefinitionLanguageParser.FunctionCallContext fcc = numberContext.functionCall(); + String functionName = fcc.IDENTIFIER().getText(); + List functionParameters = new ArrayList<>(); + + if (fcc.functionParameters() != null) { + List> parameters = fcc.functionParameters().number() + .stream() + .map(op -> parseNumericOperand(op, false)) + .collect(Collectors.toList()); + + if (parameters.stream().allMatch(Optional::isPresent)) { + functionParameters = parameters.stream().map(Optional::get).collect(Collectors.toList()); + return Optional.of( + new FunctionCallOperand(fcc.getText(), functionName, functionParameters) + ); + } + } else { + // No parameter function + return Optional.of( + new FunctionCallOperand(fcc.getText(), functionName, functionParameters) + ); + } + } else if (numberContext.LPAREN() != null) { + return parseNumericOperand(numberContext.number(0), true); + } else if (numberContext.atomicNumber() != null) { + return Optional.of(new AtomicNumberOperand(numberContext.getText())); + } else if (numberContext.NULL() != null) { + return Optional.of(new 
NullNumericOperand(numberContext.getText())); + } + + return Optional.empty(); + } + private Optional parseStringBasedCondition( DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx ) { String exprStr = ctx.getText(); Condition condition = null; - if (ctx.EQUAL_TO() != null && ctx.quotedString() != null) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.EQUALS, - Collections.singletonList(removeQuotes(ctx.quotedString().QUOTED_STRING().getText()))); - } else if (ctx.IN() != null && - ctx.quotedStringArray() != null && - ctx.quotedStringArray().quotedString().size() > 0) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.IN, - ctx.quotedStringArray().quotedString().stream() - .map(s -> removeQuotes(removeEscapes(s.getText()))) - .collect(Collectors.toList()) - ); + if (ctx.EQUAL_TO() != null) { + StringBasedConditionOperator op = (ctx.NEGATION() != null) ? + StringBasedConditionOperator.NOT_EQUALS + : StringBasedConditionOperator.EQUALS; + + StringOperand operand; + if (ctx.variableDereference() != null) { + operand = new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText()); + } else if (ctx.stringValues() != null) { + Optional parsedOperand = parseStringOperand(ctx, Optional.of(ctx.stringValues()), op); + if (!parsedOperand.isPresent()) { + return Optional.empty(); + } + operand = parsedOperand.get(); + } else { + return Optional.empty(); + } + + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand)); + } else if (ctx.IN() != null) { + StringBasedConditionOperator op = (ctx.NOT() != null) ? 
+ StringBasedConditionOperator.NOT_IN + : StringBasedConditionOperator.IN; + + List operands; + if (ctx.variableDereference() != null) { + operands = Collections.singletonList( + new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText())); + } else if (ctx.stringValuesArray() != null && ctx.stringValuesArray().stringValues().size() > 0) { + operands = ctx.stringValuesArray().stringValues() + .stream() + .map(s -> parseStringOperand(ctx, Optional.of(s), op)) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList()); + } else { + return Optional.empty(); + } + + if (!operands.isEmpty()) { + condition = new StringBasedCondition(exprStr, op, operands); + } } else if (ctx.matchesRegexCondition() != null) { - condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.MATCHES, - Collections.singletonList(removeQuotes(ctx.matchesRegexCondition().quotedString().getText()))); + StringBasedConditionOperator op = (ctx.NOT() != null) ? + StringBasedConditionOperator.NOT_MATCHES + : StringBasedConditionOperator.MATCHES; + Optional operand = parseStringOperand(ctx, Optional.ofNullable(ctx.stringValues()), op); + if (operand.isPresent()) { + condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get())); + } } return Optional.ofNullable(condition); } + private Optional parseStringOperand( + DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx, + Optional + stringValuesContext, StringBasedConditionOperator op) { + + switch (op) { + case NOT_EQUALS: + case EQUALS: + Keyword keyword = parseKeyword(stringValuesContext.get()); + if (keyword == null) { + return Optional.of(new QuotedStringOperand( + removeQuotes(stringValuesContext.get().quotedString().getText()))); + } else { + return Optional.of(new KeywordStringOperand(keyword)); + } + case NOT_IN: + case IN: + keyword = parseKeyword(stringValuesContext.get()); + if (keyword == null) { + return Optional.of(new 
QuotedStringOperand( + removeQuotes(removeEscapes(stringValuesContext.get().quotedString().getText())))); + } else { + return Optional.of(new KeywordStringOperand(keyword)); + } + case MATCHES: + case NOT_MATCHES: + return Optional.of(new QuotedStringOperand( + removeQuotes(ctx.matchesRegexCondition().quotedString().getText()))); + default: + return Optional.empty(); + } + } + private Optional parseDateBasedCondition( - DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx) { + DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx, Map tags) { String exprStr = ctx.getText(); Condition condition = null; if (ctx.BETWEEN() != null && ctx.dateExpression().size() == 2) { - Optional lower = parseDateExpression(ctx.dateExpression(0)); - Optional upper = parseDateExpression(ctx.dateExpression(1)); + Optional lower = parseDateExpression(ctx.dateExpression(0), tags); + Optional upper = parseDateExpression(ctx.dateExpression(1), tags); if (lower.isPresent() && upper.isPresent()) { + DateBasedConditionOperator op = (ctx.NOT() != null) ? 
+ DateBasedConditionOperator.NOT_BETWEEN + : DateBasedConditionOperator.BETWEEN; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get()) + exprStr, op, Arrays.asList(lower.get(), upper.get()) ); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(operand.get()) ); } } else if (ctx.GREATER_THAN() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(operand.get()) ); } } else if (ctx.LESS_THAN() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.LESS_THAN, Collections.singletonList(operand.get()) ); } } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if (operand.isPresent()) { condition = new DateBasedCondition( exprStr, DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(operand.get()) ); } } else if (ctx.EQUAL_TO() != null && ctx.dateExpression().size() == 1) { - Optional operand = parseDateExpression(ctx.dateExpression(0)); + Optional operand = parseDateExpression(ctx.dateExpression(0), tags); if 
(operand.isPresent()) { + DateBasedConditionOperator op = (ctx.NEGATION() != null) ? + DateBasedConditionOperator.NOT_EQUALS + : DateBasedConditionOperator.EQUALS; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.EQUALS, Collections.singletonList(operand.get()) + exprStr, op, Collections.singletonList(operand.get()) ); } } else if (ctx.IN() != null && ctx.dateExpressionArray() != null && ctx.dateExpressionArray().dateExpression().size() > 0) { List> expressions = ctx.dateExpressionArray().dateExpression().stream() - .map(this::parseDateExpression) + .map(x -> parseDateExpression(x, tags)) .collect(Collectors.toList()); - if (expressions.stream().allMatch(Optional::isPresent)) { + DateBasedConditionOperator op = (ctx.NOT() != null) ? + DateBasedConditionOperator.NOT_IN + : DateBasedConditionOperator.IN; condition = new DateBasedCondition( - exprStr, DateBasedConditionOperator.IN, + exprStr, op, expressions.stream().map(Optional::get).collect(Collectors.toList()) ); } @@ -481,8 +963,11 @@ private Optional parseDurationBasedCondition( Optional lower = parseDuration(ctx.durationExpression(0)); Optional upper = parseDuration(ctx.durationExpression(1)); if (lower.isPresent() && upper.isPresent()) { + DurationBasedConditionOperator op = (ctx.NOT() != null) ? + DurationBasedConditionOperator.NOT_BETWEEN + : DurationBasedConditionOperator.BETWEEN; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get()) + exprStr, op, Arrays.asList(lower.get(), upper.get()) ); } } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.durationExpression().size() == 1) { @@ -520,8 +1005,11 @@ private Optional parseDurationBasedCondition( } else if (ctx.EQUAL_TO() != null && ctx.durationExpression().size() == 1) { Optional operand = parseDuration(ctx.durationExpression(0)); if (operand.isPresent()) { + DurationBasedConditionOperator op = (ctx.NEGATION() != null) ? 
+ DurationBasedConditionOperator.NOT_EQUALS + : DurationBasedConditionOperator.EQUALS; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.EQUALS, + exprStr, op, Collections.singletonList(operand.get()) ); } @@ -534,8 +1022,11 @@ private Optional parseDurationBasedCondition( .collect(Collectors.toList()); if (durations.stream().allMatch(Optional::isPresent)) { + DurationBasedConditionOperator op = (ctx.NOT() != null) ? + DurationBasedConditionOperator.NOT_IN + : DurationBasedConditionOperator.IN; condition = new DurationBasedCondition( - exprStr, DurationBasedConditionOperator.IN, + exprStr, op, durations.stream().map(Optional::get).collect(Collectors.toList()) ); } @@ -544,8 +1035,91 @@ private Optional parseDurationBasedCondition( return Optional.ofNullable(condition); } + private Optional parseSizeBasedCondition( + DataQualityDefinitionLanguageParser.SizeBasedConditionContext ctx + ) { + + String exprStr = ctx.getText(); + Condition condition = null; + + if (ctx.BETWEEN() != null && ctx.sizeExpression().size() == 2) { + Optional lower = parseSize(ctx.sizeExpression(0)); + Optional upper = parseSize(ctx.sizeExpression(1)); + if (lower.isPresent() && upper.isPresent()) { + SizeBasedConditionOperator op = (ctx.NOT() != null) ? 
+ SizeBasedConditionOperator.NOT_BETWEEN + : SizeBasedConditionOperator.BETWEEN; + condition = new SizeBasedCondition( + exprStr, op, Arrays.asList(lower.get(), upper.get()) + ); + } + } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.GREATER_THAN_EQUAL_TO, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.GREATER_THAN() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.GREATER_THAN, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.LESS_THAN() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.LESS_THAN, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + condition = new SizeBasedCondition( + exprStr, SizeBasedConditionOperator.LESS_THAN_EQUAL_TO, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.EQUAL_TO() != null && ctx.sizeExpression().size() == 1) { + Optional operand = parseSize(ctx.sizeExpression(0)); + if (operand.isPresent()) { + SizeBasedConditionOperator op = (ctx.NEGATION() != null) ? 
+ SizeBasedConditionOperator.NOT_EQUALS + : SizeBasedConditionOperator.EQUALS; + condition = new SizeBasedCondition( + exprStr, op, + Collections.singletonList(operand.get()) + ); + } + } else if (ctx.IN() != null && + ctx.sizeExpressionArray() != null && + ctx.sizeExpressionArray().sizeExpression().size() > 0) { + + List> sizes = ctx.sizeExpressionArray().sizeExpression().stream() + .map(this::parseSize) + .collect(Collectors.toList()); + + if (sizes.stream().allMatch(Optional::isPresent)) { + SizeBasedConditionOperator op = (ctx.NOT() != null) ? + SizeBasedConditionOperator.NOT_IN + : SizeBasedConditionOperator.IN; + condition = new SizeBasedCondition( + exprStr, op, + sizes.stream().map(Optional::get).collect(Collectors.toList()) + ); + } + } + + return Optional.ofNullable(condition); + } + private Optional parseDateExpression( - DataQualityDefinitionLanguageParser.DateExpressionContext ctx) { + DataQualityDefinitionLanguageParser.DateExpressionContext ctx, Map tags) { if (ctx.durationExpression() != null) { Optional duration = parseDuration(ctx.durationExpression()); return duration.map(value -> new DateExpression.CurrentDateExpression( @@ -556,11 +1130,41 @@ private Optional parseDateExpression( )); } else if (ctx.dateNow() != null) { return Optional.of(new DateExpression.CurrentDate()); + } else if (ctx.NULL() != null) { + return Optional.of(new NullDateExpression()); + } else if (ctx.timeExpression() != null) { + final String time = removeQuotes(ctx.timeExpression().MIL_TIME() != null + ? ctx.timeExpression().MIL_TIME().getText() + : ctx.timeExpression().TIME().getText()); + final String pattern = ctx.timeExpression().MIL_TIME() != null + ? 
MILITARY_TIME_FORMAT + : AMPM_TIME_FORMAT; + final String timeZone = tags.getOrDefault("timeZone", "UTC"); + return parseTime(time, pattern, timeZone); } else { return Optional.of(new DateExpression.StaticDate(removeQuotes(ctx.DATE().getText()))); } } + private Optional parseTime(final String in, final String pattern, final String timeZone) { + try { + final ZoneId zoneId = ZoneId.of(timeZone); // https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html + final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); + final LocalTime time = LocalTime.parse(in, formatter); + final LocalDate today = LocalDate.now(); + final LocalDateTime localDateTime = LocalDateTime.of(today, time); + final ZonedDateTime zonedDateTime = localDateTime.atZone(zoneId); + final ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC); + return Optional.of(new DateExpression.StaticDateTime(utcTime.toLocalDateTime(), in)); + } catch (final DateTimeParseException e) { + errorMessages.add(String.format("Error Parsing Date: %s. %s.", in, e.getMessage())); + return Optional.empty(); + } catch (final ZoneRulesException e) { + errorMessages.add(String.format("Error Parsing Time Zone: %s. %s.", timeZone, e.getMessage())); + return Optional.empty(); + } + } + private Optional parseDuration( DataQualityDefinitionLanguageParser.DurationExpressionContext ctx) { int amount = Integer.parseInt(ctx.INT() != null ? ctx.INT().getText() : ctx.DIGIT().getText()); @@ -572,6 +1176,17 @@ private Optional parseDuration( } } + private Optional parseSize( + DataQualityDefinitionLanguageParser.SizeExpressionContext ctx) { + int amount = Integer.parseInt(ctx.INT() != null ? 
ctx.INT().getText() : ctx.DIGIT().getText()); + if (ctx.sizeUnit().exception != null) { + return Optional.empty(); + } else { + SizeUnit unit = SizeUnit.valueOf(ctx.sizeUnit().getText().toUpperCase()); + return Optional.of(new Size(amount, unit)); + } + } + private String removeQuotes(String quotedString) { if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { quotedString = quotedString.substring(1); @@ -584,4 +1199,69 @@ private String removeEscapes(String stringWithEscapes) { stringWithEscapes = stringWithEscapes.replaceAll("\\\\(.)", "$1"); return stringWithEscapes; } + + private List parseParameters( + List parameters) { + if (parameters == null) return new ArrayList<>(); + return parameters.stream().map(this::parseParameter).collect(Collectors.toList()); + } + + private DQRuleParameterValue parseParameter( + DataQualityDefinitionLanguageParser.ParameterWithConnectorWordContext pc) { + String connectorWord = pc.connectorWord() == null ? "" : pc.connectorWord().getText(); + + if (pc.parameter().QUOTED_STRING() != null) { + return new DQRuleParameterValue( + removeQuotes(pc.parameter().QUOTED_STRING().getText()), true, connectorWord); + } else if (pc.parameter().IDENTIFIER() != null) { + return new DQRuleParameterValue( + pc.parameter().IDENTIFIER().getText(), false, connectorWord); + } else { + return new DQRuleParameterValue(pc.parameter().getText(), true, connectorWord); + } + } + + private List validateDictionary(DataQualityDefinitionLanguageParser.DictionaryContext dc) { + List dictionaryErrors = new ArrayList<>(); + if (dc.pair() == null || (dc.pair().size() == 1 && dc.pair().get(0).getText().isEmpty())) { + dictionaryErrors.add("Empty dictionary provided"); + } + return dictionaryErrors; + } + + private Keyword parseKeyword( + DataQualityDefinitionLanguageParser.StringValuesContext stringValuesContext) { + Keyword keyword = null; + try { + String operand = stringValuesContext.getText().toUpperCase(); + if (isValidEnumValue(operand)) { + 
Method method = stringValuesContext.getClass().getMethod(operand); + Object result = method.invoke(stringValuesContext); + if (result != null) { + keyword = Keyword.valueOf(operand); + } + } + } catch (IllegalArgumentException | IllegalAccessException | NoSuchMethodException | + InvocationTargetException e) { + errorMessages.add(e.getMessage()); + } + return keyword; + } + + private boolean isValidEnumValue(String value) { + try { + Enum.valueOf(Keyword.class, value); + return true; + } catch (IllegalArgumentException e) { + return false; + } + } + + private String processStringValues(DataQualityDefinitionLanguageParser.StringValuesContext sv) { + if (sv.quotedString() != null) { + return removeQuotes(sv.quotedString().getText()); + } + return sv.getText(); + } + } diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java new file mode 100644 index 0000000..16a7178 --- /dev/null +++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java @@ -0,0 +1,59 @@ +package com.amazonaws.glue.ml.dataquality.dqdl.parser; + +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Static helper that replaces variable references inside string conditions with the values of the + * variables they name. + */ +public final class DQDLVariableResolver { + + /** Always throws: this class only exposes static methods and must not be instantiated. */ + private DQDLVariableResolver() { + throw new AssertionError("Utility class should not be instantiated"); + } + + /** + * Substitutes the variable references of a string condition. + * + * Conditions that are not a {@link StringBasedCondition} are returned unchanged. Otherwise every + * {@link VariableReferenceOperand} whose name is a key of {@code variables} is replaced by a + * {@link QuotedStringOperand} holding the value's {@code toString()}, or by one such operand per element + * when the value is a {@link List}. All other operands, including references to unknown variables, are kept. + * + * @param condition condition to resolve + * @param variables variables by name + * @param usedVars receives an entry for every variable that was substituted + * @return the input condition, or a new condition with the same text and operator, the resolved operands, + * and the original operands passed as the last constructor argument + */ + public static Condition 
resolveVariablesInCondition(Condition condition, Map variables, + Map usedVars) { + if (!(condition instanceof StringBasedCondition)) { + return condition; + } + + final StringBasedCondition source = (StringBasedCondition) condition; + final List<StringOperand> expanded = new ArrayList<>(); + + for (StringOperand candidate : source.getOperands()) { + if (!(candidate instanceof VariableReferenceOperand)) { + expanded.add(candidate); + continue; + } + + final String name = candidate.getOperand(); + final DQVariable bound = variables.get(name); + if (bound == null) { + /* Unknown variable: leave the reference in place. */ + expanded.add(candidate); + continue; + } + + usedVars.put(name, bound); + final Object raw = bound.getValue(); + if (raw instanceof List) { + /* A list-valued variable contributes one quoted operand per element. */ + ((List<?>) raw).forEach(item -> expanded.add(new QuotedStringOperand(item.toString()))); + } else { + expanded.add(new QuotedStringOperand(raw.toString())); + } + } + + return new StringBasedCondition( + source.getConditionAsString(), + source.getOperator(), + expanded, + source.getOperands() + ); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java new file mode 100644 index 0000000..174144a --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java @@ -0,0 +1,89 @@ +/* + * DQAnalyzerTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import com.amazonaws.glue.ml.dataquality.dqdl.exception.InvalidDataQualityRulesetException; +import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class DQAnalyzerTest { + DQDLParser parser = new DQDLParser(); + + @Test + void test_singleAnalyzer() { + String column = "colA"; + String ruleset = String.format("Rules = [ IsComplete \"%s\" ] Analyzers = [ Completeness \"%s\" ]", column, column); + + try { + DQRuleset dqRuleset = parser.parse(ruleset); + DQAnalyzer dqAnalyzer = dqRuleset.getAnalyzers().get(0); + assertEquals("Completeness", dqAnalyzer.getRuleType()); + assertEquals(1, dqAnalyzer.getParameters().size()); + assertTrue(dqAnalyzer.getParameters().containsValue(column)); + } catch (InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + + @ParameterizedTest + @MethodSource("provideRawAnalyzers") + void test_analyzerParsingAndGeneratingWithParser(String analyzer) { + try { + DQRuleset dqRuleset = parser.parse(String.format("Rules = [ IsComplete \"colA\" ] Analyzers = [ %s ]", analyzer)); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals(1, dqRuleset.getAnalyzers().size()); + + DQAnalyzer dqAnalyzer = dqRuleset.getAnalyzers().get(0); + String dqAnalyzerAsString = dqAnalyzer.toString(); + assertEquals(analyzer, dqAnalyzerAsString); + } catch (InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + + private static Stream provideRawAnalyzers() { + return Stream.of( + Arguments.of("RowCount"), + Arguments.of("RowCountMatch 
\"reference\""), + Arguments.of("Completeness \"col_1\""), + Arguments.of("ColumnCount"), + Arguments.of("ColumnCorrelation \"col_1\" \"col_2\""), + Arguments.of("Uniqueness \"col_1\""), + Arguments.of("Sum \"col_A-B.C\""), + Arguments.of("Mean \"col_A-B.CD\""), + Arguments.of("StandardDeviation \"col_A-B.CD\""), + Arguments.of("Entropy \"col_A-B.CD\""), + Arguments.of("DistinctValuesCount \"col_A-B.CD\""), + Arguments.of("UniqueValueRatio \"col_A-B.CD\""), + Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\""), + Arguments.of("ReferentialIntegrity \"col-A,col-B\" \"reference.{col-A1,col-A2}\""), + Arguments.of("DatasetMatch \"reference\" \"ID1,ID2\""), + Arguments.of("DatasetMatch \"reference\" \"ID1,ID2\" \"colA,colB,colC\""), + Arguments.of("DatasetMatch \"reference\" \"ID1->ID11,ID2->ID22\" \"colA->colAA\""), + Arguments.of("SchemaMatch \"ref-1\""), + Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(colB)\""), + Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\""), + Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\""), + Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\""), + Arguments.of("CustomSql \"select count(*) from primary\""), + Arguments.of("AllStatistics \"id\"") + ); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java new file mode 100644 index 0000000..1e50325 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java @@ -0,0 +1,80 @@ +/* + * DQRuleParameterValueTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class DQRuleParameterValueTest { + + @Test + public void test_constructorWithValueArg() { + String value = "col-A"; + DQRuleParameterValue param = new DQRuleParameterValue(value); + assertEquals(value, param.getValue()); + assertFalse(param.isQuoted()); + assertTrue(param.getConnectorWord().isEmpty()); + } + + @Test + public void test_constructorWithValueAndIsQuotedArgs() { + String value = "col-A"; + boolean isQuoted = true; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted); + assertEquals(value, param.getValue()); + assertEquals(isQuoted, param.isQuoted()); + assertTrue(param.getConnectorWord().isEmpty()); + } + + @Test + public void test_parameterValueToStringWithNoConnectorWordAndNoQuotes() { + String value = "col-A"; + String connectorWord = ""; + boolean isQuoted = false; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(value, param.toString()); + } + + @Test + public void test_parameterValueToStringWithConnectorWordAndNoQuotes() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = false; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(String.format("%s %s", connectorWord, value), param.toString()); + } + + @Test + public void test_parameterValueToStringWithConnectorWordAndWithQuotes() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = true; + DQRuleParameterValue param = new DQRuleParameterValue(value, isQuoted, connectorWord); + assertEquals(String.format("%s \"%s\"", connectorWord, value), param.toString()); + } + + 
@Test + public void test_equalsAndHashCode() { + String value = "col-A"; + String connectorWord = "of"; + boolean isQuoted = true; + + DQRuleParameterValue param1 = new DQRuleParameterValue(value, isQuoted, connectorWord); + DQRuleParameterValue param2 = new DQRuleParameterValue(value, isQuoted, connectorWord); + + assertNotSame(param1, param2); + assertEquals(param1, param2); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java index 98aeed5..3139102 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java @@ -15,7 +15,10 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition; import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand; import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -29,15 +32,22 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; +import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Date; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static 
org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -62,76 +72,153 @@ void test_ruleParsingAndGeneratingWithParser(String rule) { } } + @ParameterizedTest + @MethodSource("provideRawRules") + void test_rulesEqualWhenRepresentationsEqual(String ruleStringRepr) { + try { + DQRule rule1 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0); + DQRule rule2 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0); + + assertEquals(rule1, rule2); + assertTrue(rule1.equals(rule2)); + assertEquals(rule1.hashCode(), rule2.hashCode()); + assertNotSame(rule1, rule2); + } catch (InvalidDataQualityRulesetException e) { + fail(e.getMessage()); + } + } + private static Stream provideRawRules() { return Stream.of( - // Arguments.of("JobStatus = \"SUCCEEDED\""), - // Arguments.of("JobStatus in [\"SUCCEEDED\",\"READY\"]"), - // Arguments.of("JobDuration between 10 and 1000"), - // Arguments.of("JobDuration between -10 and 1000"), - // Arguments.of("FileCount between 10 and 100"), - // Arguments.of("FileCount between -10000 and -1000"), Arguments.of("IsPrimaryKey \"colA\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\""), + Arguments.of("IsPrimaryKey colA \"col B\""), Arguments.of("IsPrimaryKey \"colA\" \"colB\" \"colC\""), + Arguments.of("IsPrimaryKey \"colA\" where \"colA > 100\""), Arguments.of("RowCount = 100"), + Arguments.of("RowCount != 100"), Arguments.of("RowCount = -100"), + Arguments.of("RowCount = 100 where \"colA > 100\""), + Arguments.of("RowCount between (0.9 * average(last(10))) and 1.1 * average(last(10))"), + Arguments.of("RowCount not between (0.9 * average(last(10))) and 1.1 * average(last(10))"), Arguments.of("RowCountMatch \"reference\" = 
1.0"), Arguments.of("RowCountMatch \"reference\" >= 0.95"), Arguments.of("RowCountMatch \"reference\" between 0.8 and 0.98"), Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"), + Arguments.of("Completeness of col_1 between 0.5 and 0.8"), + Arguments.of("Completeness of col_1 not between 0.5 and 0.8"), + Arguments.of("Completeness \"col_1\" between 0.5 and 0.8 where \"col-A > 100\""), Arguments.of("IsComplete \"col_1\""), + Arguments.of("IsComplete \"col_1\" where \"col-A > 100\""), Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"), + Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"), Arguments.of("ColumnDataType \"col_1\" = \"String\""), + Arguments.of("ColumnDataType \"col_1\" != \"String\""), + Arguments.of("ColumnDataType \"col_2\" = \"Integer\""), Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"), Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9"), + Arguments.of("ColumnDataType \"col_1\" = \"String\" where \"col-A > 100\""), Arguments.of("ColumnNamesMatchPattern \"aws_.*_[a-zA-Z0-9]+\""), Arguments.of("ColumnExists \"load_dt\""), Arguments.of("ColumnCount >= 100"), + Arguments.of("ColumnCount = avg(std(last(10)))"), + Arguments.of("ColumnCount != avg(std(last(10)))"), + Arguments.of("ColumnCount = avg(std(last(percentile(1,2,3))))"), Arguments.of("ColumnCount > -100.123456"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8"), + Arguments.of("ColumnCorrelation of col_1 col_2 between 0.4 and 0.8"), + Arguments.of("ColumnCorrelation of col_1 and \"col abc\" between 0.4 and 0.8"), Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between -0.44444 and 0.888888"), + Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8 where \"col-A > 100\""), Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2"), + 
Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2 where \"col-A > 100\""), Arguments.of("IsUnique \"col_1\""), + Arguments.of("IsUnique \"col_1\" where \"col-A > 100\""), Arguments.of("Uniqueness \"col_1\" between -0.00000001 and 0.00000000000002"), Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\""), + Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\" where \"col-A > 100\""), Arguments.of("ColumnValues \"load_dt\" > (now() - 1 days)"), Arguments.of("ColumnValues \"order-id\" in [1,2,3,4]"), + Arguments.of("ColumnValues \"order-id\" in [1,2,3,4,NULL]"), + Arguments.of("ColumnValues \"order-id\" not in [1,2,3,4]"), Arguments.of("ColumnValues \"order-id\" in [\"1\",\"2\",\"3\",\"4\"]"), + Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"), + Arguments.of("ColumnValues \"col-A\" < (now() + 4 minutes)"), + Arguments.of("ColumnValues \"col-A\" < (now() - 25 minutes)"), + Arguments.of("ColumnValues \"col-A\" > \"9:30 AM\""), + Arguments.of("ColumnValues \"col-A\" > \"9:30 PM\""), + Arguments.of("ColumnValues \"col-A\" > \"19:30\""), + Arguments.of("ColumnValues \"col-A\" between \"9:00 AM\" and \"21:50\""), Arguments.of("Sum \"col_A-B.C\" > 100.0"), Arguments.of("Sum \"col_A-B.C\" > -100.0"), + Arguments.of("Sum \"col_A-B.C\" > -100.0 where \"col-A > 100\""), Arguments.of("Mean \"col_A-B.CD\" between 10 and 20"), Arguments.of("Mean \"col_A-B.CD\" between -20 and -10"), + Arguments.of("Mean \"col_A-B.CD\" between -20 and -10 where \"col-A > 100\""), Arguments.of("StandardDeviation \"col_A-B.CD\" <= 10.0"), Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0"), + Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0 where \"col-A > 100\""), Arguments.of("Entropy \"col_A-B.CD\" <= 10.0"), Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30"), + Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""), Arguments.of("DistinctValuesCount \"col_A-B.CD\" > 
1000"), Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30"), + Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""), Arguments.of("UniqueValueRatio \"col_A-B.CD\" < 0.5"), Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5"), + Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5 where \"col-A > 100\""), Arguments.of("ColumnLength \"col_A-B.CD\" < 10"), Arguments.of("ColumnLength \"col_A-B.CD\" >= 100"), + Arguments.of("ColumnLength \"col_A-B.CD\" >= 100 where \"col-A > 100\""), Arguments.of("ColumnValues \"col-A\" matches \"[a-zA-Z0-9]*\""), + Arguments.of("ColumnValues \"col-A\" not matches \"[a-zA-Z0-9]*\""), Arguments.of("ColumnValues \"col-A\" >= now()"), Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and now()"), + Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and now()"), Arguments.of("ColumnValues \"col-A\" between now() and (now() + 3 hours)"), Arguments.of("ColumnValues \"col-A\" < (now() + 4 days)"), Arguments.of("ColumnValues \"col-A\" = (now() - 3 hours)"), + Arguments.of("ColumnValues \"col-A\" != (now() - 3 hours)"), Arguments.of("ColumnValues \"col-A\" in [now(),(now() - 3 hours),now(),(now() + 4 days)]"), + Arguments.of("ColumnValues \"col-A\" not in [now(),(now() - 3 hours),now(),(now() + 4 days)]"), Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and (now() + 14 days)"), + Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and (now() + 14 days)"), Arguments.of("ColumnValues \"col-A\" matches \"[a-z]*\" with threshold <= 0.4"), + Arguments.of("ColumnValues \"col-A\" not matches \"[a-z]*\" with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"A\",\"B\"] with threshold <= 0.4"), Arguments.of("ColumnValues \"col-A\" in [1,2,3] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"A\" with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" = NULL"), + 
Arguments.of("ColumnValues \"col-A\" = EMPTY"), + Arguments.of("ColumnValues \"col-A\" = WHITESPACES_ONLY"), + Arguments.of("ColumnValues \"col-A\" != NULL"), + Arguments.of("ColumnValues \"col-A\" != EMPTY"), + Arguments.of("ColumnValues \"col-A\" != WHITESPACES_ONLY"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" not in [\"a\",NULL]"), + Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL,EMPTY,WHITESPACES_ONLY]"), + Arguments.of("ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]"), + Arguments.of("(ColumnValues \"col-A\" not in [NULL,EMPTY,WHITESPACES_ONLY]) OR (ColumnValues \"col-B\" != WHITESPACES_ONLY)"), + Arguments.of("(ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]) AND (ColumnValues \"col-B\" != WHITESPACES_ONLY)"), Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold between 0.4 and 0.8"), + Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold not between 0.4 and 0.8"), Arguments.of("ColumnValues \"col-A\" > 0.4 with threshold > 0.4"), Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\"] with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" = NULL"), + Arguments.of("ColumnValues \"col-A\" != NULL"), + Arguments.of("ColumnValues \"col-A\" in [NULL]"), + Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\",NULL] with threshold > 0.98"), + Arguments.of("ColumnValues \"col-A\" not in [\"2022-01-01\",NULL] with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = 1 with threshold > 0.98"), Arguments.of("ColumnValues \"col-A\" = \"2022-01-01\" with threshold > 0.98"), Arguments.of("DataFreshness \"col-A\" <= 3 days"), - Arguments.of("DataFreshness \"col-A\" > 30 hours"), + Arguments.of("DataFreshness \"col-A\" > 3 minutes"), + Arguments.of("DataFreshness \"col-A\" > 90 minutes"), Arguments.of("DataFreshness \"col-A\" between 2 days and 4 days"), + Arguments.of("DataFreshness \"col-A\" between 2 minutes 
and 4 minutes"), + Arguments.of("DataFreshness \"col-A\" <= 3 days where \"col-A > 100\""), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" between 0.4 and 0.6"), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" > 0.98"), Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" = 0.99"), @@ -146,12 +233,237 @@ private static Stream provideRawRules() { Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.1"), Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\" between 0.8 and 0.9"), Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\" >= 0.95"), - Arguments.of( "CustomSql \"select count(*) from primary\" > 0"), - Arguments.of( "CustomSql \"select col-A from primary\""), - Arguments.of( "CustomSql \"select col-A from primary\" with threshold > 0.5") + Arguments.of("CustomSql \"select count(*) from primary\" > 0"), + Arguments.of("CustomSql \"select col-A from primary\""), + Arguments.of("CustomSql \"select col-A from primary\" with threshold > 0.5"), + Arguments.of("DetectAnomalies \"RowCount\""), + Arguments.of("DetectAnomalies of RowCount"), + Arguments.of("DetectAnomalies of Completeness of \"colA\""), + Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\""), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch in [\"hashList\",\"hashList\"]"), + Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), + Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with randomTagThing = \"@sampom\""), + Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with tag1 = \"sampom\" with tag2 = \"pomsam\""), + Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""), + Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9"), + Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""), + 
Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)"), + Arguments.of("FileUniqueness \"S3://PATH\" > 0.9"), + Arguments.of("FileUniqueness > 0.5"), + Arguments.of("FileSize between 1 B and 1 GB"), + Arguments.of("FileSize not between 50 GB and 1 TB"), + Arguments.of("FileSize > 5 B"), + Arguments.of("FileSize >= 5 KB"), + Arguments.of("FileSize < 5 MB"), + Arguments.of("FileSize <= 5 GB"), + Arguments.of("FileSize = 5 TB"), + Arguments.of("FileSize != 5 B"), + Arguments.of("FileSize in [5 B]"), + Arguments.of("FileSize not in [500 KB,150 GB]"), + Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") OR (IsUnique \"colA\")"), + Arguments.of("(RowCount > 0) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("FileFreshness \"S3://PATH\" > (now() - 30 minutes)"), + Arguments.of("FileFreshness \"S3://PATH\" > (now() + 45 minutes)"), + Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 AM\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 PM\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"09:30\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"13:30\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with timeZone = \"America/New_York\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 PM\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 AM\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"09:30\" and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between (now() - 2 hours) and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between (now() + 5 minutes) and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and \"21:45\""), + Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and (now() + 10 minutes)") ); } + @Test + void 
test_TagFormatting() throws Exception { + final String rule = "Rules = [ " + + "FileFreshness > \"9:30 AM\" with recentFiles = 1, " + + "FileFreshness > \"9:30 AM\" with recentFiles = \"1\", " + + "FileFreshness > \"9:30 AM\" with matchFileName = \"True\", " + + "FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\" " + + "]"; + List rules = parser.parse(rule).getRules(); + assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = 1", rules.get(0).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = \"1\"", rules.get(1).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with matchFileName = \"True\"", rules.get(2).toString()); + assertEquals("FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\"", rules.get(3).toString()); + } + + /** + * Checks that time literals are evaluated in the zone named by the {@code timeZone} tag, and in UTC when + * the tag is absent. The evaluated value is converted back to the rule's zone rather than compared with a + * hard-coded UTC time, because America/New_York is UTC-5 in winter and UTC-4 during daylight saving time. + */ + @Test + void test_Timezone() throws Exception { + String rule = "Rules = [ FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\", FileFreshness > \"19:30\" with timeZone = \"Asia/Dubai\", FileFreshness > \"9:30 AM\" ]"; + List rules = parser.parse(rule).getRules(); + DateBasedCondition c1 = (DateBasedCondition) rules.get(0).getCondition(); + DateBasedCondition c2 = (DateBasedCondition) rules.get(1).getCondition(); + DateBasedCondition c3 = (DateBasedCondition) rules.get(2).getCondition(); + assertWallClockTimeToday("09:30", "America/New_York", c1.getOperands().get(0).getEvaluatedExpression().toString()); + assertWallClockTimeToday("19:30", "Asia/Dubai", c2.getOperands().get(0).getEvaluatedExpression().toString()); + assertWallClockTimeToday("09:30", "UTC", c3.getOperands().get(0).getEvaluatedExpression().toString()); + } + + /** + * Asserts that a UTC date-time string shows the expected wall-clock time once converted to the given zone. + * + * @param expectedLocalTime expected {@code HH:mm} time in {@code zone} + * @param zone zone ID the rule was written for + * @param evaluatedUtc ISO local date-time string expressed in UTC, e.g. {@code 2024-01-01T14:30} + */ + private static void assertWallClockTimeToday(String expectedLocalTime, String zone, String evaluatedUtc) { + java.time.ZoneId zoneId = java.time.ZoneId.of(zone); + java.time.ZonedDateTime local = java.time.LocalDateTime.parse(evaluatedUtc) + .atZone(java.time.ZoneOffset.UTC) + .withZoneSameInstant(zoneId); + assertEquals(expectedLocalTime, local.toLocalTime().toString()); + /* The anchor date is "today"; allow one day of slack since the JVM zone and the rule zone can be on different calendar days. */ + long dayDrift = java.time.temporal.ChronoUnit.DAYS.between(java.time.LocalDate.now(zoneId), local.toLocalDate()); + assertTrue(Math.abs(dayDrift) <= 1); + } + + @Test + void test_AMPM_Parsing() throws Exception { + String rule = "Rules = [ FileFreshness \"S3://PATH\" between \"9:15 AM\" and \"21:45\" ]"; + DQRule parsedRule = parser.parse(rule).getRules().get(0); + DateBasedCondition c1 = (DateBasedCondition) 
parsedRule.getCondition(); + DateExpression d1 = c1.getOperands().get(0); + DateExpression d2 = c1.getOperands().get(1); + Date today = new Date(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + String todayStr = sdf.format(today); + assertEquals("FileFreshness", parsedRule.getRuleType()); + assertEquals("between \"9:15 AM\" and \"21:45\"", c1.getFormattedCondition()); + assertEquals("\"9:15 AM\"" ,d1.getFormattedExpression()); + assertEquals("\"21:45\"" ,d2.getFormattedExpression()); + assertEquals(todayStr + "T09:15", d1.getEvaluatedExpression().toString()); + assertEquals(todayStr + "T21:45", d2.getEvaluatedExpression().toString()); + } + + @Test + void test_sizeConditionParsing() throws Exception { + List unitList = Arrays.asList("B", "KB", "MB", "GB", "TB"); + for (String unit : unitList) { + String rule = String.format("Rules = [ FileSize = 2 %s ]", unit); + DQRule parsedRule = parser.parse(rule).getRules().get(0); + assertEquals("FileSize", parsedRule.getRuleType()); + SizeBasedCondition c = (SizeBasedCondition) parsedRule.getCondition(); + assertEquals(unit, c.getOperands().get(0).getUnit().name()); + } + + String defaultByte = "Rules = [ FileSize > 2, FileSize in [3,4,5,6] ]"; + List rules = parser.parse(defaultByte).getRules(); + DQRule parsedRuleNoUnit0 = rules.get(0); + DQRule parsedRuleNoUnit1 = rules.get(1); + assertEquals("FileSize", parsedRuleNoUnit0.getRuleType()); + assertEquals("FileSize", parsedRuleNoUnit1.getRuleType()); + SizeBasedCondition c0 = (SizeBasedCondition) parsedRuleNoUnit0.getCondition(); + SizeBasedCondition c1 = (SizeBasedCondition) parsedRuleNoUnit1.getCondition(); + assertEquals("B", c0.getOperands().get(0).getUnit().name()); + for (Size unit : c1.getOperands()) { + assertEquals("B", unit.getUnit().name()); + } + } + + @Test + void test_fileFileFreshnessParsing() throws Exception { + String fileRules = "Rules = [ " + + "FileFreshness \"S3://path\" between \"2023-02-07\" and \"2024-07-15\", " + + "FileFreshness 
\"S3://path\" > (now() - 3 days), " + + "FileFreshness \"S3://path\" < (now() - 4 days), " + + "FileFreshness between \"2023-02-07\" and \"2024-07-15\", " + + "FileFreshness > (now() + 35 minutes), " + + "FileFreshness <= (now() - 35 minutes), " + + "FileFreshness = (now() + 70 minutes) " + + "]"; + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + DQRule rule0 = ruleList.get(0); + + DateBasedCondition c0 = (DateBasedCondition) rule0.getCondition(); + assertEquals("FileFreshness", rule0.getRuleType()); + assertEquals("S3://path", rule0.getParameters().get("DataPath")); + assertEquals("2023-02-07", removeQuotes(c0.getOperands().get(0).getFormattedExpression())); + assertEquals("2024-07-15", removeQuotes(c0.getOperands().get(1).getFormattedExpression())); + + DQRule rule1 = ruleList.get(1); + DateBasedCondition c1 = (DateBasedCondition) rule1.getCondition(); + assertEquals("FileFreshness", rule1.getRuleType()); + assertEquals("S3://path", rule1.getParameters().get("DataPath")); + assertEquals("GREATER_THAN", c1.getOperator().toString()); + assertEquals("(now() - 3 days)", c1.getOperands().get(0).getFormattedExpression()); + + DQRule rule2 = ruleList.get(2); + DateBasedCondition c2 = (DateBasedCondition) rule2.getCondition(); + assertEquals("FileFreshness", rule2.getRuleType()); + assertEquals("S3://path", rule2.getParameters().get("DataPath")); + assertEquals("LESS_THAN", c2.getOperator().toString()); + assertEquals("(now() - 4 days)", c2.getOperands().get(0).getFormattedExpression()); + + DQRule rule3 = ruleList.get(3); + DateBasedCondition c3 = (DateBasedCondition) rule3.getCondition(); + assertEquals("FileFreshness", rule3.getRuleType()); + assertFalse(rule3.getParameters().containsKey("DataPath")); + assertEquals("2023-02-07", removeQuotes(c3.getOperands().get(0).getFormattedExpression())); + assertEquals("2024-07-15", removeQuotes(c3.getOperands().get(1).getFormattedExpression())); + + DQRule rule4 = ruleList.get(4); + 
DateBasedCondition c4 = (DateBasedCondition) rule4.getCondition(); + assertEquals("(now() + 35 minutes)", c4.getOperands().get(0).getFormattedExpression()); + + DQRule rule5 = ruleList.get(5); + DateBasedCondition c5 = (DateBasedCondition) rule5.getCondition(); + assertEquals("(now() - 35 minutes)", c5.getOperands().get(0).getFormattedExpression()); + + DQRule rule6 = ruleList.get(6); + DateBasedCondition c6 = (DateBasedCondition) rule6.getCondition(); + assertEquals("(now() + 70 minutes)", c6.getOperands().get(0).getFormattedExpression()); + } + + @Test + void test_checksumRuleParsing() throws Exception { + String fileRules = "Rules = [ " + + "FileMatch in [\"exampleHash\"] with hashAlgorithm = \"MD5\" with dataFrame = \"true\" ," + + "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with hashAlgorithm = \"SHA-256\" ," + + "FileMatch \"s3://sampom-bucket3/\" in [\"exampleHash3\"] ," + + "FileMatch in [\"exampleHash4\"] with dataFrame = \"true\"" + + "]"; + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + + DQRule rule0 = ruleList.get(0); + assertEquals("FileMatch", rule0.getRuleType()); + assertEquals("exampleHash", ((StringBasedCondition) rule0.getCondition()).getOperands().get(0).getOperand()); + assertEquals("MD5", rule0.getTags().get("hashAlgorithm")); + assertEquals("true", rule0.getTags().get("dataFrame")); + + DQRule rule1 = ruleList.get(1); + assertEquals("FileMatch", rule1.getRuleType()); + assertEquals("s3://sampom-bucket2/", rule1.getParameters().get("DataPath")); + assertEquals("exampleHash2", ((StringBasedCondition) rule1.getCondition()).getOperands().get(0).getOperand()); + assertEquals("SHA-256", rule1.getTags().get("hashAlgorithm")); + + DQRule rule2 = ruleList.get(2); + assertEquals("FileMatch", rule2.getRuleType()); + assertEquals("s3://sampom-bucket3/", rule2.getParameters().get("DataPath")); + assertEquals("exampleHash3", ((StringBasedCondition) 
rule2.getCondition()).getOperands().get(0).getOperand()); + + DQRule rule3 = ruleList.get(3); + assertEquals("FileMatch", rule3.getRuleType()); + assertEquals("exampleHash4", ((StringBasedCondition) rule3.getCondition()).getOperands().get(0).getOperand()); + } + + @Test + void test_fileMatchRuleParsing() throws Exception { + String fileRules = "Rules = [ " + + "FileMatch \"s3://sampom-bucket1/\" \"s3://sampom-bucket2/\"," + + "FileMatch \"s3://sampom-bucket1/file1.json\" \"s3://sampom-bucket2/file2.json\"" + + "]"; + DQRuleset dqRuleset = parser.parse(fileRules); + List ruleList = dqRuleset.getRules(); + + DQRule rule0 = ruleList.get(0); + assertEquals("FileMatch", rule0.getRuleType()); + assertEquals("s3://sampom-bucket1/", rule0.getParameters().get("DataPath")); + assertEquals("s3://sampom-bucket2/", rule0.getParameters().get("CompareDataPath")); + + DQRule rule1 = ruleList.get(1); + assertEquals("FileMatch", rule0.getRuleType()); + assertEquals("s3://sampom-bucket1/file1.json", rule1.getParameters().get("DataPath")); + assertEquals("s3://sampom-bucket2/file2.json", rule1.getParameters().get("CompareDataPath")); + } + @Test void test_toStringIgnoresSpacesOnlyThreshold() { Map parameters = new HashMap<>(); @@ -190,10 +502,9 @@ void test_setExpressionContainsRuleContainingRule() throws InvalidDataQualityRul assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); + List stringList = constructOperandsAsStringList(dqRule); assertEquals( - Collections.singletonList("ColumnValues in [ \"col-A\" ]"), - ((StringBasedCondition) dqRule.getCondition()).getOperands() - ); + Collections.singletonList("ColumnValues in [ \"col-A\" ]"), stringList); } @Test @@ -218,7 +529,8 @@ void test_setExpressionContainsItemContainingEscapedQuotes() throws InvalidDataQ assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); 
assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("a\"b", "c", "d\"e"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + assertEquals(Arrays.asList("a\"b", "c", "d\"e"), stringList); } @Test @@ -227,7 +539,8 @@ void test_setExpressionContainsItemContainingCommas() throws InvalidDataQualityR assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("a,,b", "c", "d,,,e"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + assertEquals(Arrays.asList("a,,b", "c", "d,,,e"), stringList); } @Test @@ -237,13 +550,22 @@ void test_serializationDeserializationWithExpressionFieldSet() assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass()); - assertEquals(Arrays.asList("A", "B"), ((StringBasedCondition) dqRule.getCondition()).getOperands()); + List stringList = constructOperandsAsStringList(dqRule); + assertEquals(Arrays.asList("A", "B"), stringList); byte[] serialized = serialize(dqRule); DQRule deserialized = deserialize(serialized, DQRule.class); assertEquals(dqRule.toString(), deserialized.toString()); assertEquals(StringBasedCondition.class, deserialized.getCondition().getClass()); } + private static List constructOperandsAsStringList(DQRule dqRule) { + List stringOperandsList = ((StringBasedCondition) dqRule.getCondition()).getOperands(); + List stringList = stringOperandsList.stream() + .map(StringOperand::getOperand) + .collect(Collectors.toList()); + return stringList; + } + @Test void test_serializationDeserializationWithNumericExpression() throws InvalidDataQualityRulesetException, IOException, ClassNotFoundException { @@ 
-251,12 +573,12 @@ void test_serializationDeserializationWithNumericExpression() assertEquals(1, dqRuleset.getRules().size()); DQRule dqRule = dqRuleset.getRules().get(0); assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); - assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4)); + assertTrue(((NumberBasedCondition) dqRule.getCondition()).evaluate(0.4, dqRule, testEvaluator)); byte[] serialized = serialize(dqRule); DQRule deserialized = deserialize(serialized, DQRule.class); assertEquals(dqRule.toString(), deserialized.toString()); assertEquals(NumberBasedCondition.class, deserialized.getCondition().getClass()); - assertFalse(((NumberBasedCondition) deserialized.getCondition()).evaluate(0.9)); + assertFalse(((NumberBasedCondition) deserialized.getCondition()).evaluate(0.9, dqRule, testEvaluator)); } @Test @@ -269,6 +591,343 @@ void test_compositeRulesAreReparseable() throws InvalidDataQualityRulesetExcepti assertEquals(reStringed, rulesetString); } + @Test + void test_constructorWithOriginalParameterMap() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + String whereClause = null; + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules, whereClause); + + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + 
assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + assertEquals(operator, rule.getOperator()); + assertTrue(rule.getNestedRules().isEmpty()); + } + + @Test + void test_parametersWithoutQuotesAreParsed() throws InvalidDataQualityRulesetException { + String colA = "colA"; + String colB = "col\\\"B"; + String colC = "col C"; + + String allCols = "AllColumns"; + + String rule1 = String.format("IsPrimaryKey %s \"%s\" \"%s\"", colA, colB, colC); + String rule2 = String.format("ColumnValues %s between 1 and 10", colA); + + String analyzer1 = String.format("Completeness \"%s\"", colC); + String analyzer2 = String.format("AllStatistics %s", allCols); + + String ruleset = String.format( + "Rules = [ %s, %s ] Analyzers = [ %s, %s ]", rule1, rule2, analyzer1, analyzer2); + + DQRuleset dqRuleset = parser.parse(ruleset); + + DQRule parsedRule1 = dqRuleset.getRules().get(0); + DQRule parsedRule2 = dqRuleset.getRules().get(1); + + DQAnalyzer parsedAnalyzer1 = dqRuleset.getAnalyzers().get(0); + DQAnalyzer parsedAnalyzer2 = dqRuleset.getAnalyzers().get(1); + + assertTrue(Stream.of(colA, colB, colC).allMatch(c -> parsedRule1.getParameters().containsValue(c))); + assertTrue(Stream.of(colA).allMatch(c -> parsedRule2.getParameters().containsValue(c))); + + assertTrue(Stream.of(colC).allMatch(c -> parsedAnalyzer1.getParameters().containsValue(c))); + assertTrue(Stream.of(allCols).allMatch(c -> parsedAnalyzer2.getParameters().containsValue(c))); + } + + @Test + public void test_equalsAndHashCode() throws InvalidDataQualityRulesetException { + String rule = "IsPrimaryKey \"colA\" \"colB\""; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + DQRuleset dqRuleset2 = parser.parse(ruleset); 
+ + assertNotSame(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); + } + + @Test + public void test_whereClause() throws InvalidDataQualityRulesetException { + String rule = "IsComplete \"colA\" where \"colB is NOT NULL\""; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + DQRuleset dqRuleset2 = parser.parse(ruleset); + + assertNotSame(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1, dqRuleset2); + assertEquals(dqRuleset1.hashCode(), dqRuleset2.hashCode()); + } + + @Test + public void test_whereClauseWithThreshold() throws InvalidDataQualityRulesetException { + String rule = "ColumnValues \"colA\" in [10,20] where \"colB is NOT NULL\" with threshold > 0.5"; + String ruleset = String.format("Rules = [ %s ]", rule); + + DQRuleset dqRuleset1 = parser.parse(ruleset); + + assertEquals(dqRuleset1.getRules().get(0).toString(), rule); + } + + @Test + void test_whereClauseRuleToStringFromRule() throws InvalidDataQualityRulesetException { + Map parameters = new HashMap<>(); + parameters.put("TargetColumn", "colA"); + DQRule dqRule = new DQRule("IsComplete", parameters, new Condition(""), null, + DQRuleLogicalOperator.AND, null, "colB is NOT NULL"); + String ruleString = "IsComplete \"colA\" where \"colB is NOT NULL\""; + assertEquals(dqRule.toString(), ruleString); + assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + + @Test + void test_whereClauseRuleToStringFromRuleWithThreshold() throws InvalidDataQualityRulesetException { + Map parameters = new HashMap<>(); + parameters.put("TargetColumn", "colA"); + DQRule dqRule = new DQRule("ColumnValues", parameters, new Condition("in [10,20]"), new Condition("> 0.5"), + DQRuleLogicalOperator.AND, null, "colB is NOT NULL"); + String ruleString = "ColumnValues \"colA\" in [10,20] where \"colB is NOT NULL\" with threshold > 0.5"; + assertEquals(dqRule.toString(), ruleString); + 
assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + + @Test + void test_whereClauseRuleToStringFromRuleset() throws InvalidDataQualityRulesetException { + String ruleString = "IsComplete \"colA\" where \"colB is NOT NULL\""; + String ruleset = String.format("Rules = [ %s ]", ruleString); + DQRuleset dqRuleset = parser.parse(ruleset); + DQRule dqRule = dqRuleset.getRules().get(0); + assertEquals(dqRule.toString(), ruleString); + assertEquals(dqRule.getWhereClause(), "colB is NOT NULL"); + } + + @Test + void test_whereClauseNeedsQuotedSQLStatement() { + String rule = "IsComplete \"colA\" where \"\""; + String ruleset = String.format("Rules = [ %s ]", rule); + assertThrows(InvalidDataQualityRulesetException.class, () -> parser.parse(ruleset)); + } + + @Test + void test_whereClauseCannotBeEmpty() { + String rule = "IsComplete \"colA\" where \"\""; + String ruleset = String.format("Rules = [ %s ]", rule); + assertThrows(InvalidDataQualityRulesetException.class, () -> parser.parse(ruleset)); + } + + @Test + void test_constructorWithWhereClause() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + String whereClause = "\"colB is NOT NULL\""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND; + List nestedRules = new ArrayList<>(); + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition, operator, nestedRules, whereClause); + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + 
assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + assertEquals(operator, rule.getOperator()); + assertTrue(rule.getNestedRules().isEmpty()); + assertEquals(rule.getWhereClause(), whereClause); + } + + @Test + void test_constructorWithParametersAndCondition() { + String ruleType = "IsComplete"; + String columnKey = "TargetColumn"; + String column = "colA"; + String emptyCondition = ""; + + Map parameters = new HashMap<>(); + parameters.put(columnKey, column); + + Condition condition = new Condition(emptyCondition); + Condition thresholdCondition = new Condition(emptyCondition); + + DQRule rule = new DQRule(ruleType, parameters, condition, thresholdCondition); + assertEquals(ruleType, rule.getRuleType()); + + assertTrue(rule.getParameters().containsKey(columnKey)); + assertEquals(column, rule.getParameters().get(columnKey)); + assertTrue(rule.getParameterValueMap().containsKey(columnKey)); + assertEquals(column, rule.getParameterValueMap().get(columnKey).getValue()); + assertTrue(rule.getParameterValueMap().get(columnKey).getConnectorWord().isEmpty()); + assertTrue(rule.getParameterValueMap().get(columnKey).isQuoted()); + assertTrue(rule.getCondition().getConditionAsString().isEmpty()); + assertTrue(rule.getThresholdCondition().getConditionAsString().isEmpty()); + } + + @Test + void test_modifyNestedRules() throws InvalidDataQualityRulesetException { + String rule1 = "IsComplete \"name\""; + String rule2 = "IsUnique \"name\""; + String rule3 = "IsPrimaryKey \"name\""; + String ruleset = String.format("Rules = [" + + "(%s) AND (%s)," + + "%s ]", rule1, rule2, rule3); + DQRuleset dqRuleset = parser.parse(ruleset); + + DQRule composite = dqRuleset.getRules().get(0); + + // Copy the list's elements into a new list, without 
copying the list itself + List nested = new ArrayList<>(composite.getNestedRules()); + nested.add(dqRuleset.getRules().get(1)); // IsComplete AND IsUnique AND IsPrimaryKey + + DQRule modified = composite.withNestedRules(nested); + + // The original rule hasn't been modified + assertEquals(composite.toString(), "(IsComplete \"name\") AND (IsUnique \"name\")"); + + // The modified rule includes all subrules + assertEquals(modified.toString(), "(IsComplete \"name\") AND (IsUnique \"name\") AND (IsPrimaryKey \"name\")"); + assertEquals(modified.getNestedRules().size(), 3); + } + + @Test + void test_withCondition() throws InvalidDataQualityRulesetException { + DQRuleset ruleset = parser.parse("Rules = [RowCount > 20, RowCount > 10 + 10]"); + + DQRule simple = ruleset.getRules().get(0); + DQRule dynamic = ruleset.getRules().get(1); + + Condition simplified = simple.getCondition(); + assertEquals(simplified.getFormattedCondition(), "> 20"); + + DQRule modified = dynamic.withCondition(simplified); + + // The original rule hasn't been modified + assertEquals(dynamic.toString(), "RowCount > 10 + 10"); + + // The modified rule uses the simplified condition + assertEquals(modified.toString(), "RowCount > 20"); + } + + @Test + void test_withCompositeRuleThatReachesMaxDepth() { + Map ruleIdToRuleMap = getStringStringMap(); + + /* + AND + Rule1 OR + Rule2 AND + Rule3 OR + Rule4 AND + Rule5 Rule6 <----- Depth = 5 which is OK + */ + String compositeRule = "(Rule1) AND ((Rule2) OR ((Rule3) AND ((Rule4) OR ((Rule5) AND (Rule6)))))"; // Template + int ruleCount = 6; + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + compositeRule = compositeRule.replace(ruleId, ruleIdToRuleMap.get(ruleId)); + } + + String rulesetString = String.format("Rules = [ %s ]", compositeRule); + + try { + DQRuleset ruleset = parser.parse(rulesetString); + assertEquals(1, ruleset.getRules().size()); + + DQRule actualCompositeRule = ruleset.getRules().get(0); + List 
nestedRules = actualCompositeRule.getNestedRulesAsFlattenedList(); + + assertEquals(compositeRule, actualCompositeRule.toString()); + assertEquals(6, nestedRules.size()); + + List nestedRulesAsStrings = nestedRules.stream().map(DQRule::toString).collect(Collectors.toList()); + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + assertTrue(nestedRulesAsStrings.contains(ruleIdToRuleMap.get(ruleId))); + } + } catch (InvalidDataQualityRulesetException e) { + fail("This rule that reaches max depth should have been successfully parsed"); + } + } + + @Test + void test_withCompositeRuleThatBreachesMaxDepth() { + Map ruleIdToRuleMap = getStringStringMap(); + /* + AND + Rule1 OR + Rule2 AND + Rule3 OR + Rule4 AND + Rule5 OR + Rule6 Rule 7 <----- Depth = 6 which is not OK + */ + String compositeRule = "(Rule1) AND ((Rule2) OR ((Rule3) OR ((Rule4) OR ((Rule5) AND ((Rule6) OR (Rule7))))))"; + int ruleCount = 7; + + for (int i = 1; i <= ruleCount; i++) { + String ruleId = String.format("Rule%s", i); + compositeRule = compositeRule.replace(ruleId, ruleIdToRuleMap.get(ruleId)); + } + + String rulesetString = String.format("Rules = [ %s ]", compositeRule); + + try { + parser.parse(rulesetString); + fail("This rule that breaches max depth should have failed to parse"); + } catch (InvalidDataQualityRulesetException e) { + assertTrue(e.getMessage().contains("Maximum nested expression depth")); + } + } + + private static Map getStringStringMap() { + Map ruleIdToRuleMap = new HashMap<>(); + ruleIdToRuleMap.put("Rule1", "RowCount > 1"); + ruleIdToRuleMap.put("Rule2", "Completeness of \"colA\" between 0.4 and 1.0"); + ruleIdToRuleMap.put("Rule3", "CustomSql \"select count(*) from primary\" = 10"); + ruleIdToRuleMap.put("Rule4", "ReferentialIntegrity of \"primary.colA\" and \"ref.colA\" = 0.9"); + ruleIdToRuleMap.put("Rule5", "IsUnique \"id\""); + ruleIdToRuleMap.put("Rule6", "ColumnNamesMatchPattern \"[a-zA-Z]*\""); + ruleIdToRuleMap.put("Rule7", 
"SchemaMatch \"ref\" between 0.8 and 0.9"); + return ruleIdToRuleMap; + } + @Disabled void test_nullParametersAreCorrectlyHandled() { Map parameters = null; @@ -303,4 +962,12 @@ private T deserialize(byte[] b, Class cls) throws IO Object o = objectStream.readObject(); return cls.cast(o); } + + private String removeQuotes(String quotedString) { + if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) { + quotedString = quotedString.substring(1); + quotedString = quotedString.substring(0, quotedString.length() - 1); + } + return quotedString; + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java index 5468a96..73df39d 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRulesetTest.java @@ -15,6 +15,11 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -181,6 +186,132 @@ void test_isPrimaryCheckWithMetadataAndSourcesAndNoPrimarySourceToString() { assertEquals(dqdlFormatted, dqRuleset.toString()); } + @Test + void test_isPrimaryCheckWithMetadataAndSourcesAndAnalyzers() { + String dqdl = "Metadata = { \"Version\": \"1.0\" }" + LINE_SEP + + "DataSources = { \"Primary\": \"orders-table\", \"AdditionalDataSources\": [ \"ref-table\" ] } " + LINE_SEP + + "Rules = [ IsPrimaryKey \"colA\" ] " + LINE_SEP + + "Analyzers = [ Completeness \"colA\" ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals("orders-table", dqRuleset.getPrimarySourceName()); + assertEquals(1, dqRuleset.getAdditionalDataSourcesNames().size()); + assertEquals("ref-table", 
dqRuleset.getAdditionalDataSourcesNames().get(0)); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("IsPrimaryKey", dqRuleset.getRules().get(0).getRuleType()); + assertEquals(1, dqRuleset.getAnalyzers().size()); + assertEquals("Completeness", dqRuleset.getAnalyzers().get(0).getRuleType()); + + String dqdlFormatted = + "Metadata = {" + LINE_SEP + + " \"Version\": \"1.0\"" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "DataSources = {" + LINE_SEP + + " \"Primary\": \"orders-table\"," + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "Rules = [" + LINE_SEP + + " IsPrimaryKey \"colA\"" + LINE_SEP + + "]" + LINE_SEP + LINE_SEP + + "Analyzers = [" + LINE_SEP + + " Completeness \"colA\"" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + + @Test + void test_rulesetWithAnalyzersAndEmptyOrMissingRules() { + String dqdl1 = "Analyzers = [ RowCount, Completeness of \"col-A\" ]"; + String dqdl2 = "Rules = [] Analyzers = [ RowCount, Completeness of \"col-A\" ]"; + + Arrays.asList(dqdl1, dqdl2).forEach(dqdl -> { + DQRuleset ruleset = parseDQDL(dqdl); + List dqRules = ruleset.getRules(); + List dqAnalyzers = ruleset.getAnalyzers(); + + assertEquals(0, dqRules.size()); + assertEquals(2, dqAnalyzers.size()); + assertEquals("RowCount", dqAnalyzers.get(0).getRuleType()); + assertEquals(0, dqAnalyzers.get(0).getParameterValueMap().size()); + assertEquals("Completeness", dqAnalyzers.get(1).getRuleType()); + assertTrue(dqAnalyzers.get(1).getParameterValueMap().containsKey("TargetColumn")); + assertEquals("col-A", dqAnalyzers.get(1).getParameterValueMap().get("TargetColumn").getValue()); + }); + } + + @Test + void test_rulesetWithEmptyAnalyzersAndEmptyRules() { + Arrays.asList( + "Rules = []", + "Analyzers = []", + "Rules = [] Analyzers = []" + ).forEach(ruleset -> { + try { + dqdlParser.parse(ruleset); + fail("Ruleset parsing should have failed"); + } catch 
(InvalidDataQualityRulesetException e) { + System.out.println(e.getMessage()); + assertTrue(e.getMessage().contains("No rules or analyzers provided")); + } + }); + } + + @Test + void test_rulesetWithMetadataAndSourcesAndAnalyzersAndNoRules() { + String dqdl = + "Metadata = { \"Version\": \"1.0\" }" + LINE_SEP + + "DataSources = {" + + " \"Primary\": \"orders-table\", " + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + + "Analyzers = [ RowCount, Completeness \"colA\", Uniqueness of col_A ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals("orders-table", dqRuleset.getPrimarySourceName()); + assertEquals(1, dqRuleset.getAdditionalDataSourcesNames().size()); + assertEquals("ref-table", dqRuleset.getAdditionalDataSourcesNames().get(0)); + assertEquals(0, dqRuleset.getRules().size()); + assertEquals(3, dqRuleset.getAnalyzers().size()); + assertEquals("RowCount", dqRuleset.getAnalyzers().get(0).getRuleType()); + assertEquals("Completeness", dqRuleset.getAnalyzers().get(1).getRuleType()); + assertEquals("Uniqueness", dqRuleset.getAnalyzers().get(2).getRuleType()); + + String dqdlFormatted = + "Metadata = {" + LINE_SEP + + " \"Version\": \"1.0\"" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "DataSources = {" + LINE_SEP + + " \"Primary\": \"orders-table\"," + LINE_SEP + + " \"AdditionalDataSources\": [ \"ref-table\" ]" + LINE_SEP + + "}" + LINE_SEP + LINE_SEP + + "Analyzers = [" + LINE_SEP + + " RowCount," + LINE_SEP + + " Completeness \"colA\"," + LINE_SEP + + " Uniqueness of col_A" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + + @Test + void test_rulesetWithAnalyzersAndNoRules() { + String dqdl = "Analyzers = [ Completeness \"colA\", AllStatistics of AllColumns, Uniqueness of \"col_A\" ]"; + DQRuleset dqRuleset = parseDQDL(dqdl); + + List analyzers = dqRuleset.getAnalyzers(); + assertEquals(3, analyzers.size()); + assertEquals("Completeness", analyzers.get(0).getRuleType()); + 
assertEquals("AllStatistics", analyzers.get(1).getRuleType()); + assertEquals("Uniqueness", analyzers.get(2).getRuleType()); + + String dqdlFormatted = + "Analyzers = [" + LINE_SEP + + " Completeness \"colA\"," + LINE_SEP + + " AllStatistics of AllColumns," + LINE_SEP + + " Uniqueness of \"col_A\"" + LINE_SEP + + "]"; + assertEquals(dqdlFormatted, dqRuleset.toString()); + } + @Disabled void test_jobStatusRuleWithEqualityCheck() { String dqdl = "Rules = [ JobStatus = \"SUCCEEDED\" ]"; @@ -321,6 +452,120 @@ void test_invalidRulesetThrowsException() { } } + @Test + public void testStringVariableResolvedCorrectly() { + String dqdlWithVariable = + "locationVariable = [\"YYZ14\", \"b\", \"c\"]\n" + + "Rules = [ ColumnValues \"Location-id\" in $locationVariable ]"; + String dqdlWithoutVariable = "Rules = [ ColumnValues \"Location-id\" in [\"YYZ14\", \"b\", \"c\"] ]"; + String ruleWithVariable = "ColumnValues \"Location-id\" in $locationVariable"; + String ruleWithoutVariable = "ColumnValues \"Location-id\" in [\"YYZ14\",\"b\",\"c\"]"; + String rulesWithVariable = "Rules = [\n ColumnValues \"Location-id\" in $locationVariable\n]"; + String rulesWithoutVariable = "Rules = [\n ColumnValues \"Location-id\" in [\"YYZ14\",\"b\",\"c\"]\n]"; + + DQRuleset dqRulesetWithVariable = parseDQDL(dqdlWithVariable); + DQRuleset dqRulesetWithoutVariable = parseDQDL(dqdlWithoutVariable); + assertEquals(rulesWithVariable, dqRulesetWithVariable.toString()); + assertEquals(rulesWithoutVariable, dqRulesetWithoutVariable.toString()); + assertEquals(dqRulesetWithoutVariable.getRules().size(), dqRulesetWithVariable.getRules().size()); + assertEquals(ruleWithVariable, + dqRulesetWithVariable.getRules().get(0).toString()); + assertEquals(ruleWithoutVariable, + dqRulesetWithoutVariable.getRules().get(0).toString()); + } + + @Test + public void testStringArrayVariable() { + String dqdl = + "str_arr = [\"a\", \"b\", \"c\"]\n" + + "Rules = [ ColumnValues \"order-id\" in $str_arr ]"; + + DQRuleset 
dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"order-id\" in $str_arr", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testMultipleRulesWithStringArrayVariable() { + String dqdl = + "codes = [\"A1\", \"B2\", \"C3\"]\n" + + "statuses = [\"active\", \"pending\", \"inactive\"]\n" + + "Rules = [\n" + + " ColumnValues \"product_code\" in $codes,\n" + + " ColumnValues \"status\" in $statuses\n" + + "]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(2, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" in $codes", + dqRuleset.getRules().get(0).toString()); + assertEquals("ColumnValues \"status\" in $statuses", + dqRuleset.getRules().get(1).toString()); + } + + @Test + public void testStringArrayVariableWithNotIn() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in $invalid_codes ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in $invalid_codes", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testUnusedVariable() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in [\"A1\", \"B2\", \"C3\"] ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in [\"A1\",\"B2\",\"C3\"]", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testMultipleVariableDefinitionsOnlyOneUsed() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "invalid_codes1 = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in [\"A1\", \"B2\", \"C3\"] ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in 
[\"A1\",\"B2\",\"C3\"]", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testVariableDefinitionMissing() { + String dqdl = + "invalid_codes = [\"X1\", \"Y2\", \"Z3\"]\n" + + "invalid_codes1 = [\"X1\", \"Y2\", \"Z3\"]\n" + + "Rules = [ ColumnValues \"product_code\" not in $invalid_codes2 ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("ColumnValues \"product_code\" not in $invalid_codes2", + dqRuleset.getRules().get(0).toString()); + } + + @Test + public void testStringVariable() { + String dqdl = + "sqlString = \"select id from primary where age < 100\"\n" + + "Rules = [ CustomSql \"select id from primary where age < 100\" ]"; + + DQRuleset dqRuleset = parseDQDL(dqdl); + assertEquals(1, dqRuleset.getRules().size()); + assertEquals("CustomSql \"select id from primary where age < 100\"", + dqRuleset.getRules().get(0).toString()); + } + @Test void test_multipleRules() { String dqdl = diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java new file mode 100644 index 0000000..db85392 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariableTest.java @@ -0,0 +1,86 @@ +/* + * DQVariableTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model; + +import org.junit.jupiter.api.Test; +import java.time.Duration; +import java.time.LocalDate; +import java.util.Arrays; +import java.util.List; +import static org.junit.jupiter.api.Assertions.*; + +class DQVariableTest { + + @Test + void testConstructorAndGetters() { + DQVariable intVar = new DQVariable<>("age", DQVariable.VariableType.NUMBER, 30); + assertEquals("age", intVar.getName()); + assertEquals(DQVariable.VariableType.NUMBER, intVar.getType()); + assertEquals(30, intVar.getValue()); + } + + @Test + void testEqualsAndHashCode() { + DQVariable var1 = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + DQVariable var2 = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + DQVariable var3 = new DQVariable<>("name", DQVariable.VariableType.STRING, "Jane"); + + assertEquals(var1, var2); + assertNotEquals(var1, var3); + assertEquals(var1.hashCode(), var2.hashCode()); + } + + @Test + void testToStringForNumber() { + DQVariable intVar = new DQVariable<>("age", DQVariable.VariableType.NUMBER, 30); + assertEquals("age = 30", intVar.toString()); + } + + @Test + void testToStringForString() { + DQVariable stringVar = new DQVariable<>("name", DQVariable.VariableType.STRING, "John"); + assertEquals("name = \"John\"", stringVar.toString()); + } + + @Test + void testToStringForDate() { + LocalDate date = LocalDate.of(2023, 5, 15); + DQVariable dateVar = new DQVariable<>("birthdate", DQVariable.VariableType.DATE, date); + assertEquals("birthdate = 2023-05-15", dateVar.toString()); + } + + @Test + void testToStringForDuration() { + Duration duration = Duration.ofHours(2); + DQVariable durationVar = new DQVariable<>("timeSpent", DQVariable.VariableType.DURATION, duration); + assertEquals("timeSpent = PT2H", durationVar.toString()); + } + + @Test + void testToStringForNumberArray() { + List numbers = Arrays.asList(1, 2, 3); + DQVariable> arrayVar = new DQVariable<>("numbers", 
DQVariable.VariableType.NUMBER_ARRAY, numbers); + assertEquals("numbers = [1, 2, 3]", arrayVar.toString()); + } + + @Test + void testToStringForStringArray() { + List names = Arrays.asList("John", "Jane", "Doe"); + DQVariable> arrayVar = new DQVariable<>("names", DQVariable.VariableType.STRING_ARRAY, names); + assertEquals("names = [John, Jane, Doe]", arrayVar.toString()); + } + + @Test + void testToStringForNullValue() { + DQVariable nullVar = new DQVariable<>("nullValue", DQVariable.VariableType.STRING, null); + assertEquals("nullValue = null", nullVar.toString()); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java index d92079e..1b4bfb1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DeserializationTest.java @@ -71,11 +71,12 @@ public void test_parseRuleParameterWithoutIsVarArg() throws JsonProcessingExcept } @Test - public void test_parseDQRuleType() throws JsonProcessingException { + public void test_parseDQRuleTypeWithThresholdAndRowLevelFlags() throws JsonProcessingException { String ruleTypeName = "DatasetMatch"; String ruleTypeDesc = "This rule matches two datasets"; String returnType = "STRING"; boolean isThresholdSupported = true; + boolean isExcludedAtRowLevelInCompositeRules = true; // Parameter 1 String param1Type = "String"; @@ -98,15 +99,67 @@ public void test_parseDQRuleType() throws JsonProcessingException { "\"description\":\"%s\"," + "\"parameters\": [ %s, %s ]," + "\"return_type\": \"%s\"," + - "\"is_threshold_supported\": %s" + + "\"is_threshold_supported\": \"%s\"," + + "\"is_excluded_at_row_level_in_composite_rules\": %s" + "}", - ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported); + ruleTypeName, ruleTypeDesc, param1Json, param2Json, returnType, isThresholdSupported, + 
isExcludedAtRowLevelInCompositeRules); DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); assertEquals(ruleTypeName, ruleType.getRuleTypeName()); assertEquals(ruleTypeDesc, ruleType.getDescription()); assertEquals(returnType, ruleType.getReturnType()); assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); + assertEquals(isExcludedAtRowLevelInCompositeRules, ruleType.isExcludedAtRowLevelInCompositeRules()); + + DQRuleParameter param1 = ruleType.getParameters().get(0); + assertEquals(param1Type, param1.getType()); + assertEquals(param1Name, param1.getName()); + assertEquals(param1Desc, param1.getDescription()); + + DQRuleParameter param2 = ruleType.getParameters().get(1); + assertEquals(param2Type, param2.getType()); + assertEquals(param2Name, param2.getName()); + assertEquals(param2Desc, param2.getDescription()); + } + + @Test + public void test_parseDQRuleTypeWithNoThresholdAndRowLevelFlags() throws JsonProcessingException { + String ruleTypeName = "DatasetMatch"; + String ruleTypeDesc = "This rule matches two datasets"; + String returnType = "STRING"; + boolean isThresholdSupported = false; + boolean isExcludedAtRowLevelInCompositeRules = false; + + // Parameter 1 + String param1Type = "String"; + String param1Name = "PrimaryDatasetAlias"; + String param1Desc = "This is the primary dataset alias"; + String param1Json = String.format( + "{\"type\":\"%s\",\"name\":\"%s\",\"description\":\"%s\"}", param1Type, param1Name, param1Desc); + + // Parameter2 + String param2Type = "String"; + String param2Name = "ReferenceDatasetAlias"; + String param2Desc = "This is the reference dataset alias"; + + String param2Json = String.format( + "{\"type\":\"%s\",\"name\":\"%s\",\"description\":\"%s\"}", param2Type, param2Name, param2Desc); + + String json = String.format( + "{" + + "\"rule_type_name\":\"%s\"," + + "\"description\":\"%s\"," + + "\"parameters\": [ %s, %s ]," + + "\"return_type\": \"%s\"" + + "}", ruleTypeName, ruleTypeDesc, 
param1Json, param2Json, returnType); + + DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); + assertEquals(ruleTypeName, ruleType.getRuleTypeName()); + assertEquals(ruleTypeDesc, ruleType.getDescription()); + assertEquals(returnType, ruleType.getReturnType()); + assertEquals(isThresholdSupported, ruleType.isThresholdSupported()); + assertEquals(isExcludedAtRowLevelInCompositeRules, ruleType.isExcludedAtRowLevelInCompositeRules()); DQRuleParameter param1 = ruleType.getParameters().get(0); assertEquals(param1Type, param1.getType()); @@ -179,4 +232,30 @@ public void test_parseDQRuleTypeWithMultipleParametersAndIncorrectVarArgParamete assertEquals(IllegalArgumentException.class, thrown.getCause().getClass()); assertTrue(thrown.getMessage().contains("Property isVarArg can only be set to true on last element in parameters list")); } + + @Test + public void test_parseDQRuleTypeScope() throws JsonProcessingException { + String ruleTypeName = "ColumnCount"; + String ruleTypeDesc = "This rule checks the column count"; + String returnType = "NUMBER"; + String scope = "table"; + + String json = String.format( + "{" + + "\"rule_type_name\":\"%s\"," + + "\"description\":\"%s\"," + + "\"parameters\": [ ]," + + "\"return_type\": \"%s\"," + + "\"scope\": \"%s\"" + + "}", + ruleTypeName, ruleTypeDesc, returnType, scope); + + DQRuleType ruleType = new ObjectMapper().readValue(json, DQRuleType.class); + + assertEquals(ruleTypeName, ruleType.getRuleTypeName()); + assertEquals(ruleTypeDesc, ruleType.getDescription()); + assertEquals(returnType, ruleType.getReturnType()); + assertEquals(scope, ruleType.getScope()); + assertTrue(ruleType.getParameters().isEmpty()); + } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java index 0c51b4e..fc781f6 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java +++ 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/ConditionTest.java @@ -23,6 +23,7 @@ import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -34,32 +35,66 @@ public class ConditionTest { private static Stream provideRulesWithNumberBasedConditions() { return Stream.of( Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.5, true), + Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.4, false), Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.3, false), + Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.9, false), Arguments.of("Completeness \"colA\" between 0.4 and 0.9", 0.91, false), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.5, false), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.4, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.3, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.9, true), + Arguments.of("Completeness \"colA\" not between 0.4 and 0.9", 0.91, true), + Arguments.of("Completeness \"colA\" not in [0.4, 0.9]", 0.91, true), Arguments.of("ColumnCorrelation \"colA\" \"colB\" between -0.2 and 1.0", 0.9, true), Arguments.of("ColumnCorrelation \"colA\" \"colB\" between -0.2 and 1.0", -0.19, true), Arguments.of("ColumnCorrelation \"colA\" \"colB\" between -0.2 and 1.0", -0.2001, false), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.2, true), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.19, false), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not between -0.2 and 1.0", -0.21, true), + Arguments.of("ColumnCorrelation \"colA\" \"colB\" not in [-0.2, 1.0]", -0.2001, true), + 
Arguments.of("ColumnLength \"colA\" in [1, 2, 3]", 4.0, false), + Arguments.of("ColumnLength \"colA\" not in [1, 2, 3]", 4.0, true), + Arguments.of("ColumnLength \"colA\" in [1, 2, 3, 4]", 4.0, true), + Arguments.of("ColumnLength \"colA\" not in [1, 2, 3, 4]", 4.0, false), + Arguments.of("ColumnValues \"colA\" in [1, 2, 3, 4]", 3.999999, true), + Arguments.of("ColumnValues \"colA\" in [1, 2, 3, 4]", 3.999, false), Arguments.of("Completeness \"colA\" >= 0.4", 0.4, true), Arguments.of("Completeness \"colA\" >= 0.4", 0.39, false), Arguments.of("Completeness \"colA\" >= 0.4", 1.0, true), + Arguments.of("DatasetMatch \"reference\" \"colA\" not between 0.1 and 1.0", 1.0, true), + Arguments.of("DistinctValuesCount \"colA\" not between 0.1 and 1.0", 1.0, true), + Arguments.of("Entropy \"colA\" <= 0.678", 0.679, false), + Arguments.of("Entropy \"colA\" <= 0.678", 0.677, true), + Arguments.of("Entropy \"colA\" <= 0.678", -0.1, true), + Arguments.of("Entropy \"colA\" != 0.678", 0.678, false), + Arguments.of("Mean \"colA\" != 10.0", 10.0, false), + Arguments.of("RowCount != 10.0", 10.0, false), + Arguments.of("Mean \"colA\" != 10.0", 10.0, false), + Arguments.of("StandardDeviation \"colA\" = 10.0", 10.0, true), + Arguments.of("StandardDeviation \"colA\" = -10000.0", -10000.0, true), + Arguments.of("StandardDeviation \"colA\" = 99.34", 99.35, false), + Arguments.of("StandardDeviation \"colA\" != 10.0", 10.0, false), + Arguments.of("StandardDeviation \"colA\" != -10000.0", -10000.0, false), + Arguments.of("StandardDeviation \"colA\" != 99.34", 99.35, true), + Arguments.of("Sum \"colA\" not in [5.0, 10.0]", 10.0, false), Arguments.of("Uniqueness \"colA\" > 0.4", 0.41, true), Arguments.of("Uniqueness \"colA\" > 0.4", 0.4, false), Arguments.of("Uniqueness \"colA\" > 0.4", -0.4, false), + Arguments.of("Uniqueness \"colA\" != 0.4", -0.4, true), + Arguments.of("Uniqueness \"colA\" not between 0.1 and 0.5", 0.5, true), + Arguments.of("Uniqueness \"colA\" not in [0.1, 0.1, 0.5]", 0.3, 
true), Arguments.of("UniqueValueRatio \"colA\" < -0.4", 100.9, false), Arguments.of("UniqueValueRatio \"colA\" < -0.4", -0.5, true), Arguments.of("UniqueValueRatio \"colA\" < -0.4", -0.41, true), - Arguments.of("Entropy \"colA\" <= 0.678", 0.679, false), - Arguments.of("Entropy \"colA\" <= 0.678", 0.677, true), - Arguments.of("Entropy \"colA\" <= 0.678", -0.1, true), - Arguments.of("StandardDeviation \"colA\" = 10.0", 10.0, true), - Arguments.of("StandardDeviation \"colA\" = -10000.0", -10000.0, true), - Arguments.of("StandardDeviation \"colA\" = 99.34", 99.35, false) + Arguments.of("UniqueValueRatio \"colA\" not between -0.5 and -0.4", -0.41, false), + Arguments.of("UniqueValueRatio \"colA\" not between -0.4 and -0.5", -0.41, true) ); } private static Stream provideRulesWithNumberBasedThresholdConditions() { return Stream.of( Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold between 0.4 and 0.9", 0.5, true), + Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold not between 0.4 and 0.9", 0.5, false), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold > 0.6", 0.59, false), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold >= 0.5", 0.5, true), Arguments.of("ColumnValues \"colA\" in [ \"A\", \"B\"] with threshold < 0.333", 0.334, false), @@ -71,6 +106,7 @@ private static Stream provideRulesWithNumberBasedThresholdConditions( Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold < 0.333", 0.332, true), Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold <= 0.333", 0.3, true), Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold = 0.2", 0.2, true), + Arguments.of("ColumnValues \"colA\" matches \"[a-zA-Z]\" with threshold != 0.2", 0.2, false), Arguments.of("ColumnValues \"Customer_ID\" in [1,2,3,4,5,6,7,8,9] with threshold > 0.98", 0.979, false) ); } @@ -79,29 +115,35 @@ private static Stream provideRulesWithDateBasedThresholdConditions() 
return Stream.of( // With static dates Arguments.of("ColumnValues \"colA\" in [ \"2022-01-01\", \"2022-12-31\" ]"), + Arguments.of("ColumnValues \"colA\" not in [ \"2022-01-01\", \"2022-12-31\" ]"), Arguments.of("ColumnValues \"colA\" >= \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" > \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" <= \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" < \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" between \"2022-01-01\" and \"2022-12-31\""), + Arguments.of("ColumnValues \"colA\" not between \"2022-01-01\" and \"2022-12-31\""), // With dynamic expressions Arguments.of("ColumnValues \"colA\" in [ (now() - 14 days), (now() - 7 days), \"2022-01-01\" ]"), + Arguments.of("ColumnValues \"colA\" not in [ (now() - 14 days), (now() - 7 days), \"2022-01-01\" ]"), Arguments.of("ColumnValues \"colA\" >= now() ]"), Arguments.of("ColumnValues \"colA\" > (now() - 12 hours) ]"), Arguments.of("ColumnValues \"colA\" <= (now() + 3 days) ]"), Arguments.of("ColumnValues \"colA\" < (now() + 72 hours) ]"), - Arguments.of("ColumnValues \"colA\" between (now() - 14 days) and now()") + Arguments.of("ColumnValues \"colA\" between (now() - 14 days) and now()"), + Arguments.of("ColumnValues \"colA\" not between (now() - 14 days) and now()") ); } private static Stream provideRulesWithDurationBasedThresholdConditions() { return Stream.of( Arguments.of("DataFreshness \"colA\" in [ 3 hours, 12 hours, 1 days ]"), + Arguments.of("DataFreshness \"colA\" not in [ 3 hours, 12 hours, 1 days ]"), Arguments.of("DataFreshness \"colA\" >= 12 hours"), Arguments.of("DataFreshness \"colA\" > 2 days"), Arguments.of("DataFreshness \"colA\" <= 2 hours"), Arguments.of("DataFreshness \"colA\" < 6 hours"), - Arguments.of("DataFreshness \"colA\" between 6 hours and 12 hours") + Arguments.of("DataFreshness \"colA\" between 6 hours and 12 hours"), + Arguments.of("DataFreshness \"colA\" not between 6 hours and 12 hours") ); } @@ -117,7 +159,7 @@ void 
test_ruleParsingAndVerifyingNumberBasedCondition(String rule, Double metric NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); assertTrue(dqRule.toString().contains(condition.getFormattedCondition())); - assertEquals(shouldRulePass, condition.evaluate(metric)); + assertEquals(shouldRulePass, condition.evaluate(metric, dqRule, testEvaluator)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } @@ -136,7 +178,7 @@ void test_ruleParsingAndVerifyingNumberBasedThresholdCondition(String rule, Doub NumberBasedCondition thresholdCondition = (NumberBasedCondition) dqRule.getThresholdCondition(); assertTrue(dqRule.toString().contains(thresholdCondition.getFormattedCondition())); - assertEquals(shouldRulePass, thresholdCondition.evaluate(metric)); + assertEquals(shouldRulePass, thresholdCondition.evaluate(metric, dqRule, testEvaluator)); } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } @@ -155,6 +197,14 @@ void test_ruleParsingAndVerifyingDateBasedCondition(String rule) { DateBasedCondition condition = (DateBasedCondition) dqRule.getCondition(); assertTrue(dqRule.toString().contains(condition.getFormattedCondition())); + + try { + condition.evaluate(0.0, dqRule, testEvaluator); + fail("Expected date condition to throw UnsupportedOperationException"); + } catch (UnsupportedOperationException e) { + // pass + } + } catch (InvalidDataQualityRulesetException e) { fail(e.getMessage()); } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java index e5b332d..fd4af81 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionTest.java @@ -12,6 +12,7 @@ import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration; import 
com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit; +import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -20,6 +21,7 @@ import java.util.Collections; import java.util.stream.Stream; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.NULL; import static org.junit.jupiter.api.Assertions.assertEquals; public class DateBasedConditionTest { @@ -33,8 +35,20 @@ private static Stream provideDateBasedConditionsWithExpectedFormatted new DateExpression.StaticDate("2023-01-01"), new DateExpression.StaticDate("2023-12-31") ) ), + "between \"2023-01-01\" and \"2023-12-31\"", "between \"2023-01-01\" and \"2023-12-31\"" ), + Arguments.of( + new DateBasedCondition( + "notbetween\"2023-01-01\"and\"2023-12-31\"", + DateBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), new DateExpression.StaticDate("2023-12-31") + ) + ), + "not between \"2023-01-01\" and \"2023-12-31\"", + "not between \"2023-01-01\" and \"2023-12-31\"" + ), Arguments.of( new DateBasedCondition( "between(now()-4days)and(now()+72hours)", @@ -48,14 +62,32 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ) ), + "between (now() - 4 days) and (now() + 72 hours)", "between (now() - 4 days) and (now() + 72 hours)" ), + Arguments.of( + new DateBasedCondition( + "notbetween(now()-4days)and(now()+72hours)", + DateBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.MINUS,new Duration(4, DurationUnit.DAYS) + ), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) + ) + ) + ), + "not between (now() - 4 days) and (now() + 72 hours)", + 
"not between (now() - 4 days) and (now() + 72 hours)" + ), Arguments.of( new DateBasedCondition( ">\"2023-01-01\"", DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "> \"2023-01-01\"", "> \"2023-01-01\"" ), Arguments.of( @@ -64,6 +96,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(new DateExpression.CurrentDate()) ), + "> now()", "> now()" ), Arguments.of( @@ -72,6 +105,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) DateBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + ">= \"2023-01-01\"", ">= \"2023-01-01\"" ), Arguments.of( @@ -84,6 +118,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) ) ) ), + ">= (now() - 2 days)", ">= (now() - 2 days)" ), Arguments.of( @@ -92,6 +127,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN, Collections.singletonList(new DateExpression.CurrentDate()) ), + "< now()", "< now()" ), Arguments.of( @@ -104,6 +140,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) ) ) ), + "< (now() + 100 days)", "< (now() + 100 days)" ), Arguments.of( @@ -112,6 +149,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "<= \"2023-01-01\"", "<= \"2023-01-01\"" ), Arguments.of( @@ -120,6 +158,7 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new DateExpression.CurrentDate()) ), + "<= now()", "<= now()" ), Arguments.of( @@ -128,8 +167,18 @@ DateExpression.DateExpressionOperator.PLUS, 
new Duration(100, DurationUnit.DAYS) DateBasedConditionOperator.EQUALS, Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) ), + "= \"2023-01-01\"", "= \"2023-01-01\"" ), + Arguments.of( + new DateBasedCondition( + "!=\"2023-01-01\"", + DateBasedConditionOperator.NOT_EQUALS, + Collections.singletonList(new DateExpression.StaticDate("2023-01-01")) + ), + "!= \"2023-01-01\"", + "!= \"2023-01-01\"" + ), Arguments.of( new DateBasedCondition( ">=(now()-2days)", @@ -140,6 +189,7 @@ DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) ) ) ), + "= (now() - 2 days)", "= (now() - 2 days)" ), Arguments.of( @@ -157,7 +207,68 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) ) ) ), - "in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]" + "in [\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]", + "in [\"2023-01-01\",(now() + 72 hours),(now() - 2 days),now()]" + ), + Arguments.of( + new DateBasedCondition( + "in[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + DateBasedConditionOperator.IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new DateExpression.StaticDate("2020-01-01") + ) + ), + "in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + "in [\"2020-01-01\",\"2021-01-01\",\"2022-01-01\",\"2023-01-01\"]" + ), + Arguments.of( + new DateBasedCondition( + "notin[\"2023-01-01\",now(),(now()-2days),(now()+72hours)]", + DateBasedConditionOperator.NOT_IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.CurrentDate(), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.MINUS, new Duration(2, DurationUnit.DAYS) + ), + new DateExpression.CurrentDateExpression( + DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) + ) + ) + ), + "not in 
[\"2023-01-01\",now(),(now() - 2 days),(now() + 72 hours)]", + "not in [\"2023-01-01\",(now() + 72 hours),(now() - 2 days),now()]" + ), + Arguments.of( + new DateBasedCondition( + "notin[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + DateBasedConditionOperator.NOT_IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new DateExpression.StaticDate("2020-01-01") + ) + ), + "not in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",\"2020-01-01\"]", + "not in [\"2020-01-01\",\"2021-01-01\",\"2022-01-01\",\"2023-01-01\"]" + ), + Arguments.of( + new DateBasedCondition( + "in[\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",NULL]", + DateBasedConditionOperator.IN, + Arrays.asList( + new DateExpression.StaticDate("2023-01-01"), + new DateExpression.StaticDate("2022-01-01"), + new DateExpression.StaticDate("2021-01-01"), + new NullDateExpression() + ) + ), + "in [\"2023-01-01\",\"2022-01-01\",\"2021-01-01\",NULL]", + "in [\"2021-01-01\",\"2022-01-01\",\"2023-01-01\",NULL]" ) ); } @@ -165,7 +276,9 @@ DateExpression.DateExpressionOperator.PLUS, new Duration(72, DurationUnit.HOURS) @ParameterizedTest @MethodSource("provideDateBasedConditionsWithExpectedFormattedStrings") public void test_correctlyFormatsDuration(DateBasedCondition condition, - String expectedFormattedString) { + String expectedFormattedString, + String expectedSortedFormattedString) { assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java index 71e6c30..aa8a1b1 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java +++ 
b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpressionTest.java @@ -15,6 +15,7 @@ import org.junit.jupiter.api.Test; import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.time.temporal.ChronoUnit; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -44,7 +45,7 @@ public void test_currentDateFormattedExpression() { @Test public void test_currentDateEvaluatedExpression() { DateExpression.CurrentDate currentDate = new DateExpression.CurrentDate(); - LocalDateTime dt = LocalDateTime.now(); + LocalDateTime dt = LocalDateTime.now(ZoneOffset.UTC); assertEquals( dt.toString().substring(0, 10), currentDate.getEvaluatedExpression().toString().substring(0, 10) @@ -70,6 +71,26 @@ public void test_currentDateExpressionFormattedExpression() { ); } + @Test + public void test_currentDateExpressionEvaluatedExpressionForMinutes() { + DurationUnit unit = DurationUnit.MINUTES; + int amount = 24; + Duration duration = new Duration(amount, unit); + + DateExpression.DateExpressionOperator operator = + DateExpression.DateExpressionOperator.PLUS; + + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC).withSecond(0).withNano(0); + DateExpression.CurrentDateExpression currentDateExpression = + new DateExpression.CurrentDateExpression(operator, duration); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertEquals(amount, minutesDiff); + } + @Test public void test_currentDateExpressionEvaluatedExpressionForHours() { DurationUnit unit = DurationUnit.HOURS; @@ -79,7 +100,7 @@ public void test_currentDateExpressionEvaluatedExpressionForHours() { DateExpression.DateExpressionOperator operator = DateExpression.DateExpressionOperator.PLUS; - LocalDateTime currentDate = LocalDateTime.now(); + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC).withMinute(0).withSecond(0).withNano(0); DateExpression.CurrentDateExpression currentDateExpression = new 
DateExpression.CurrentDateExpression(operator, duration); @@ -88,6 +109,12 @@ public void test_currentDateExpressionEvaluatedExpressionForHours() { ); assertEquals(amount, hoursDiff); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertEquals(amount * 60, minutesDiff); } @Test @@ -99,7 +126,7 @@ public void test_currentDateExpressionEvaluatedExpressionForDays() { DateExpression.DateExpressionOperator operator = DateExpression.DateExpressionOperator.MINUS; - LocalDateTime currentDate = LocalDateTime.now(); + LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.UTC); DateExpression.CurrentDateExpression currentDateExpression = new DateExpression.CurrentDateExpression(operator, duration); @@ -108,5 +135,11 @@ public void test_currentDateExpressionEvaluatedExpressionForDays() { ); assertTrue(amount * 24 + hoursDiff <= 1); + + long minutesDiff = ChronoUnit.MINUTES.between( + currentDate, currentDateExpression.getEvaluatedExpression() + ); + + assertTrue(amount * 24 * 60 + minutesDiff <= 1); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java index 98d467e..24cd7c7 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionTest.java @@ -32,14 +32,28 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(4, DurationUnit.DAYS) ) ), + "between 3 hours and 4 days", "between 3 hours and 4 days" ), + Arguments.of( + new DurationBasedCondition( + "notbetween3hoursand4days", + DurationBasedConditionOperator.NOT_BETWEEN, + Arrays.asList( + new Duration(3, DurationUnit.HOURS), + new Duration(4, DurationUnit.DAYS) + ) + ), + "not between 3 hours and 4 days", + 
"not between 3 hours and 4 days" + ), Arguments.of( new DurationBasedCondition( ">256hours", DurationBasedConditionOperator.GREATER_THAN, Collections.singletonList(new Duration(256, DurationUnit.HOURS)) ), + "> 256 hours", "> 256 hours" ), Arguments.of( @@ -48,6 +62,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(new Duration(2, DurationUnit.DAYS)) ), + ">= 2 days", ">= 2 days" ), Arguments.of( @@ -56,6 +71,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.LESS_THAN, Collections.singletonList(new Duration(25000, DurationUnit.HOURS)) ), + "< 25000 hours", "< 25000 hours" ), Arguments.of( @@ -64,6 +80,7 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(new Duration(24, DurationUnit.DAYS)) ), + "<= 24 days", "<= 24 days" ), Arguments.of( @@ -72,8 +89,18 @@ private static Stream provideDurationConditionsWithExpectedFormattedS DurationBasedConditionOperator.EQUALS, Collections.singletonList(new Duration(10, DurationUnit.DAYS)) ), + "= 10 days", "= 10 days" ), + Arguments.of( + new DurationBasedCondition( + "!=10days", + DurationBasedConditionOperator.NOT_EQUALS, + Collections.singletonList(new Duration(10, DurationUnit.DAYS)) + ), + "!= 10 days", + "!= 10 days" + ), Arguments.of( new DurationBasedCondition( "in[3hours,4days,96hours,7days]", @@ -85,7 +112,22 @@ private static Stream provideDurationConditionsWithExpectedFormattedS new Duration(7, DurationUnit.DAYS) ) ), - "in [3 hours, 4 days, 96 hours, 7 days]" + "in [3 hours,4 days,96 hours,7 days]", + "in [3 hours,4 days,7 days,96 hours]" + ), + Arguments.of( + new DurationBasedCondition( + "notin[3hours,4days,96hours,7days]", + DurationBasedConditionOperator.NOT_IN, + Arrays.asList( + new Duration(3, DurationUnit.HOURS), + new Duration(4, DurationUnit.DAYS), + 
new Duration(96, DurationUnit.HOURS), + new Duration(7, DurationUnit.DAYS) + ) + ), + "not in [3 hours,4 days,96 hours,7 days]", + "not in [3 hours,4 days,7 days,96 hours]" ) ); } @@ -93,7 +135,9 @@ private static Stream provideDurationConditionsWithExpectedFormattedS @ParameterizedTest @MethodSource("provideDurationConditionsWithExpectedFormattedStrings") public void test_correctlyFormatsDuration(DurationBasedCondition condition, - String expectedFormattedString) { + String expectedFormattedString, + String expectedSortedFormattedString) { assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); } } diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java new file mode 100644 index 0000000..02b10bc --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionTest.java @@ -0,0 +1,109 @@ +/* + * NumberBasedConditionTest.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class NumberBasedConditionTest { + private static Stream provideNumberConditionsWithExpectedFormattedStrings() { + return Stream.of( + Arguments.of( + new NumberBasedCondition( + "in[15,10,20,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,20,5]", + "in [5,10,15,20]" + ), + Arguments.of( + new NumberBasedCondition( + "in[1.5,1.0,2.0,0.5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("1.5"), + new AtomicNumberOperand("1.0"), + new AtomicNumberOperand("2.0"), + new AtomicNumberOperand("0.5") + ) + ), + "in [1.5,1.0,2.0,0.5]", + "in [0.5,1.0,1.5,2.0]" + ), + Arguments.of( + new NumberBasedCondition( + "notin[15,10,20,5]", + NumberBasedConditionOperator.NOT_IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "not in [15,10,20,5]", + "not in [5,10,15,20]" + ), + Arguments.of( + new NumberBasedCondition( + "in[15,10,NULL,20,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new NullNumericOperand("NULL"), + new AtomicNumberOperand("20"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,NULL,20,5]", + "in [5,10,15,20,NULL]" + ), + // We don't limit customers from adding multiple NULL keywords + Arguments.of( + new NumberBasedCondition( + "in[15,10,NULL,NULL,5]", + NumberBasedConditionOperator.IN, + Arrays.asList( + new 
AtomicNumberOperand("15"), + new AtomicNumberOperand("10"), + new NullNumericOperand("NULL"), + new NullNumericOperand("NULL"), + new AtomicNumberOperand("5") + ) + ), + "in [15,10,NULL,NULL,5]", + "in [5,10,15,NULL,NULL]" + ) + ); + } + + @ParameterizedTest + @MethodSource("provideNumberConditionsWithExpectedFormattedStrings") + public void test_correctlyFormatsNumber(NumberBasedCondition condition, + String expectedFormattedString, + String expectedSortedFormattedString) { + assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java new file mode 100644 index 0000000..846b9da --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperandTest.java @@ -0,0 +1,136 @@ +/* + * NumericOperandTest.java + * + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number; + +import com.amazonaws.glue.ml.dataquality.dqdl.exception.InvalidDataQualityRulesetException; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule; +import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset; +import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class NumericOperandTest { + private static final String MULTIPLY_FUNCTION_NAME = "multiply"; + private static final String AVG_FUNCTION_NAME = "avg"; + + private final DQDLParser parser = new DQDLParser(); + public static final OperandEvaluator testEvaluator = new OperandEvaluator() { + @Override + public Double evaluate(DQRule rule, NumericOperand operand) { + if (operand instanceof AtomicNumberOperand) { + return Double.parseDouble(operand.getOperand()); + } + if (operand instanceof FunctionCallOperand) { + FunctionCallOperand op = (FunctionCallOperand) operand; + if (MULTIPLY_FUNCTION_NAME.equals(op.getFunctionName())) { + return op.getOperands().stream() + .map(ops -> testEvaluator.evaluate(rule, ops)).reduce(1.0, (a, b) -> a * b); + } else if (AVG_FUNCTION_NAME.equals(op.getFunctionName())) { + return op.getOperands().stream() + .map(ops -> testEvaluator.evaluate(rule, ops)).reduce(0.0, (a, b) -> a + b / op.getOperands().size()); + } else { + throw new RuntimeException("Function not supported"); + } + } + if (operand instanceof BinaryExpressionOperand) { + BinaryExpressionOperand op = (BinaryExpressionOperand) operand; + Double op1 = testEvaluator.evaluate(rule, op.getOperand1()); + Double op2 = testEvaluator.evaluate(rule, op.getOperand2()); + switch (op.getOperator()) { + case "+": + 
return op1 + op2; + case "-": + return op1 - op2; + case "/": + return op1 / op2; + case "*": + return op1 * op2; + default: + throw new IllegalArgumentException("Bad operator"); + } + } else { + throw new RuntimeException("Type not supported"); + } + } + }; + @Test + public void test_functionCallWorksWithAtomicNumberOperands() throws InvalidDataQualityRulesetException { + String rule = "RowCount = multiply(1,2,3)"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + + assertTrue(condition.evaluate(6.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(3.0, dqRule, testEvaluator)); + } + + @Test + public void test_functionCallWorksWithNestedFunctionCallOperands() throws InvalidDataQualityRulesetException { + String rule = "RowCount = multiply(avg(2,4), avg(10,20))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + assertTrue(condition.evaluate(45.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(40.0, dqRule, testEvaluator)); + } + + @Test + public void test_functionCallWorksInBinaryExpression() throws InvalidDataQualityRulesetException { + String rule = "RowCount = 2.0 * multiply(avg(2,4), avg(10,20))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, 
dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + assertTrue(condition.evaluate(90.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(45.0, dqRule, testEvaluator)); + } + + @Test + public void test_functionCallWorksInSimpleBinaryExpression() throws InvalidDataQualityRulesetException { + String rule = "RowCount = 8.0 * (5.0 - (1.0 + (4.0 / 2.0)))"; + DQRuleset ruleset = parser.parse(String.format("Rules = [ %s ]", rule)); + + assertNotNull(ruleset); + assertEquals(1, ruleset.getRules().size()); + + DQRule dqRule = ruleset.getRules().get(0); + + assertEquals(NumberBasedCondition.class, dqRule.getCondition().getClass()); + + NumberBasedCondition condition = (NumberBasedCondition) dqRule.getCondition(); + assertTrue(condition.evaluate(16.0, dqRule, testEvaluator)); + assertFalse(condition.evaluate(8.0, dqRule, testEvaluator)); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java new file mode 100644 index 0000000..b497de5 --- /dev/null +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionTest.java @@ -0,0 +1,86 @@ +/* + * StringBasedConditionTest.java + * + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * PROPRIETARY/CONFIDENTIAL + * + * Use is subject to license terms. 
+ */ + +package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.stream.Stream; + +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.EMPTY; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.NULL; +import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword.WHITESPACES_ONLY; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class StringBasedConditionTest { + + private static Stream provideStringConditionsWithExpectedFormattedStrings() { + return Stream.of( + Arguments.of( + new StringBasedCondition( + "in[\"d\",\"a\",\"c\",\"b\"]", + StringBasedConditionOperator.IN, + Arrays.asList( + new QuotedStringOperand("d"), + new QuotedStringOperand("a"), + new QuotedStringOperand("b"), + new QuotedStringOperand("c") + ) + ), + "in [\"d\",\"a\",\"b\",\"c\"]", + "in [\"a\",\"b\",\"c\",\"d\"]" + ), + Arguments.of( + new StringBasedCondition( + "notin[\"d\",\"a\",\"c\",\"b\"]", + StringBasedConditionOperator.NOT_IN, + Arrays.asList( + new QuotedStringOperand("d"), + new QuotedStringOperand("a"), + new QuotedStringOperand("b"), + new QuotedStringOperand("c") + ) + ), + "not in [\"d\",\"a\",\"b\",\"c\"]", + "not in [\"a\",\"b\",\"c\",\"d\"]" + ), + // Test for Keyword values + Arguments.of( + new StringBasedCondition( + "in[\"z\",\"a\",WHITESPACES_ONLY,EMPTY,\"c\",NULL]", + StringBasedConditionOperator.IN, + Arrays.asList( + new QuotedStringOperand("z"), + new QuotedStringOperand("a"), + new KeywordStringOperand(WHITESPACES_ONLY), + new KeywordStringOperand(EMPTY), + new QuotedStringOperand("c"), + new KeywordStringOperand(NULL) + ) + ), + // verifying behavior that quoted strings will be sorted before keywords + "in 
[\"z\",\"a\",WHITESPACES_ONLY,EMPTY,\"c\",NULL]", + "in [\"a\",\"c\",\"z\",EMPTY,NULL,WHITESPACES_ONLY]" + ) + ); + } + + @ParameterizedTest + @MethodSource("provideStringConditionsWithExpectedFormattedStrings") + public void test_correctlyFormatsString(StringBasedCondition condition, + String expectedFormattedString, + String expectedSortedFormattedString) { + assertEquals(expectedFormattedString, condition.getFormattedCondition()); + assertEquals(expectedSortedFormattedString, condition.getSortedFormattedCondition()); + } +} diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java index 0603b40..7dcc615 100644 --- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java +++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/parser/InvalidDQRulesetTest.java @@ -24,52 +24,138 @@ public class InvalidDQRulesetTest { private static Stream provideInvalidRulesets() { return Stream.of( - Arguments.of("Rules = {"), - Arguments.of("Rules = }"), - Arguments.of("Rules = { }"), - Arguments.of("Rules = [ ]"), - Arguments.of("Rules = ["), - Arguments.of("Rules = ]"), - Arguments.of("Rules = Abcdefg123"), - Arguments.of("Rules11 = [ ColumnValues \"load_dt\" > (now() - 1) ]"), - Arguments.of("Rules = [ 11ColumnValues \"load_dt\" > (now() - 1) ]"), - Arguments.of("Rules = [ ColumnValues \"load_dt\" \"load_dt_2\" > (now() - 1) ]"), - Arguments.of("Rules = [ Completeness \"col-A\" ]"), - Arguments.of("Rules = { Completeness \"col-A\" }"), - Arguments.of("Rules = [ ColumnNamesMatchPattern aws_* ]"), - Arguments.of("Rules = [ IsComplete \"col-A\" > 0.05 ]"), - Arguments.of("Rules = [ IsUnique \"col-A\" <= 1.5 ]"), - Arguments.of("Rules = [ IsPrimaryKey \"col-A\" between 1 and 2 ]"), - Arguments.of("Rules = [ ColumnDataType \"col-A\" ]"), - Arguments.of("Rules = [ ColumnDataType \"col-A\" with threshold > 0.7 ]"), - Arguments.of("Rules = [ ColumnDataType 
\"col-A\" \"col-B\" ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" matches ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" now() ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > now() + 1 hours ]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" = (now() - 3 weeks) ]"), - Arguments.of("Rules = [ Completeness \"col-A\" > 0.4 with threshold > 0.4]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 threshold]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with threshold]"), - Arguments.of("Rules = [ ColumnValues \"col-A\" in [1,\"2\"] ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" <= 3 ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" > 30 ]"), - Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), - Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), - Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" = 0.99 ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" ]"), - Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 with threshold > 0.9]"), - Arguments.of("Rules = [ SchemaMatch with threshold between 0.2 and 0.4 ]"), - Arguments.of("Rules = [ SchemaMatch \"ref-1\" between 0.2 and 0.4 with threshold > 0.5 ]"), - Arguments.of("Rules = [ SchemaMatch \"ref-1\" \"ref-2\" ]"), - Arguments.of("Rules = [ RowCountMatch > 0.1 ]"), - Arguments.of("Rules = [ RowCountMatch \"reference-1\" \"col-1\" > 0.1 ]"), - Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), - Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), - Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), - Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]") + Arguments.of(""), + 
Arguments.of("Metadata = {}"), + Arguments.of("DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" }"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = {}"), + Arguments.of("Metadata = { \"Version\": \"1.0\" } DataSources = { \"Primary\": \"Foo\" }"), + Arguments.of("Rules = {"), + Arguments.of("Rules = }"), + Arguments.of("Rules = { }"), + Arguments.of("Rules = [ ]"), + Arguments.of("Rules = ["), + Arguments.of("Rules = ]"), + Arguments.of("Rules = Abcdefg123"), + Arguments.of("Rules11 = [ ColumnValues \"load_dt\" > (now() - 1) ]"), + Arguments.of("Rules = [ 11ColumnValues \"load_dt\" > (now() - 1) ]"), + Arguments.of("Rules = [ ColumnValues \"load_dt\" \"load_dt_2\" > (now() - 1) ]"), + Arguments.of("Rules = [ Completeness \"col-A\" ]"), + Arguments.of("Rules = { Completeness \"col-A\" }"), + Arguments.of("Rules = [ ColumnNamesMatchPattern aws_* ]"), + Arguments.of("Rules = [ ColumnNamesMatchPattern \"aws_*\" where \"aws_id > 100\"]"), + Arguments.of("Rules = [ IsComplete \"col-A\" > 0.05 ]"), + Arguments.of("Rules = [ IsUnique \"col-A\" <= 1.5 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\" between 1 and 2 ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" with threshold > 0.7 ]"), + Arguments.of("Rules = [ ColumnDataType \"col-A\" \"col-B\" ]"), + Arguments.of("Rules = [ ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9 with threshold > 0.7 ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" matches ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" now() ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > now() + 1 hours ]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" = (now() - 3 weeks) ]"), + Arguments.of("Rules = [ Completeness \"col-A\" > 0.4 with threshold > 0.4]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 with]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" > 0.4 threshold]"), + Arguments.of("Rules 
= [ ColumnValues \"col-A\" > 0.4 with threshold]"), + Arguments.of("Rules = [ ColumnValues \"col-A\" in [1,\"2\"] ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" <= 3 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 9:30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 9:30 SM ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" > 25:30 ]"), + Arguments.of("Rules = [ DataFreshness \"col-A\" between 2 and 4 days ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference\" \"col-A1\" ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" = 0.99 ]"), + Arguments.of("Rules = [ ReferentialIntegrity \"col-A\" \"reference.col-A\" = 0.99 where \"col-A > 100\"]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" = 0.99 ]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" ]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" ]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 with threshold > 0.9]"), + Arguments.of("Rules = [ DatasetMatch \"reference\" \"ID\" \"colA\" > 0.9 where \"ID > 100\"]"), + Arguments.of("Rules = [ SchemaMatch with threshold between 0.2 and 0.4 ]"), + Arguments.of("Rules = [ SchemaMatch \"ref-1\" between 0.2 and 0.4 with threshold > 0.5 ]"), + Arguments.of("Rules = [ SchemaMatch \"ref-1\" \"ref-2\" ]"), + Arguments.of("Rules = [ RowCountMatch > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" \"col-1\" > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 with threshold > 0.1 ]"), + Arguments.of("Rules = [ RowCountMatch \"reference-1\" > 0.1 where \"id > 100\"]"), + Arguments.of("Rules = [ AggregateMatch > 0.1 ]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" > 0.1 ]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"]"), + Arguments.of("Rules = [ AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.8 
where \"col-A > 100\"]"), + Arguments.of("Rules = [ DetectAnomalies ]"), + Arguments.of("Rules = [ DetectAnomalies \"col-A\" where \"col-A > 100\"]"), + Arguments.of("Rules = [ AllStatistics \"id\" > 0 ]"), + Arguments.of("Rules = [ FileMatch ]"), + Arguments.of("Rules = [ FileMatch in [] ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA ]"), + Arguments.of("Rules = [ FileMatch SHA SHA SHA in [] ]"), + Arguments.of("Rules = [ FileMatch s3Path ]"), + Arguments.of("Rules = [ FileMatch s3Path with noHashAlgorithm ]"), + Arguments.of("FileMatch \"S3://PATH\" \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""), + Arguments.of("Rules = [ FileMatch S3://PATH1 ]"), + Arguments.of("Rules = [ FileUniqueness S3://PATH1 S3://PATH1 ]"), + Arguments.of("Rules = [ FileFreshness between \"2024-07-15\" ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" between and \"2024-07-15\" ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" \"S3://PATH\" ]"), + Arguments.of("Rules = [ FileFreshness > (now() 3 days) ]"), + Arguments.of("Rules = [ FileUniqueness \"PATH\" ]"), + Arguments.of("Rules = [ FileUniqueness ]"), + Arguments.of("Rules = [ FileSize ]"), + Arguments.of("Rules = [ FileSize > 1 SAM]"), + Arguments.of("Rules = [ FileSize > 1 KB with exampleTag in [\"SAM\"] ]"), + Arguments.of("Rules = [ FileSize > 1 KB with exampleTag != \"SAM\"]"), + Arguments.of("Rules = [ FileSize 1 GB]"), + Arguments.of("Rules = [ FileSize <= 1 ZB ]"), + Arguments.of("Rules = [ FileFreshness > 13:50 AM ]"), + Arguments.of("Rules = [ FileFreshness > 13:50 PM ]"), + Arguments.of("Rules = [ FileFreshness > 25:00 ]"), + Arguments.of("Rules = [ FileFreshness > 9:30 ]"), + Arguments.of("Rules = [ FileFreshness > 9:30 SM ]"), + Arguments.of("Rules = [ FileFreshness > 22:1s ]"), + Arguments.of("Rules = [ FileFreshness \"S3://PATH\" > \"9:30 PM\" with threshold > 1 with threshold = 2 ]"), + Arguments.of("FileFreshness > \"9:30 AM\" with timeZone = 
\"America/New_Chicago\""), + Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with \"timeZone\" = \"America/Dubai\""), + Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") AND (IsUnique \"colA\"))"), + Arguments.of("((RowCount > 0) AND IsComplete"), + Arguments.of("variable > 1"), + Arguments.of("Rules = [ RowCount > $min_count ]"), + Arguments.of("max_size = 1 ZB\nRules = [ FileSize <= $max_size ]"), + Arguments.of("min_count = 100\nRules = [ RowCount > $min_count ]"), + Arguments.of("min count = 100\nRules = [ RowCount > 100 ]"), + Arguments.of("min count = \nRules = [ RowCount > 100 ]"), + Arguments.of("Rules = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ RowCount > 100 ]"), + Arguments.of("Rules = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ RowCount > 100 ]"), + Arguments.of("allowed_counts = [5, 10, 15, 20]\n" + + "Rules = [ ColumnValues \"product_code\" not in $allowed_counts ]"), + Arguments.of(".allowed_counts = [5, 10, 15, 20]\n" + + "Rules = [ ColumnValues \"product_code\" not in $.allowed_counts ]"), + Arguments.of("allowed_counts = [5, 10, 15, 20]\n" + + "Rules = [ ColumnValues \"product_code\" not in random$allowed_counts ]"), + Arguments.of("phone_pattern = '^\\\\d{3}-\\\\d{3}-\\\\d{4}$'\n" + + "Rules = [ ColumnValues \"colA\" matches $phone_pattern ]"), + Arguments.of("allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ ColumnValues \"colA\" in allowed_statuses ]"), + Arguments.of("allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "allowed_statuses = [\"Active\", \"Pending\", \"Closed\"]\n" + + "Rules = [ ColumnValues \"colA\" in $allowed_statuses ]"), + Arguments.of("max$value = 1000\nRules = [ RowCount > 1000 ]"), + Arguments.of("base = 10\nfactor = 5\nRules = [ RowCount > $base * $factor ]") + ); + } + + private static Stream provideInvalidRulesetsWithAnalyzers() { + return Stream.of( + Arguments.of("Rules = [ ] Analyzers = [ ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] 
Analyzers = [ IsComplete \"colA\" ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Foo ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\" > 1.0 ]"), + Arguments.of("Rules = [ IsPrimaryKey \"col-A\"] Analyzers = [ Completeness \"colA\", Uniqueness \"colB\" = 1.0 ]") ); } @@ -83,4 +169,15 @@ void test_invalidRulesetParsing(String ruleset) { System.out.println(e.getMessage()); } } + + @ParameterizedTest + @MethodSource("provideInvalidRulesetsWithAnalyzers") + void test_invalidRulesetWithAnalyzersParsing(String ruleset) { + try { + parser.parse(ruleset); + fail("Ruleset validation exception was expected"); + } catch (InvalidDataQualityRulesetException e) { + System.out.println(e.getMessage()); + } + } }