diff --git a/configuration/dqdl/CommonLexerRules.g4 b/configuration/dqdl/CommonLexerRules.g4
index d1890a6..194de8e 100644
--- a/configuration/dqdl/CommonLexerRules.g4
+++ b/configuration/dqdl/CommonLexerRules.g4
@@ -12,6 +12,10 @@ LPAREN: '(';
RPAREN: ')';
AND: 'and' | 'AND';
OR: 'or' | 'OR';
+OF: 'of' | 'OF';
+NULL: 'null' | 'NULL';
+EMPTY: 'empty' | 'EMPTY';
+WHITESPACES_ONLY: 'whitespaces_only' | 'WHITESPACES_ONLY';
BETWEEN: 'between';
EQUAL_TO: '=';
@@ -20,10 +24,16 @@ GREATER_THAN_EQUAL_TO: '>=';
LESS_THAN: '<';
LESS_THAN_EQUAL_TO: '<=';
IN: 'in';
+NOT: 'not';
+NEGATION: '!';
DIGIT: [0-9];
DATE:
QUOTE DIGIT DIGIT DIGIT DIGIT '-' DIGIT DIGIT '-' DIGIT DIGIT QUOTE;
+TIME:
+ QUOTE (DIGIT | DIGIT DIGIT) ':' DIGIT DIGIT (' AM' | ' PM') QUOTE;
+MIL_TIME:
+ QUOTE DIGIT DIGIT ':' DIGIT DIGIT QUOTE;
INT: DIGIT+;
DECIMAL: INT '.' INT;
QUOTED_STRING: QUOTE (ESC | .)*? QUOTE;
@@ -31,7 +41,7 @@ NEGATIVE: '-';
LINE_COMMENT: '#' .*? '\r'? '\n' -> skip; // Match "#" stuff '\n'
-IDENTIFIER: [a-zA-Z0-9]+;
+IDENTIFIER: [a-zA-Z0-9_.]+;
WS: [ \t\n]+ -> skip;
diff --git a/configuration/dqdl/DataQualityDefinitionLanguage.g4 b/configuration/dqdl/DataQualityDefinitionLanguage.g4
index e5cf0d4..26b4d6d 100644
--- a/configuration/dqdl/DataQualityDefinitionLanguage.g4
+++ b/configuration/dqdl/DataQualityDefinitionLanguage.g4
@@ -1,25 +1,39 @@
-grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL"
+grammar DataQualityDefinitionLanguage; // "parser grammars for DQDL"
import CommonLexerRules;
// Sections
metadataSectionStart: 'Metadata';
dataSourcesSectionStart: 'DataSources';
rulesSectionStart: 'Rules';
+analyzersSectionStart: 'Analyzers';
// Expressions
dateNow: 'now()';
-durationUnit: 'days' | 'hours';
+durationUnit: 'days' | 'hours' | 'minutes';
durationExpression: (DIGIT | INT) durationUnit;
+sizeUnit:
+ 'B'
+ | 'KB'
+ | 'MB'
+ | 'GB'
+ | 'TB';
+
+sizeExpression: (DIGIT | INT) sizeUnit;
+
+timeExpression: TIME | MIL_TIME;
+
dateExpressionOp: ('-' | '+');
dateExpression:
DATE
| dateNow
- | LPAREN dateNow dateExpressionOp durationExpression RPAREN;
+ | LPAREN dateNow dateExpressionOp durationExpression RPAREN
+ | timeExpression
+ | NULL;
-number:
+atomicNumber:
DIGIT
| NEGATIVE DIGIT
| INT
@@ -27,72 +41,132 @@ number:
| DECIMAL
| NEGATIVE DECIMAL;
+// One or more comma-separated numeric arguments. A single argument is the
+// zero-repetition case of the list, so it needs no alternative of its own
+// (a separate `number` alternative makes the rule ambiguous for one argument).
+functionParameters:
+    number (COMMA number)*;
+
+functionCall:
+ IDENTIFIER LPAREN RPAREN
+ | IDENTIFIER LPAREN functionParameters RPAREN;
+
+// Binary arithmetic operators accepted between two number expressions.
+numberOp: '+' | '-' | '/' | '*';
+
+// A numeric expression: a binary operation, a function call, a parenthesized
+// expression, a literal (atomicNumber), or the NULL keyword.
+// NOTE(review): all four operators are matched through the single
+// `number numberOp number` alternative, so the grammar gives '*' and '/' the
+// same precedence as '+' and '-'; `1 + 2 * 3` should group as `(1 + 2) * 3`
+// in the parse tree. Confirm the evaluator compensates or that parentheses
+// are expected from users.
+number:
+ number numberOp number
+ | functionCall
+ | LPAREN number RPAREN
+ | atomicNumber
+ | NULL;
+
quotedString: QUOTED_STRING;
matchesRegexCondition: 'matches' quotedString;
numberArray: LBRAC number (COMMA number)* RBRAC;
numberBasedCondition:
- BETWEEN number AND number
+ NOT? BETWEEN number AND number
| GREATER_THAN number
| GREATER_THAN_EQUAL_TO number
| LESS_THAN number
| LESS_THAN_EQUAL_TO number
- | EQUAL_TO number
- | IN numberArray;
+ | NEGATION? EQUAL_TO number
+ | NOT? IN numberArray;
+
+variableDereference: '$' IDENTIFIER;
+
+stringValues:
+ quotedString
+ | variableDereference
+ | NULL
+ | EMPTY
+ | WHITESPACES_ONLY;
-quotedStringArray: LBRAC quotedString (COMMA quotedString)* RBRAC;
+stringValuesArray: LBRAC stringValues (COMMA stringValues)* RBRAC;
stringBasedCondition:
- EQUAL_TO quotedString
- | IN quotedStringArray
- | matchesRegexCondition;
+ NEGATION? EQUAL_TO stringValues
+ | NOT? IN stringValuesArray
+ | NOT? IN variableDereference
+ | NOT? matchesRegexCondition;
+tagValues: IDENTIFIER;
dateExpressionArray: LBRAC dateExpression (COMMA dateExpression)* RBRAC;
dateBasedCondition:
- BETWEEN dateExpression AND dateExpression
+ NOT? BETWEEN dateExpression AND dateExpression
| GREATER_THAN dateExpression
| GREATER_THAN_EQUAL_TO dateExpression
| LESS_THAN dateExpression
| LESS_THAN_EQUAL_TO dateExpression
- | EQUAL_TO dateExpression
- | IN dateExpressionArray;
+ | NEGATION? EQUAL_TO dateExpression
+ | NOT? IN dateExpressionArray;
durationExpressionArray: LBRAC durationExpression (COMMA durationExpression)* RBRAC;
durationBasedCondition:
- BETWEEN durationExpression AND durationExpression
+ NOT? BETWEEN durationExpression AND durationExpression
| GREATER_THAN durationExpression
| GREATER_THAN_EQUAL_TO durationExpression
| LESS_THAN durationExpression
| LESS_THAN_EQUAL_TO durationExpression
- | EQUAL_TO durationExpression
- | IN durationExpressionArray;
+ | NEGATION? EQUAL_TO durationExpression
+ | NOT? IN durationExpressionArray;
+
+sizeExpressionArray: LBRAC sizeExpression (COMMA sizeExpression)* RBRAC;
+sizeBasedCondition:
+ NOT? BETWEEN sizeExpression AND sizeExpression
+ | GREATER_THAN sizeExpression
+ | GREATER_THAN_EQUAL_TO sizeExpression
+ | LESS_THAN sizeExpression
+ | LESS_THAN_EQUAL_TO sizeExpression
+ | NEGATION? EQUAL_TO sizeExpression
+ | NOT? IN sizeExpressionArray;
ruleType: IDENTIFIER;
-parameter: (QUOTED_STRING | INT | DIGIT);
+analyzerType: IDENTIFIER;
+parameter: QUOTED_STRING
+ | IDENTIFIER;
+connectorWord: OF | AND;
+parameterWithConnectorWord: connectorWord? parameter;
+tagWithCondition: 'with' tagValues (stringBasedCondition | numberBasedCondition);
condition:
numberBasedCondition
| stringBasedCondition
| dateBasedCondition
- | durationBasedCondition;
+ | durationBasedCondition
+ | sizeBasedCondition;
-withThresholdCondition: 'with' 'threshold' numberBasedCondition;
+whereClause: 'where' quotedString;
-dqRule: ruleType parameter* condition? withThresholdCondition?;
+dqRule: ruleType parameterWithConnectorWord* condition? whereClause? tagWithCondition*;
+dqAnalyzer: analyzerType parameterWithConnectorWord*;
+
+// Variable Declarations
+expression:
+ stringValues
+ | stringValuesArray;
+
+variableDeclaration:
+ IDENTIFIER EQUAL_TO expression;
+variableDeclarations: variableDeclaration*;
topLevelRule:
- dqRule
- | '(' dqRule ')' (AND '(' dqRule ')')*
- | '(' dqRule ')' (OR '(' dqRule ')')*;
+ LPAREN topLevelRule RPAREN
+ | topLevelRule AND topLevelRule
+ | topLevelRule OR topLevelRule
+ | dqRule;
// Rules Definition
dqRules: topLevelRule (COMMA topLevelRule)*;
+dqAnalyzers: dqAnalyzer (COMMA dqAnalyzer)*;
// Top Level Document
rules:
rulesSectionStart EQUAL_TO LBRAC dqRules RBRAC
| rulesSectionStart EQUAL_TO LBRAC RBRAC; // empty array
+analyzers:
+ analyzersSectionStart EQUAL_TO LBRAC dqAnalyzers RBRAC
+ | analyzersSectionStart EQUAL_TO LBRAC RBRAC; // empty array
+
// This dictionary does not support nested dictionaries. Just strings and arrays.
dictionary: LCURL pair (COMMA pair)* RCURL;
pair: QUOTED_STRING COLON pairValue;
@@ -101,5 +175,6 @@ array: LBRAC QUOTED_STRING (COMMA QUOTED_STRING)* RBRAC;
metadata: metadataSectionStart EQUAL_TO dictionary;
dataSources: dataSourcesSectionStart EQUAL_TO dictionary;
+rulesOrAnalyzers: rules | analyzers | rules analyzers;
-document: metadata? dataSources? rules;
+document: metadata? dataSources? variableDeclarations? rulesOrAnalyzers;
diff --git a/configuration/rules/rules-config.json b/configuration/rules/rules-config.json
index bb4dea8..0c8a68e 100644
--- a/configuration/rules/rules-config.json
+++ b/configuration/rules/rules-config.json
@@ -4,7 +4,9 @@
"rule_type_name": "RowCount",
"description": "Check the number of rows in the dataset",
"parameters": [],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "table"
},
{
"rule_type_name": "RowCountMatch",
@@ -16,13 +18,15 @@
"description": "Alias of reference dataset"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "scope": "column"
},
{
"rule_type_name": "ColumnCount",
"description": "Checks the number of columns in the dataset",
"parameters": [],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "scope": "table"
},
{
"rule_type_name": "Completeness",
@@ -34,7 +38,10 @@
"description": "Name of column to check completeness of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_excluded_at_row_level_in_composite_rules": true,
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "IsComplete",
@@ -46,11 +53,13 @@
"description": "Name of column to check completeness of"
}
],
- "return_type": "BOOLEAN"
+ "return_type": "BOOLEAN",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "ColumnDataType",
- "description": "Check the data type of the given column",
+ "description": "Check the data type of the given column. Supported values: Boolean, Date, Timestamp, Integer, Double, Float, Long",
"parameters": [
{
"type": "String",
@@ -59,7 +68,9 @@
}
],
"return_type": "STRING",
- "is_threshold_supported": true
+ "is_threshold_supported": true,
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "ColumnNamesMatchPattern",
@@ -71,7 +82,8 @@
"description": "Pattern to match against the names of the columns"
}
],
- "return_type": "BOOLEAN"
+ "return_type": "BOOLEAN",
+ "scope": "table"
},
{
"rule_type_name": "ColumnExists",
@@ -83,7 +95,8 @@
"description": "Name of column to check existence of"
}
],
- "return_type": "BOOLEAN"
+ "return_type": "BOOLEAN",
+ "scope": "column"
},
{
"rule_type_name": "ColumnCorrelation",
@@ -100,7 +113,9 @@
"description": "Name of second column"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "Uniqueness",
@@ -112,7 +127,10 @@
"description": "Name of column to check uniqueness of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_excluded_at_row_level_in_composite_rules": true,
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "IsUnique",
@@ -124,7 +142,9 @@
"description": "Name of column to check uniqueness of"
}
],
- "return_type": "BOOLEAN"
+ "return_type": "BOOLEAN",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "Mean",
@@ -136,7 +156,9 @@
"description": "Name of column to check mean of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "Sum",
@@ -148,7 +170,9 @@
"description": "Name of column to check sum of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "StandardDeviation",
@@ -160,7 +184,9 @@
"description": "Name of column to check standard deviation of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "Entropy",
@@ -172,7 +198,9 @@
"description": "Name of column to check entropy of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "DistinctValuesCount",
@@ -184,7 +212,9 @@
"description": "Name of column to check distinct values count of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "UniqueValueRatio",
@@ -196,7 +226,9 @@
"description": "Name of column to check unique value ratio of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "ColumnLength",
@@ -208,7 +240,9 @@
"description": "Name of column to check the length of the values of"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "IsPrimaryKey",
@@ -221,7 +255,9 @@
"is_var_arg": true
}
],
- "return_type": "BOOLEAN"
+ "return_type": "BOOLEAN",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "ColumnValues",
@@ -234,7 +270,9 @@
}
],
"return_type": "STRING_ARRAY|NUMBER_ARRAY|DATE_ARRAY",
- "is_threshold_supported": true
+ "is_threshold_supported": true,
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "DataFreshness",
@@ -246,7 +284,9 @@
"description": "Name of column to check the freshness of"
}
],
- "return_type": "DURATION_ARRAY"
+ "return_type": "DURATION_ARRAY",
+ "is_where_clause_supported": true,
+ "scope": "column"
},
{
"rule_type_name": "CustomSql",
@@ -259,7 +299,8 @@
}
],
"return_type": "NUMBER|BOOLEAN",
- "is_threshold_supported": true
+ "is_threshold_supported": true,
+ "scope": "table"
},
{
"rule_type_name": "ReferentialIntegrity",
@@ -276,7 +317,9 @@
"description": "Alias of reference dataset and comma separated names of columns from reference dataset. The alias and the names should be separated by a period. The names should be enclosed in curly brackets."
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_excluded_at_row_level_in_composite_rules": true,
+ "scope": "table"
},
{
"rule_type_name": "DatasetMatch",
@@ -293,7 +336,9 @@
"description": "Mappings of key columns used for joining the two datasets"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "is_excluded_at_row_level_in_composite_rules": true,
+ "scope": "table"
},
{
"rule_type_name": "DatasetMatch",
@@ -315,7 +360,8 @@
"description": "Mappings of columns used for matching"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "scope": "table"
},
{
"rule_type_name": "SchemaMatch",
@@ -327,7 +373,8 @@
"description": "Alias of reference dataset"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "scope": "table"
},
{
"rule_type_name": "AggregateMatch",
@@ -344,7 +391,161 @@
"description": "The second aggregate expression"
}
],
- "return_type": "NUMBER"
+ "return_type": "NUMBER",
+ "scope": "column"
+ },
+ {
+ "rule_type_name": "DetectAnomalies",
+ "description": "Checks if the current value of the metric is anomalous with respect to the historical values",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "MetricParameter",
+ "description": "The parameters required to evaluate the metric. The first parameter must be the metric name.",
+ "is_var_arg": true
+ }
+ ],
+ "return_type": "BOOLEAN",
+ "scope": "column"
+ },
+ {
+ "rule_type_name": "AllStatistics",
+ "is_analyzer_only": true,
+ "description": "Analyzer Only. Produces a collection of statistics.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "TargetColumn",
+ "description": "Name of the column to analyze"
+ }
+ ],
+ "return_type": "NUMBER",
+ "scope": "column"
+ },
+ {
+ "rule_type_name": "FileMatch",
+ "description": "Match Files/Directories against Files/Directories.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "DataPath",
+ "description": "File/Directory for comparison"
+ },
+ {
+ "type": "String",
+ "name": "CompareDataPath",
+ "description": "Other File/Directory for comparison"
+ }
+ ],
+ "return_type": "BOOLEAN",
+ "is_threshold_supported": false,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileMatch",
+ "description": "Match Files/Directories against a list of checksum values.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "DataPath",
+ "description": "File/Directory for comparison"
+ }
+ ],
+ "return_type": "STRING",
+ "is_threshold_supported": false,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileMatch",
+ "description": "Match Files/Directories inferred from DataFrames against a list of checksum values.",
+ "parameters": [],
+ "return_type": "STRING",
+ "is_threshold_supported": false,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileUniqueness",
+ "description": "Checks the contents of a folder and the uniqueness of each file within.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "DataPath",
+ "description": "Data Path for FileUniqueness."
+ }
+ ],
+ "return_type": "NUMBER",
+ "is_threshold_supported": false,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileUniqueness",
+ "description": "Checks the contents of an inferred folder and the uniqueness of each file within.",
+ "parameters": [],
+ "return_type": "NUMBER",
+ "is_threshold_supported": false,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileFreshness",
+ "description": "Checks the age of a filepath against a specified date.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "DataPath",
+ "description": "Data Path for FileFreshness."
+ }
+ ],
+ "return_type": "DATE",
+ "is_threshold_supported": true,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileFreshness",
+ "description": "Checks the age of an inferred file against a specified date.",
+ "parameters": [],
+ "return_type": "DATE",
+ "is_threshold_supported": true,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileSize",
+ "description": "Checks the size of a filepath.",
+ "parameters": [
+ {
+ "type": "String",
+ "name": "DataPath",
+ "description": "Data Path for FileSize."
+ }
+ ],
+ "return_type": "SIZE",
+ "is_threshold_supported": true,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
+ },
+ {
+ "rule_type_name": "FileSize",
+ "description": "Checks the size of an inferred file.",
+ "parameters": [],
+ "return_type": "SIZE",
+ "is_threshold_supported": true,
+ "is_where_clause_supported": false,
+ "scope": "file",
+ "experimental": true
}
]
-}
\ No newline at end of file
+}
diff --git a/pom.xml b/pom.xml
index 7021831..417a331 100644
--- a/pom.xml
+++ b/pom.xml
@@ -15,6 +15,7 @@
2.12.7.1
5.9.1
1.18.28
+ 2.0.16
3.11.0
1.8
1.8
@@ -56,6 +57,14 @@
provided
+
+
+ org.slf4j
+ slf4j-log4j12
+ ${slf4j.version}
+ provided
+
+
org.antlr
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java
new file mode 100644
index 0000000..9bf56ea
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzer.java
@@ -0,0 +1,49 @@
+/*
+ * DQAnalyzer.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * An analyzer definition: a rule type name together with its parameters, held
+ * both as a plain parameter map ({@code parameters}) and as a map of
+ * {@link DQRuleParameterValue} objects ({@code parameterValueMap}).
+ */
+@AllArgsConstructor
+@Getter
+public class DQAnalyzer implements HasRuleTypeAndParameters {
+ private final String ruleType;
+ private final Map parameters;
+ private final Map parameterValueMap;
+
+ /**
+ * Builds an analyzer from a plain parameter map. The parameter value map is
+ * derived from it via {@code DQRuleParameterValue.createParameterValueMap}.
+ */
+ public DQAnalyzer(final String ruleType,
+ final Map parameters) {
+ this.ruleType = ruleType;
+ this.parameters = parameters;
+ this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(this.parameters);
+ }
+
+ /**
+ * Builds an analyzer from a map of parameter values. The plain parameter map
+ * is derived from it via {@code DQRuleParameterValue.createParameterMap}, and
+ * the given map is stored as the parameter value map.
+ */
+ public static DQAnalyzer createFromValueMap(final String ruleType,
+ final LinkedHashMap parameters) {
+ return new DQAnalyzer(ruleType, DQRuleParameterValue.createParameterMap(parameters), parameters);
+ }
+
+ /**
+ * Renders the rule type followed by the string form of each parameter value,
+ * each preceded by a single space. Parameters are omitted when the parameter
+ * value map is null.
+ */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(ruleType);
+
+ if (parameterValueMap != null) {
+ // Iteration order is whatever the underlying map implementation provides.
+ parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString()));
+ }
+
+ return sb.toString();
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java
index 57e35ae..b115a0e 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRule.java
@@ -11,37 +11,93 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag;
+import lombok.AccessLevel;
import lombok.AllArgsConstructor;
+import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap;
+import static com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLVariableResolver.resolveVariablesInCondition;
import static com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils.isBlank;
@AllArgsConstructor
@Getter
@EqualsAndHashCode
-public class DQRule implements Serializable {
+@Builder(toBuilder = true, access = AccessLevel.PRIVATE)
+public class DQRule implements Serializable, HasRuleTypeAndParameters {
private final String ruleType;
private final Map parameters;
+ private final Map parameterValueMap;
private final Condition condition;
private final Condition thresholdCondition;
private final DQRuleLogicalOperator operator;
private final List nestedRules;
+ private final String whereClause;
+ private Boolean isExcludedAtRowLevelInCompositeRules = false;
+ private Map tags;
+
+ // Adding this constructor so as to not break the Data Quality ETL package.
+ public DQRule(final String ruleType,
+ final Map parameters,
+ final Condition condition,
+ final Condition thresholdCondition,
+ final DQRuleLogicalOperator operator,
+ final List nestedRules,
+ final String whereClause) {
+ this.ruleType = ruleType;
+ this.parameters = parameters;
+ this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters);
+ this.condition = condition;
+ this.thresholdCondition = thresholdCondition;
+ this.operator = operator;
+ this.nestedRules = nestedRules;
+ this.whereClause = whereClause;
+ }
+
+ public DQRule(final String ruleType,
+ final Map parameters,
+ final Condition condition,
+ final Condition thresholdCondition,
+ final DQRuleLogicalOperator operator,
+ final List nestedRules) {
+ this.ruleType = ruleType;
+ this.parameters = parameters;
+ this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters);
+ this.condition = condition;
+ this.thresholdCondition = thresholdCondition;
+ this.operator = operator;
+ this.nestedRules = nestedRules;
+ this.whereClause = null;
+ }
public DQRule(final String ruleType,
final Map parameters,
final Condition condition) {
this.ruleType = ruleType;
this.parameters = parameters;
+ this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters);
this.condition = condition;
this.thresholdCondition = null;
this.operator = DQRuleLogicalOperator.AND;
this.nestedRules = new ArrayList<>();
+ this.whereClause = null;
+ }
+
+ // Can't overload the constructor above, due to type erasure
+ public static DQRule createFromParameterValueMap(final DQRuleType ruleType,
+ final LinkedHashMap parameters,
+ final Condition condition) {
+ return createFromParameterValueMap(ruleType, parameters, condition,
+ null, null, null);
}
public DQRule(final String ruleType,
@@ -50,10 +106,89 @@ public DQRule(final String ruleType,
final Condition thresholdCondition) {
this.ruleType = ruleType;
this.parameters = parameters;
+ this.parameterValueMap = DQRuleParameterValue.createParameterValueMap(parameters);
this.condition = condition;
this.thresholdCondition = thresholdCondition;
this.operator = DQRuleLogicalOperator.AND;
this.nestedRules = new ArrayList<>();
+ this.whereClause = null;
+ }
+
+ // Can't overload the constructor above, due to type erasure
+ public static DQRule createFromParameterValueMap(final DQRuleType ruleType,
+ final LinkedHashMap parameters,
+ final Condition condition,
+ final Condition thresholdCondition,
+ final String whereClause,
+ final Map tags) {
+ DQRuleLogicalOperator operator = DQRuleLogicalOperator.AND;
+ List nestedRules = new ArrayList<>();
+
+ return new DQRule(
+ ruleType.getRuleTypeName(),
+ DQRuleParameterValue.createParameterMap(parameters),
+ parameters,
+ condition,
+ thresholdCondition,
+ operator,
+ nestedRules,
+ whereClause,
+ ruleType.isExcludedAtRowLevelInCompositeRules(),
+ tags
+ );
+ }
+
+    /**
+     * Creates a rule the same way as the six-argument
+     * {@code createFromParameterValueMap}, after first resolving variables in the
+     * condition and the threshold condition.
+     *
+     * When {@code variables} is null or empty, both conditions are used unchanged.
+     * A null condition stays null. Parameters, the where clause and tags are passed
+     * through without any resolution.
+     */
+    public static DQRule createFromParameterValueMapWithVariables(final DQRuleType ruleType,
+                                                                  final LinkedHashMap
+                                                                          parameters,
+                                                                  final Condition condition,
+                                                                  final Condition thresholdCondition,
+                                                                  final String whereClause,
+                                                                  final Map tags,
+                                                                  final Map variables) {
+        // Nothing to resolve: build the rule from the conditions exactly as given.
+        if (variables == null || variables.isEmpty()) {
+            return createFromParameterValueMap(ruleType, parameters, condition,
+                thresholdCondition, whereClause, tags);
+        }
+
+        // Handed to the resolver for both conditions; it is not read back here.
+        Map usedVars = new HashMap<>();
+
+        // Resolve variables in conditions
+        Condition resolvedCondition = condition != null
+            ? resolveVariablesInCondition(condition, variables, usedVars) : null;
+        Condition resolvedThresholdCondition = thresholdCondition != null
+            ? resolveVariablesInCondition(thresholdCondition, variables, usedVars) : null;
+
+        // Delegate so the resolved and unresolved paths share one construction site
+        // (same AND operator, empty nested-rule list and rule-type flags).
+        return createFromParameterValueMap(ruleType, parameters, resolvedCondition,
+            resolvedThresholdCondition, whereClause, tags);
+    }
+
+ public DQRule withNestedRules(final List nestedRules) {
+ return this.toBuilder().nestedRules(nestedRules).build();
+ }
+
+ public DQRule withCondition(final Condition condition) {
+ return this.toBuilder().condition(condition).build();
+ }
+
+ /**
+ * Returns the result of passing this rule's tags through
+ * {@code Tag.convertToStringMap}.
+ * NOTE(review): the explicit constructors in this class that take no tags
+ * argument never assign {@code tags}, so it is null for rules built through
+ * them; confirm convertToStringMap accepts a null argument.
+ */
+ public Map getTags() {
+ return convertToStringMap(tags);
+ }
@Override
@@ -63,8 +198,8 @@ public String toString() {
if (nestedRules == null || nestedRules.isEmpty()) {
sb.append(ruleType);
- if (parameters != null) {
- parameters.values().forEach(p -> sb.append(" ").append("\"").append(p).append("\""));
+ if (parameterValueMap != null) {
+ parameterValueMap.values().forEach(p -> sb.append(" ").append(p.toString()));
}
if (condition != null) {
@@ -72,21 +207,77 @@ public String toString() {
if (!isBlank(formattedCondition)) sb.append(" ").append(condition.getFormattedCondition());
}
+ // where clause syntax should go before threshold
+ if (whereClause != null) {
+ if (!isBlank(whereClause)) sb.append(" where ").append("\"").append(whereClause).append("\"");
+ }
+
if (thresholdCondition != null) {
String formattedCondition = thresholdCondition.getFormattedCondition();
if (!isBlank(formattedCondition)) sb.append(" with threshold ").append(formattedCondition);
}
- return sb.toString();
+ if (tags != null) {
+ for (Map.Entry entry : tags.entrySet()) {
+ sb.append(entry.getValue());
+ }
+ }
+
+ return sb.toString().trim();
} else {
- for (int i = 0; i < nestedRules.size(); i++) {
- sb.append("(").append(nestedRules.get(i).toString()).append(")");
- if (i != nestedRules.size() - 1) {
- sb.append(" ").append(operator.toString()).append(" ");
+ boolean canBeFlattened = usesSameOperator(operator);
+
+ if (canBeFlattened) {
+ List flattenedListOfRules = getNestedRulesAsFlattenedList();
+ for (int i = 0; i < flattenedListOfRules.size(); i++) {
+ sb.append("(").append(flattenedListOfRules.get(i).toString()).append(")");
+ if (i != flattenedListOfRules.size() - 1) {
+ sb.append(" ").append(operator.toString()).append(" ");
+ }
+ }
+ } else {
+ for (int i = 0; i < nestedRules.size(); i++) {
+ sb.append("(").append(nestedRules.get(i).toString()).append(")");
+ if (i != nestedRules.size() - 1) {
+ sb.append(" ").append(operator.toString()).append(" ");
+ }
}
}
}
return sb.toString();
}
+
+    /*
+     * This function checks if the same operator is used across all the nested rules.
+     * Example: (RuleA) or (RuleB) or (RuleC) / (RuleA) and (RuleB) and (RuleC)
+     *
+     * If that is the case, in order to maintain backwards compatibility, we will update
+     * toString() method so that we do not add additional parentheses.
+     *
+     * A rule without nested rules (null or empty list) trivially qualifies,
+     * whatever its own operator is.
+     */
+    private boolean usesSameOperator(DQRuleLogicalOperator op) {
+        // nestedRules can be null when supplied by a caller; toString() already
+        // treats null as "no nested rules", so do the same here instead of throwing.
+        if (nestedRules == null || nestedRules.isEmpty()) return true;
+        if (operator != op) return false;
+
+        for (DQRule nestedRule : nestedRules) {
+            if (!nestedRule.usesSameOperator(op)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Collects the leaf rules beneath this rule, depth-first and in nested order.
+     * A rule with no nested rules (null or empty list) yields a list that
+     * contains only itself.
+     *
+     * @return a new list of leaf rules; never null
+     */
+    public List getNestedRulesAsFlattenedList() {
+        List ret = new ArrayList<>();
+        // Treat a null nested list like an empty one, as toString() does,
+        // rather than failing with a NullPointerException.
+        if (nestedRules == null || nestedRules.isEmpty()) {
+            ret.add(this);
+        } else {
+            for (DQRule nestedRule: nestedRules) {
+                ret.addAll(nestedRule.getNestedRulesAsFlattenedList());
+            }
+        }
+        return ret;
+    }
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java
new file mode 100644
index 0000000..c61643a
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValue.java
@@ -0,0 +1,74 @@
+/*
+ * DQRuleParameterValue.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import lombok.AllArgsConstructor;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * A rule parameter value: its text, whether it is rendered inside double
+ * quotes, and an optional connector word that is rendered in front of it.
+ */
+@AllArgsConstructor
+@Getter
+@EqualsAndHashCode
+public class DQRuleParameterValue implements Serializable {
+    private static final String EMPTY_CONNECTOR = "";
+
+    private final String value;
+    private final boolean isQuoted;
+
+    // We could use an Optional here, instead of resorting to an empty string.
+    // But this needs to be serializable for Spark.
+    // Optional has presented problems in that regard.
+    private final String connectorWord;
+
+    /** Creates an unquoted value with no connector word. */
+    public DQRuleParameterValue(final String value) {
+        this.value = value;
+        this.isQuoted = false;
+        this.connectorWord = EMPTY_CONNECTOR;
+    }
+
+    /** Creates a value with the given quoting and no connector word. */
+    public DQRuleParameterValue(final String value, final boolean isQuoted) {
+        this.value = value;
+        this.isQuoted = isQuoted;
+        this.connectorWord = EMPTY_CONNECTOR;
+    }
+
+    /**
+     * Renders the connector word (when present) followed by a space, then the
+     * value, wrapped in double quotes when {@code isQuoted} is set.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        // A null connector (possible through the all-args constructor) is treated
+        // like the empty connector instead of being printed as the text "null".
+        if (connectorWord != null && !EMPTY_CONNECTOR.equals(connectorWord)) {
+            sb.append(connectorWord).append(" ");
+        }
+        String surroundBy = isQuoted ? "\"" : "";
+        sb.append(surroundBy).append(value).append(surroundBy);
+        return sb.toString();
+    }
+
+    /**
+     * Wraps every entry of a plain parameter map in a quoted
+     * {@link DQRuleParameterValue} without a connector word.
+     *
+     * @param parameters plain parameter map; may be null
+     * @return a new map in the iteration order of {@code parameters};
+     *         empty when {@code parameters} is null
+     */
+    public static Map createParameterValueMap(Map parameters) {
+        // Insertion-ordered, so callers that print values() emit the parameters
+        // in the same order as the map they were created from.
+        Map map = new LinkedHashMap<>();
+        if (parameters == null) return map;
+
+        // Add quotes when converting from the map of string values, and do not use connector word.
+        // This is to maintain backwards compatibility.
+        boolean isQuoted = true;
+        parameters.forEach((k, v) -> map.put(k, new DQRuleParameterValue(v, isQuoted)));
+
+        return map;
+    }
+
+    /**
+     * Converts a map of parameter values to a map of their plain values
+     * (the result of {@code getValue()} on each entry).
+     *
+     * @param parameters map of parameter values; may be null
+     * @return a new map in the iteration order of {@code parameters};
+     *         empty when {@code parameters} is null
+     */
+    public static Map createParameterMap(Map parameters) {
+        Map paramValuesAsStringsMap = new LinkedHashMap<>();
+        // Mirror createParameterValueMap: a null input yields an empty map.
+        if (parameters == null) return paramValuesAsStringsMap;
+
+        parameters.forEach((k, v) -> paramValuesAsStringsMap.put(k, v.getValue()));
+        return paramValuesAsStringsMap;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java
index 59f33ea..4728ff3 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleType.java
@@ -22,7 +22,6 @@
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.List;
-import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
@@ -33,17 +32,36 @@ public class DQRuleType {
private final List parameters;
private final String returnType;
private final boolean isThresholdSupported;
+ private final boolean isExcludedAtRowLevelInCompositeRules;
+ private final boolean isWhereClauseSupported;
+ private final boolean isAnalyzerOnly;
+ private final String scope;
+ private final boolean isExperimental;
+ @SuppressWarnings("checkstyle:parameternumber")
public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName,
@JsonProperty(value = "description") String description,
@JsonProperty(value = "parameters") List parameters,
@JsonProperty(value = "return_type") String returnType,
- @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported) {
+ // boolean defaults to false if not present
+ @JsonProperty(value = "is_threshold_supported") boolean isThresholdSupported,
+ @JsonProperty(value = "is_excluded_at_row_level_in_composite_rules")
+ boolean isExcludedAtRowLevelInCompositeRules,
+ @JsonProperty(value = "is_where_clause_supported")
+ boolean isWhereClauseSupported,
+ @JsonProperty(value = "is_analyzer_only") boolean isAnalyzerOnly,
+ @JsonProperty(value = "scope") String scope,
+ @JsonProperty(value = "experimental") boolean isExperimental) {
this.ruleTypeName = ruleTypeName;
this.description = description;
this.parameters = parameters;
this.returnType = returnType;
this.isThresholdSupported = isThresholdSupported;
+ this.isExcludedAtRowLevelInCompositeRules = isExcludedAtRowLevelInCompositeRules;
+ this.isWhereClauseSupported = isWhereClauseSupported;
+ this.isAnalyzerOnly = isAnalyzerOnly;
+ this.scope = scope;
+ this.isExperimental = isExperimental;
if (parameters.isEmpty()) {
return;
@@ -60,11 +78,9 @@ public DQRuleType(@JsonProperty(value = "rule_type_name") String ruleTypeName,
}
public Optional verifyParameters(List expectedParameters,
- List actualParameters) {
+ List actualParameters) {
if (!expectedParameters.isEmpty()) {
-
- boolean isVarArg = expectedParameters.get(
- expectedParameters.size() - 1).isVarArg();
+ boolean isVarArg = expectedParameters.get(expectedParameters.size() - 1).isVarArg();
if (isVarArg) {
if (expectedParameters.size() > actualParameters.size()) {
@@ -82,9 +98,9 @@ public Optional verifyParameters(List expectedParameter
return Optional.empty();
}
- public Map createParameterMap(List dqRuleTypeParameters,
- List actualParameters) {
- Map parameterMap = new LinkedHashMap<>();
+ public LinkedHashMap createParameterMap(List dqRuleTypeParameters,
+ List actualParameters) {
+ LinkedHashMap parameterMap = new LinkedHashMap<>();
for (int i = 0; i < dqRuleTypeParameters.size(); i++) {
String dqRuleTypeParameterName = dqRuleTypeParameters.get(i).getName();
@@ -98,7 +114,7 @@ public Map createParameterMap(List dqRuleTypePa
for (int j = counter; j < actualParameters.size(); j++) {
String newDqRuleTypeParameterName = dqRuleTypeParameterName + (j + 1);
- String actualParameterName = actualParameters.get(j);
+ DQRuleParameterValue actualParameterName = actualParameters.get(j);
parameterMap.put(newDqRuleTypeParameterName, actualParameterName);
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java
index d836b1e..dec8804 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleset.java
@@ -30,14 +30,20 @@ public class DQRuleset {
private final String primarySourceName;
private final List additionalDataSourcesNames;
private final List rules;
+ private final List analyzers;
private static final String LINE_SEP = System.lineSeparator();
public DQRuleset(final List rules) {
+ this(rules, new ArrayList<>());
+ }
+
+ public DQRuleset(final List rules, final List analyzers) {
this.metadata = new HashMap<>();
this.primarySourceName = null;
this.additionalDataSourcesNames = new ArrayList<>();
this.rules = rules;
+ this.analyzers = analyzers;
}
@Override
@@ -75,12 +81,23 @@ public String toString() {
"}";
}
- String rulesStr = "Rules = [" + LINE_SEP +
- rules.stream()
- .map(i -> " " + i)
- .collect(Collectors.joining("," + LINE_SEP)) +
- LINE_SEP + "]";
+ String rulesStr = "";
+ if (!rules.isEmpty()) {
+ rulesStr = "Rules = [" + LINE_SEP +
+ rules.stream()
+ .map(i -> " " + i)
+ .collect(Collectors.joining("," + LINE_SEP)) +
+ LINE_SEP + "]";
+ }
+ String analyzersStr = "";
+ if (!analyzers.isEmpty()) {
+ analyzersStr = "Analyzers = [" + LINE_SEP +
+ analyzers.stream()
+ .map(i -> " " + i)
+ .collect(Collectors.joining("," + LINE_SEP)) +
+ LINE_SEP + "]";
+ }
StringBuilder sb = new StringBuilder();
if (!metadataStr.isEmpty()) {
@@ -91,7 +108,16 @@ public String toString() {
sb.append(sourcesStr).append(LINE_SEP).append(LINE_SEP);
}
- sb.append(rulesStr);
+ if (!rulesStr.isEmpty()) {
+ sb.append(rulesStr);
+ }
+
+ if (!analyzersStr.isEmpty()) {
+ if (!rulesStr.isEmpty()) {
+ sb.append(LINE_SEP).append(LINE_SEP);
+ }
+ sb.append(analyzersStr);
+ }
return sb.toString();
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java
new file mode 100644
index 0000000..b9774d7
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/DQVariable.java
@@ -0,0 +1,60 @@
+/*
+ * DQVariable.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import lombok.AllArgsConstructor;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A named variable together with its declared {@link VariableType} and its value.
+ *
+ * @param <T> Java type of the value held by the variable
+ */
+@AllArgsConstructor
+@Getter
+@EqualsAndHashCode
+public class DQVariable<T> implements Serializable {
+
+    /** Kinds of values a variable can be declared with; every scalar kind has an array counterpart. */
+    public enum VariableType {
+        NUMBER,
+        STRING,
+        DATE,
+        DURATION,
+        NUMBER_ARRAY,
+        STRING_ARRAY,
+        DATE_ARRAY,
+        DURATION_ARRAY
+    }
+
+    private final String name;
+    private final VariableType type;
+    private final T value;
+
+    /**
+     * Renders the variable as {@code name = value}. A value that is a {@link List} is rendered
+     * as a bracketed, comma-separated list.
+     */
+    @Override
+    public String toString() {
+        if (value instanceof List) {
+            return String.format("%s = %s", name, formatArray((List<?>) value));
+        }
+        return String.format("%s = %s", name, formatValue(value));
+    }
+
+    /** Formats a scalar: {@code null} prints as "null", STRING values are wrapped in double quotes. */
+    private String formatValue(T val) {
+        if (val == null) return "null";
+        if (type == VariableType.STRING) return "\"" + val + "\"";
+        return val.toString();
+    }
+
+    /**
+     * Formats list elements joined by ", " inside square brackets.
+     * NOTE(review): elements are not quoted, even for STRING_ARRAY - confirm this is intended.
+     */
+    private String formatArray(List<?> list) {
+        return "[" + list.stream()
+            // String.valueOf prints a null element as "null" (as formatValue does) instead of throwing.
+            .map(item -> String.valueOf(item))
+            .collect(Collectors.joining(", ")) + "]";
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java
new file mode 100644
index 0000000..36d6bfc
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/HasRuleTypeAndParameters.java
@@ -0,0 +1,20 @@
+/*
+ * HasRuleTypeAndParameters.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import java.util.Map;
+
+/** Contract for types that expose a rule type name together with a parameter map. */
+public interface HasRuleTypeAndParameters {
+
+ /** Returns the rule type as a string. */
+ String getRuleType();
+
+ // NOTE(review): declared as a raw Map here; the key/value types are not visible in this file - confirm.
+ /** Returns the parameters as a map. */
+ Map getParameters();
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java
index dc59445..900ba88 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/Condition.java
@@ -10,6 +10,8 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.OperandEvaluator;
import lombok.EqualsAndHashCode;
import lombok.Getter;
@@ -27,4 +29,11 @@ public Condition(final String conditionAsString) {
public String getFormattedCondition() {
return this.conditionAsString;
}
+
+    /**
+     * Base implementation: returns the stored condition text unchanged, exactly like
+     * {@link #getFormattedCondition()} does in this class.
+     */
+    public String getSortedFormattedCondition() {
+        return conditionAsString;
+    }
+    /**
+     * Base implementation: always throws, because this class carries no evaluation logic.
+     * Subclasses that can be evaluated against a metric override this method.
+     *
+     * @throws UnsupportedOperationException always; the message names the concrete condition class
+     */
+    public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) {
+        // Include the runtime class so a failure identifies which condition type was evaluated.
+        throw new UnsupportedOperationException(
+            getClass().getSimpleName() + " does not support evaluation against a metric");
+    }
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java
index 0439f01..0458f98 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedCondition.java
@@ -42,6 +42,11 @@ public String getFormattedCondition() {
operands.get(0).getFormattedExpression(),
operands.get(1).getFormattedExpression()
);
+ case NOT_BETWEEN:
+ return String.format("not between %s and %s",
+ operands.get(0).getFormattedExpression(),
+ operands.get(1).getFormattedExpression()
+ );
case GREATER_THAN:
return String.format("> %s", operands.get(0).getFormattedExpression());
case GREATER_THAN_EQUAL_TO:
@@ -52,16 +57,49 @@ public String getFormattedCondition() {
return String.format("<= %s", operands.get(0).getFormattedExpression());
case EQUALS:
return String.format("= %s", operands.get(0).getFormattedExpression());
+ case NOT_EQUALS:
+ return String.format("!= %s", operands.get(0).getFormattedExpression());
case IN: {
- List formattedOperands = operands.stream()
- .map(DateExpression::getFormattedExpression)
- .collect(Collectors.toList());
+ List formattedOperands = getFormattedOperands();
return String.format("in [%s]", String.join(",", formattedOperands));
}
+ case NOT_IN: {
+ List formattedOperands = getFormattedOperands();
+ return String.format("not in [%s]", String.join(",", formattedOperands));
+ }
default:
break;
}
return "";
}
+
+    /**
+     * Same as {@link #getFormattedCondition()} except that the operands of IN / NOT_IN
+     * are emitted in sorted order.
+     */
+    @Override
+    public String getSortedFormattedCondition() {
+        // A missing or blank condition text formats to the empty string.
+        if (StringUtils.isBlank(conditionAsString)) {
+            return "";
+        }
+
+        // Only the list operators are rendered with sorted operands; everything else
+        // is delegated to the unsorted formatter.
+        final String keyword;
+        switch (operator) {
+            case IN:
+                keyword = "in";
+                break;
+            case NOT_IN:
+                keyword = "not in";
+                break;
+            default:
+                return getFormattedCondition();
+        }
+        return keyword + " [" + String.join(",", getSortedFormattedOperands()) + "]";
+    }
+
+    /** Returns the formatted expression of every operand, in operand order. */
+    private List<String> getFormattedOperands() {
+        return operands.stream()
+            .map(DateExpression::getFormattedExpression)
+            .collect(Collectors.toList());
+    }
+
+    /**
+     * Returns the formatted expression of every operand, sorted by natural String order
+     * (the formatted text is compared, not the evaluated date).
+     */
+    private List<String> getSortedFormattedOperands() {
+        return operands.stream()
+            .map(DateExpression::getFormattedExpression)
+            .sorted()
+            .collect(Collectors.toList());
+    }
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java
index 09bab98..565f771 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateBasedConditionOperator.java
@@ -12,10 +12,13 @@
public enum DateBasedConditionOperator {
BETWEEN,
+ NOT_BETWEEN,
GREATER_THAN,
GREATER_THAN_EQUAL_TO,
LESS_THAN,
LESS_THAN_EQUAL_TO,
EQUALS,
- IN
+ NOT_EQUALS,
+ IN,
+ NOT_IN
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java
index 1353ffb..b14c3d1 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/DateExpression.java
@@ -11,12 +11,13 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration;
-import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
+import lombok.Getter;
import java.io.Serializable;
import java.time.LocalDateTime;
+import java.time.ZoneOffset;
@EqualsAndHashCode
public abstract class DateExpression implements Serializable {
@@ -46,7 +47,23 @@ public String getFormattedExpression() {
@Override
public LocalDateTime getEvaluatedExpression() {
- return LocalDateTime.now();
+ return LocalDateTime.now(ZoneOffset.UTC);
+ }
+ }
+
+ @AllArgsConstructor
+ public static class StaticDateTime extends DateExpression {
+ private final LocalDateTime dateTime;
+ private final String dateTimeString;
+
+ @Override
+ public String getFormattedExpression() {
+ return "\"" + dateTimeString + "\"";
+ }
+
+ @Override
+ public LocalDateTime getEvaluatedExpression() {
+ return dateTime;
}
}
@@ -55,6 +72,7 @@ public enum DateExpressionOperator {
PLUS
}
+ @Getter
@AllArgsConstructor
public static class CurrentDateExpression extends DateExpression {
private final DateExpressionOperator operator;
@@ -74,16 +92,37 @@ public String getFormattedExpression() {
@Override
public LocalDateTime getEvaluatedExpression() {
- int hours = duration.getUnit().equals(DurationUnit.DAYS)
- ? duration.getAmount() * 24
- : duration.getAmount();
+ switch (duration.getUnit()) {
+ case MINUTES:
+ return evaluateMinutes(
+ operator,
+ duration.getAmount(),
+ LocalDateTime.now(ZoneOffset.UTC)
+ );
+ case HOURS:
+ return evaluateMinutes(
+ operator,
+ duration.getAmount() * 60,
+ LocalDateTime.now(ZoneOffset.UTC).withMinute(0)
+ );
+ case DAYS:
+ return evaluateMinutes(
+ operator,
+ duration.getAmount() * 60 * 24,
+ LocalDateTime.now(ZoneOffset.UTC).withMinute(0)
+ );
+ default:
+ throw new RuntimeException("Unsupported duration unit: " + duration.getUnit());
+ }
+ }
- LocalDateTime dt = LocalDateTime.now();
+ private LocalDateTime evaluateMinutes(DateExpressionOperator operator, int minutes, LocalDateTime dt) {
+ dt = dt.withSecond(0).withNano(0);
switch (operator) {
case MINUS:
- return dt.minusHours(hours);
+ return dt.minusMinutes(minutes);
case PLUS:
- return dt.plusHours(hours);
+ return dt.plusMinutes(minutes);
default:
return dt;
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java
new file mode 100644
index 0000000..0ce87ac
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/date/NullDateExpression.java
@@ -0,0 +1,26 @@
+/*
+ * NullDateExpression.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date;
+
+import java.time.LocalDateTime;
+
+/** Date expression that formats as the text "NULL" and evaluates to {@code null}. */
+public class NullDateExpression extends DateExpression {
+
+    private static final String NULL_KEYWORD = "NULL";
+
+    @Override
+    public String getFormattedExpression() {
+        return NULL_KEYWORD;
+    }
+
+    @Override
+    public LocalDateTime getEvaluatedExpression() {
+        // There is no point in time to return for this expression.
+        return null;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java
index 8629c4a..1f6f80a 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedCondition.java
@@ -11,6 +11,7 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils;
import lombok.EqualsAndHashCode;
import lombok.Getter;
@@ -40,6 +41,10 @@ public String getFormattedCondition() {
return String.format("between %s and %s",
operands.get(0).getFormattedDuration(),
operands.get(1).getFormattedDuration());
+ case NOT_BETWEEN:
+ return String.format("not between %s and %s",
+ operands.get(0).getFormattedDuration(),
+ operands.get(1).getFormattedDuration());
case GREATER_THAN:
return String.format("> %s", operands.get(0).getFormattedDuration());
case GREATER_THAN_EQUAL_TO:
@@ -50,11 +55,15 @@ public String getFormattedCondition() {
return String.format("<= %s", operands.get(0).getFormattedDuration());
case EQUALS:
return String.format("= %s", operands.get(0).getFormattedDuration());
+ case NOT_EQUALS:
+ return String.format("!= %s", operands.get(0).getFormattedDuration());
case IN: {
- List formattedOperands = operands.stream()
- .map(Duration::getFormattedDuration)
- .collect(Collectors.toList());
- return String.format("in [%s]", String.join(", ", formattedOperands));
+ List formattedOperands = getFormattedOperands();
+ return String.format("in [%s]", String.join(",", formattedOperands));
+ }
+ case NOT_IN: {
+ List formattedOperands = getFormattedOperands();
+ return String.format("not in [%s]", String.join(",", formattedOperands));
}
default:
break;
@@ -62,4 +71,26 @@ public String getFormattedCondition() {
return "";
}
+
+    /**
+     * Same as {@link #getFormattedCondition()} except that the operands of IN / NOT_IN
+     * are emitted in sorted order.
+     */
+    @Override
+    public String getSortedFormattedCondition() {
+        // A missing or blank condition text formats to the empty string.
+        if (StringUtils.isBlank(conditionAsString)) {
+            return "";
+        }
+
+        // Only the list operators are rendered with sorted operands; everything else
+        // is delegated to the unsorted formatter.
+        final String keyword;
+        switch (operator) {
+            case IN:
+                keyword = "in";
+                break;
+            case NOT_IN:
+                keyword = "not in";
+                break;
+            default:
+                return getFormattedCondition();
+        }
+        return keyword + " [" + String.join(",", getSortedFormattedOperands()) + "]";
+    }
+
+    /** Returns the formatted duration of every operand, in operand order. */
+    private List<String> getFormattedOperands() {
+        return operands.stream()
+            .map(Duration::getFormattedDuration)
+            .collect(Collectors.toList());
+    }
+
+    /**
+     * Returns the formatted duration of every operand, sorted by natural String order
+     * (the formatted text is compared, not the length of the duration).
+     */
+    private List<String> getSortedFormattedOperands() {
+        return operands.stream()
+            .map(Duration::getFormattedDuration)
+            .sorted()
+            .collect(Collectors.toList());
+    }
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java
index 099d410..966b432 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationBasedConditionOperator.java
@@ -12,10 +12,13 @@
public enum DurationBasedConditionOperator {
BETWEEN,
+ NOT_BETWEEN,
GREATER_THAN,
GREATER_THAN_EQUAL_TO,
LESS_THAN,
LESS_THAN_EQUAL_TO,
EQUALS,
- IN
+ NOT_EQUALS,
+ IN,
+ NOT_IN
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java
index f48f209..04577f9 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/duration/DurationUnit.java
@@ -11,6 +11,7 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration;
public enum DurationUnit {
+ MINUTES,
HOURS,
DAYS
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java
new file mode 100644
index 0000000..9945b3b
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/AtomicNumberOperand.java
@@ -0,0 +1,30 @@
+/*
+ * AtomicNumberOperand.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+/*
+ * Atomic number operands are decimal numbers like 1.0, 3.14 etc that can be used in number based conditions.
+ * They are used for defining static thresholds on rules.
+ */
+public class AtomicNumberOperand extends NumericOperand {
+    public AtomicNumberOperand(final String operand) {
+        super(operand);
+    }
+
+    /** Returns the operand text, wrapped in parentheses when the operand is flagged as parenthesized. */
+    @Override
+    public String toString() {
+        final String number = getOperand();
+        return isParenthesized() ? "(" + number + ")" : number;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java
new file mode 100644
index 0000000..3088131
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/BinaryExpressionOperand.java
@@ -0,0 +1,50 @@
+/*
+ * BinaryExpressionOperand.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+/*
+ * A BinaryExpressionOperand is a numerical expression that consists of two operands and an operator.
+ * The operands can themselves be binary expression operands or atomic number operands or function call operands.
+ * The operator can be one of: +, -, /, *
+ * The purpose of this operand is for combining with a dynamic function call operand to create dynamic rule thresholds.
+ */
+@Getter
+@EqualsAndHashCode(callSuper = true)
+public class BinaryExpressionOperand extends NumericOperand {
+    private final String operator;
+    private final NumericOperand operand1;
+    private final NumericOperand operand2;
+
+    /**
+     * @param operand         text of the whole expression, passed to the base class
+     * @param operator        operator placed between the two operands when formatting
+     * @param operand1        left-hand operand
+     * @param operand2        right-hand operand
+     * @param isParenthesized whether {@link #toString()} wraps the expression in parentheses
+     */
+    public BinaryExpressionOperand(final String operand,
+                                   final String operator,
+                                   final NumericOperand operand1,
+                                   final NumericOperand operand2,
+                                   final boolean isParenthesized) {
+        super(operand, isParenthesized);
+        this.operator = operator;
+        this.operand1 = operand1;
+        this.operand2 = operand2;
+    }
+
+    /** Formats as "left operator right", surrounded by parentheses when flagged as parenthesized. */
+    @Override
+    public String toString() {
+        final String expression = operand1.toString() + " " + operator + " " + operand2.toString();
+        return isParenthesized() ? "(" + expression + ")" : expression;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java
new file mode 100644
index 0000000..1cf2183
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/FunctionCallOperand.java
@@ -0,0 +1,49 @@
+/*
+ * FunctionCallOperand.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+/*
+ * A Function Call operand is a special operand that takes operands as parameters and returns a number.
+ * The parameters can themselves be function call operands, or atomic number operands or binary expression operands.
+ * Each function must be implemented by an instance of "OperandEvaluator", provided at the time of evaluation.
+ * Through the use of function call operands, we introduce the concept of dynamic rules in DQDL.
+ */
+@Getter
+@EqualsAndHashCode(callSuper = true)
+public class FunctionCallOperand extends NumericOperand {
+    private final String functionName;
+    // Typed element list: a raw List would make the stream in toString() yield Object, not String.
+    private final List<NumericOperand> operands;
+
+    /**
+     * @param operand      text of the whole call, passed to the base class
+     * @param functionName name printed before the parenthesized argument list
+     * @param operands     arguments of the call, in order
+     */
+    public FunctionCallOperand(final String operand,
+                               final String functionName,
+                               final List<NumericOperand> operands) {
+        super(operand);
+        this.functionName = functionName;
+        this.operands = operands;
+    }
+
+    /**
+     * Formats as {@code name(arg1,arg2,...)} with no space after the commas, surrounded by an
+     * extra pair of parentheses when the operand is flagged as parenthesized.
+     */
+    @Override
+    public String toString() {
+        String params = this.operands.stream().map(NumericOperand::toString).collect(Collectors.joining(","));
+        String formatted = String.format("%s(%s)", this.functionName, params);
+        if (this.isParenthesized()) {
+            return String.format("(%s)", formatted);
+        } else {
+            return formatted;
+        }
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java
new file mode 100644
index 0000000..edad45d
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NullNumericOperand.java
@@ -0,0 +1,23 @@
+/*
+ * NullNumericOperand.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+/** Numeric operand whose text is stored upper-cased (for example "null" is kept as "NULL"). */
+public class NullNumericOperand extends NumericOperand {
+
+    public NullNumericOperand(final String operand) {
+        // Upper-casing here means toString() yields the same spelling regardless of input case.
+        super(operand.toUpperCase());
+    }
+
+    /** Returns the stored (upper-cased) operand text; parentheses are never added. */
+    @Override
+    public String toString() {
+        final String text = getOperand();
+        return text;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java
index 7ec0ad9..6bd5f58 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedCondition.java
@@ -10,55 +10,118 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils;
+import static java.lang.Math.abs;
import lombok.EqualsAndHashCode;
import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;
+import java.text.DecimalFormat;
import java.util.List;
import java.util.stream.Collectors;
@Getter
@EqualsAndHashCode(callSuper = true)
+@Slf4j
public class NumberBasedCondition extends Condition {
private final NumberBasedConditionOperator operator;
- private final List operands;
+ private final List operands;
+
+ private static final DecimalFormat OP_FORMAT = new DecimalFormat("#.###");
public NumberBasedCondition(final String conditionAsString,
final NumberBasedConditionOperator operator,
- final List operands) {
+ final List operands) {
super(conditionAsString);
this.operator = operator;
this.operands = operands;
}
- public Boolean evaluate(Double metric) {
+ @Override
+ public Boolean evaluate(Double metric, DQRule dqRule, OperandEvaluator evaluator) {
if (operands == null) return false;
- List operandsAsDouble = operands.stream().map(Double::parseDouble).collect(Collectors.toList());
+ List operandsAsDouble = operands.stream()
+ .map(operand -> evaluator.evaluate(dqRule, operand)).collect(Collectors.toList());
+
+
+ log.info(String.format("Evaluating condition for rule: %s", dqRule));
+ List formatOps = operandsAsDouble.stream().map(OP_FORMAT::format).collect(Collectors.toList());
+ String formatMetric = OP_FORMAT.format(metric);
switch (operator) {
case BETWEEN:
if (operands.size() != 2) return false;
- else return metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1);
+ else {
+ boolean result = metric > operandsAsDouble.get(0) && metric < operandsAsDouble.get(1);
+ log.info("{} between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result);
+ return result;
+ }
+ case NOT_BETWEEN:
+ if (operands.size() != 2) return false;
+ else {
+ boolean result = metric <= operandsAsDouble.get(0) || metric >= operandsAsDouble.get(1);
+ log.info("{} not between {} and {}? {}", formatMetric, formatOps.get(0), formatOps.get(1), result);
+ return result;
+ }
case GREATER_THAN_EQUAL_TO:
if (operands.size() != 1) return false;
- else return metric >= operandsAsDouble.get(0);
+ else {
+ boolean result = metric >= operandsAsDouble.get(0);
+ log.info("{} >= {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
case GREATER_THAN:
if (operands.size() != 1) return false;
- else return metric > operandsAsDouble.get(0);
+ else {
+ boolean result = metric > operandsAsDouble.get(0);
+ log.info("{} > {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
case LESS_THAN_EQUAL_TO:
if (operands.size() != 1) return false;
- else return metric <= operandsAsDouble.get(0);
+ else {
+ boolean result = metric <= operandsAsDouble.get(0);
+ log.info("{} <= {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
case LESS_THAN:
if (operands.size() != 1) return false;
- else return metric < operandsAsDouble.get(0);
+ else {
+ boolean result = metric < operandsAsDouble.get(0);
+ log.info("{} < {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
case EQUALS:
if (operands.size() != 1) return false;
- else return metric.equals(operandsAsDouble.get(0));
- case IN:
- return operandsAsDouble.contains(metric);
+ else {
+ boolean result = isOperandEqualToMetric(metric, operandsAsDouble.get(0));
+ log.info("{} == {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
+ case NOT_EQUALS:
+ if (operands.size() != 1) return false;
+ else {
+ boolean result = !isOperandEqualToMetric(metric, operandsAsDouble.get(0));
+ log.info("{} != {}? {}", formatMetric, formatOps.get(0), result);
+ return result;
+ }
+ case IN: {
+ boolean result = operandsAsDouble.stream().anyMatch(operand ->
+ isOperandEqualToMetric(metric, operand));
+ log.info("{} in {}? {}", formatMetric, formatOps, result);
+ return result;
+ }
+ case NOT_IN: {
+ boolean result = !operandsAsDouble.stream().anyMatch(operand ->
+ isOperandEqualToMetric(metric, operand));
+ log.info("{} not in {}? {}", formatMetric, formatOps, result);
+ return result;
+ }
default:
+ log.error("Unknown operator");
return false;
}
}
@@ -69,23 +132,70 @@ public String getFormattedCondition() {
switch (operator) {
case BETWEEN:
- return String.format("between %s and %s", operands.get(0), operands.get(1));
+ return String.format("between %s and %s", operands.get(0).toString(), operands.get(1).toString());
+ case NOT_BETWEEN:
+ return String.format("not between %s and %s", operands.get(0).toString(), operands.get(1).toString());
case GREATER_THAN:
- return String.format("> %s", operands.get(0));
+ return String.format("> %s", operands.get(0).toString());
case GREATER_THAN_EQUAL_TO:
- return String.format(">= %s", operands.get(0));
+ return String.format(">= %s", operands.get(0).toString());
case LESS_THAN:
- return String.format("< %s", operands.get(0));
+ return String.format("< %s", operands.get(0).toString());
case LESS_THAN_EQUAL_TO:
- return String.format("<= %s", operands.get(0));
+ return String.format("<= %s", operands.get(0).toString());
case EQUALS:
- return String.format("= %s", operands.get(0));
+ return String.format("= %s", operands.get(0).toString());
+ case NOT_EQUALS:
+ return String.format("!= %s", operands.get(0).toString());
case IN:
- return String.format("in [%s]", String.join(",", operands));
+ return String.format("in [%s]", getFormattedOperands());
+ case NOT_IN:
+ return String.format("not in [%s]", getFormattedOperands());
default:
break;
}
return "";
}
+
+    /**
+     * Same as {@link #getFormattedCondition()} except that the operands of IN / NOT_IN
+     * are emitted in sorted order.
+     */
+    @Override
+    public String getSortedFormattedCondition() {
+        // A missing or blank condition text formats to the empty string.
+        if (StringUtils.isBlank(conditionAsString)) {
+            return "";
+        }
+
+        // Only the list operators are rendered with sorted operands; everything else
+        // is delegated to the unsorted formatter.
+        final String keyword;
+        switch (operator) {
+            case IN:
+                keyword = "in";
+                break;
+            case NOT_IN:
+                keyword = "not in";
+                break;
+            default:
+                return getFormattedCondition();
+        }
+        return keyword + " [" + getSortedFormattedOperands() + "]";
+    }
+
+    /** Joins the string form of every operand with "," (no space), in operand order. */
+    private String getFormattedOperands() {
+        final List<String> rendered = operands.stream()
+            .map(NumericOperand::toString)
+            .collect(Collectors.toList());
+        return String.join(",", rendered);
+    }
+
+    /**
+     * Joins the string form of every operand with "," after sorting them.
+     *
+     * Sort order: operands that parse as a number come first in ascending numeric order, then
+     * operands that do not parse as a number (for example function calls, binary expressions
+     * or parenthesized values) in natural String order, then NULL operands last.
+     */
+    private String getSortedFormattedOperands() {
+        return operands.stream()
+            .map(NumericOperand::toString)
+            .sorted(NumberBasedCondition::compareFormattedOperands)
+            .collect(Collectors.joining(","));
+    }
+
+    /** Orders two formatted operands by sort group first, then by value within the group. */
+    private static int compareFormattedOperands(String s1, String s2) {
+        int rank1 = sortRank(s1);
+        int rank2 = sortRank(s2);
+        if (rank1 != rank2) {
+            return Integer.compare(rank1, rank2);
+        }
+        if (rank1 == 0) {
+            return Double.compare(Double.parseDouble(s1), Double.parseDouble(s2));
+        }
+        if (rank1 == 1) {
+            return s1.compareTo(s2);
+        }
+        return 0; // Treat both NULLs as equal
+    }
+
+    /** Sort group of a formatted operand: 0 = parses as a number, 1 = any other text, 2 = NULL. */
+    private static int sortRank(String formattedOperand) {
+        if (formattedOperand.equalsIgnoreCase("NULL")) {
+            return 2; // Treat NULL as greater than any other value
+        }
+        try {
+            Double.parseDouble(formattedOperand);
+            return 0;
+        } catch (NumberFormatException e) {
+            // Not a plain number: compared as text instead of failing the whole sort.
+            return 1;
+        }
+    }
+
+    /** Returns true when metric and operand differ by at most 0.00001 in absolute value. */
+    protected boolean isOperandEqualToMetric(Double metric, Double operand) {
+        final double difference = metric - operand;
+        return abs(difference) <= 0.00001;
+    }
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java
index 828e5f9..cd109d3 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumberBasedConditionOperator.java
@@ -12,10 +12,13 @@
public enum NumberBasedConditionOperator {
BETWEEN,
+ NOT_BETWEEN, // NOTE(review): presumably the negated form of BETWEEN - its formatting is not visible in this hunk
GREATER_THAN,
GREATER_THAN_EQUAL_TO,
LESS_THAN,
LESS_THAN_EQUAL_TO,
EQUALS,
- IN
+ NOT_EQUALS, // the "!=" comparison
+ IN, // membership in a bracketed operand list
+ NOT_IN // the "not in" counterpart of IN
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java
new file mode 100644
index 0000000..9623996
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/NumericOperand.java
@@ -0,0 +1,30 @@
+/*
+ * NumericOperand.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+import lombok.AllArgsConstructor;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.io.Serializable;
+
+@AllArgsConstructor
+@Getter
+@EqualsAndHashCode
+public abstract class NumericOperand implements Serializable { // abstract base for operands; carries the operand text and a parenthesized flag
+ private final String operand; // operand text exactly as passed to the constructor
+ private final boolean isParenthesized; // NOTE(review): presumably true when the operand was written inside parentheses - confirm against the parser
+
+ public NumericOperand(final String operand) { // convenience constructor: same as the all-args one with isParenthesized = false
+ this.operand = operand;
+ isParenthesized = false;
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java
new file mode 100644
index 0000000..be2449a
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/number/OperandEvaluator.java
@@ -0,0 +1,24 @@
+/*
+ * OperandEvaluator.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number;
+
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule;
+
+import java.io.Serializable;
+
+/**
+ * Base class for strategies that resolve a {@link NumericOperand} to a numeric value (a Double).
+ */
+public abstract class OperandEvaluator implements Serializable {
+
+ // Resolves the given operand to a Double; the owning rule is passed in for implementations that need it.
+ public abstract Double evaluate(DQRule rule, NumericOperand operand);
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java
new file mode 100644
index 0000000..7b6e65b
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/Size.java
@@ -0,0 +1,54 @@
+/*
+ * Size.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.io.Serializable;
+
+@Getter
+@EqualsAndHashCode
+public class Size implements Serializable, Comparable { // a data size given as amount + unit, with its byte count computed once at construction
+ private final Integer amount; // magnitude expressed in `unit`
+ private final SizeUnit unit;
+ private final Long bytes; // amount converted to bytes using 1024-based multipliers
+
+ public Size(final Integer amount, final SizeUnit unit) {
+ this.amount = amount;
+ this.unit = unit;
+ this.bytes = convertBytes(amount, unit); // unboxes amount, so a null amount throws NullPointerException
+ }
+
+ public String getFormattedSize() { // "amount UNIT", e.g. "5 GB"
+ return String.format("%s %s", amount, unit.name().toUpperCase());
+ }
+
+ private Long convertBytes(Integer bytes, SizeUnit unit) { // the `bytes` parameter is the amount in `unit`, not yet a byte count
+ switch (unit) {
+ case KB:
+ return bytes * 1024L;
+ case MB:
+ return bytes * 1024L * 1024L;
+ case GB:
+ return bytes * 1024L * 1024L * 1024L;
+ case TB:
+ return bytes * 1024L * 1024L * 1024L * 1024L; // NOTE(review): long arithmetic wraps silently once amount reaches 8,388,608 TB (2^23 * 2^40 = 2^63)
+ default: // B, and any unit without a case above: the amount is already a byte count
+ return Long.valueOf(bytes);
+ }
+ }
+
+ @Override
+ public int compareTo(Size other) { // orders by byte count, so sizes in different units compare by magnitude
+ return Long.compare(this.getBytes(), other.getBytes()); // NOTE(review): Lombok equals also compares amount and unit, so 1 KB and 1024 B compare as 0 here but are not equal
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java
new file mode 100644
index 0000000..7a2a34e
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedCondition.java
@@ -0,0 +1,96 @@
+/*
+ * SizeBasedCondition.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size;
+
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+@Getter
+@EqualsAndHashCode(callSuper = true)
+public class SizeBasedCondition extends Condition { // a condition that compares against one or more Size operands
+ private final SizeBasedConditionOperator operator;
+ private final List operands; // Size operands; how many are read depends on the operator (see getFormattedCondition)
+
+ public SizeBasedCondition(final String conditionAsString,
+ final SizeBasedConditionOperator operator,
+ final List operands) {
+ super(conditionAsString);
+ this.operator = operator;
+ this.operands = operands; // stored by reference, not copied
+ }
+
+ @Override
+ public String getFormattedCondition() { // renders the operator and its operands as text; returns "" when there is nothing to render
+ if (this.operands.isEmpty()) return ""; // NOTE(review): guards on empty operands, while getSortedFormattedCondition guards on a blank conditionAsString - confirm the difference is intended
+
+ switch (operator) {
+ case BETWEEN:
+ return String.format("between %s and %s", // reads operands 0 and 1; a single-operand list throws IndexOutOfBoundsException
+ operands.get(0).getFormattedSize(),
+ operands.get(1).getFormattedSize());
+ case NOT_BETWEEN:
+ return String.format("not between %s and %s", // same two-operand requirement as BETWEEN
+ operands.get(0).getFormattedSize(),
+ operands.get(1).getFormattedSize());
+ case GREATER_THAN:
+ return String.format("> %s", operands.get(0).getFormattedSize()); // comparison operators below use only the first operand
+ case GREATER_THAN_EQUAL_TO:
+ return String.format(">= %s", operands.get(0).getFormattedSize());
+ case LESS_THAN:
+ return String.format("< %s", operands.get(0).getFormattedSize());
+ case LESS_THAN_EQUAL_TO:
+ return String.format("<= %s", operands.get(0).getFormattedSize());
+ case EQUALS:
+ return String.format("= %s", operands.get(0).getFormattedSize());
+ case NOT_EQUALS:
+ return String.format("!= %s", operands.get(0).getFormattedSize());
+ case IN: {
+ List formattedOperands = getFormattedOperands(); // all operands, in their stored order
+ return String.format("in [%s]", String.join(",", formattedOperands));
+ }
+ case NOT_IN: {
+ List formattedOperands = getFormattedOperands(); // all operands, in their stored order
+ return String.format("not in [%s]", String.join(",", formattedOperands));
+ }
+ default: // an operator without a case above falls through to the empty string
+ break;
+ }
+
+ return "";
+ }
+
+ @Override
+ public String getSortedFormattedCondition() { // same as getFormattedCondition(), except IN / NOT_IN list their operands sorted
+ if (StringUtils.isBlank(conditionAsString)) return "";
+
+ switch (operator) {
+ case IN:
+ return String.format("in [%s]", String.join(",", getSortedFormattedOperands()));
+ case NOT_IN:
+ return String.format("not in [%s]", String.join(",", getSortedFormattedOperands()));
+ default: // operand order is kept for every other operator
+ return getFormattedCondition();
+ }
+ }
+
+ private List getFormattedOperands() { // each operand's getFormattedSize() text, in stored order
+ return operands.stream().map(Size::getFormattedSize).collect(Collectors.toList());
+ }
+
+ private List getSortedFormattedOperands() { // sorts the formatted text in String natural order ("10 GB" before "2 KB"), not by Size.compareTo
+ return operands.stream().map(Size::getFormattedSize).sorted().collect(Collectors.toList());
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java
new file mode 100644
index 0000000..d4c9b72
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeBasedConditionOperator.java
@@ -0,0 +1,24 @@
+/*
+ * SizeBasedConditionOperator.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size;
+
+public enum SizeBasedConditionOperator { // operators accepted by SizeBasedCondition; the text noted per constant is what getFormattedCondition emits
+ BETWEEN, // "between A and B"
+ NOT_BETWEEN, // "not between A and B"
+ GREATER_THAN, // "> A"
+ GREATER_THAN_EQUAL_TO, // ">= A"
+ LESS_THAN, // "< A"
+ LESS_THAN_EQUAL_TO, // "<= A"
+ EQUALS, // "= A"
+ NOT_EQUALS, // "!= A"
+ IN, // "in [A,B,...]"
+ NOT_IN // "not in [A,B,...]"
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java
new file mode 100644
index 0000000..076b657
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/size/SizeUnit.java
@@ -0,0 +1,19 @@
+/*
+ * SizeUnit.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size;
+
+public enum SizeUnit { // units accepted by Size; multipliers noted below are the ones Size.convertBytes applies
+ B, // bytes (no multiplier)
+ KB, // 1024 B
+ MB, // 1024 * 1024 B
+ GB, // 1024^3 B
+ TB // 1024^4 B
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java
new file mode 100644
index 0000000..1969c8e
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Keyword.java
@@ -0,0 +1,17 @@
+/*
+ * Keyword.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
+
+public enum Keyword { // keyword operands; the names match the NULL / EMPTY / WHITESPACES_ONLY lexer tokens in CommonLexerRules.g4
+ NULL, // lexer token accepts 'null' or 'NULL'
+ EMPTY, // lexer token accepts 'empty' or 'EMPTY'
+ WHITESPACES_ONLY // lexer token accepts 'whitespaces_only' or 'WHITESPACES_ONLY'
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java
new file mode 100644
index 0000000..23431ba
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/KeywordStringOperand.java
@@ -0,0 +1,28 @@
+/*
+ * KeywordStringOperand.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
+
+import lombok.EqualsAndHashCode;
+
+@EqualsAndHashCode(callSuper = true)
+public class KeywordStringOperand extends StringOperand { // a string operand that is one of the Keyword constants rather than quoted text
+ final Keyword operand; // typed keyword; the superclass separately stores operand.toString() in its own String field of the same name
+
+ public KeywordStringOperand(final Keyword operand) {
+ super(operand.toString()); // a null keyword throws NullPointerException here
+ this.operand = operand;
+ }
+
+ @Override
+ public String formatOperand() { // the keyword text without quotes, e.g. NULL
+ return getOperand().toString(); // getOperand() is the inherited String getter (this class declares no getter of its own)
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java
new file mode 100644
index 0000000..dcf74f3
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/QuotedStringOperand.java
@@ -0,0 +1,22 @@
+/*
+ * QuotedStringOperand.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
+
+public class QuotedStringOperand extends StringOperand { // a string operand that is rendered inside double quotes
+ public QuotedStringOperand(final String operand) {
+ super(operand);
+ }
+
+ @Override
+ public String formatOperand() { // wraps the stored text in double quotes; embedded quotes are not escaped
+ return "\"" + getOperand() + "\"";
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java
index 2bc54d0..b76c91c 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedCondition.java
@@ -11,6 +11,7 @@
package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand;
import com.amazonaws.glue.ml.dataquality.dqdl.util.StringUtils;
import lombok.EqualsAndHashCode;
import lombok.Getter;
@@ -22,31 +23,45 @@
@EqualsAndHashCode(callSuper = true)
public class StringBasedCondition extends Condition {
private final StringBasedConditionOperator operator;
- private final List operands;
+ private final List operands;
+ private final List unresolvedOperands;
public StringBasedCondition(final String conditionAsString,
final StringBasedConditionOperator operator,
- final List operands) {
+ final List operands) {
+ this(conditionAsString, operator, operands, null);
+ }
+
+ public StringBasedCondition(final String conditionAsString,
+ final StringBasedConditionOperator operator,
+ final List operands,
+ final List unresolvedOperands) {
super(conditionAsString);
this.operator = operator;
this.operands = operands;
+ this.unresolvedOperands = unresolvedOperands;
}
+
@Override
public String getFormattedCondition() {
if (StringUtils.isBlank(conditionAsString)) return "";
+ List effectiveOperands = getEffectiveOperands();
+
switch (operator) {
case MATCHES:
- return String.format("matches %s", formatOperand(operands.get(0)));
+ return String.format("matches %s", effectiveOperands.get(0).formatOperand());
+ case NOT_MATCHES:
+ return String.format("not matches %s", effectiveOperands.get(0).formatOperand());
case EQUALS:
- return String.format("= %s", formatOperand(operands.get(0)));
- case IN: {
- List formattedOperands = operands.stream()
- .map(this::formatOperand)
- .collect(Collectors.toList());
- return String.format("in [%s]", String.join(",", formattedOperands));
- }
+ return String.format("= %s", effectiveOperands.get(0).formatOperand());
+ case NOT_EQUALS:
+ return String.format("!= %s", effectiveOperands.get(0).formatOperand());
+ case IN:
+ return formatInCondition(false, false);
+ case NOT_IN:
+ return formatInCondition(true, false);
default:
break;
}
@@ -54,7 +69,47 @@ public String getFormattedCondition() {
return "";
}
- private String formatOperand(String operand) {
- return "\"" + operand + "\"";
+ @Override
+ public String getSortedFormattedCondition() { // same text as getFormattedCondition(), except IN / NOT_IN list their operands sorted
+ if (StringUtils.isBlank(conditionAsString)) return ""; // a blank condition string formats to the empty string
+
+ switch (operator) {
+ case IN:
+ return formatInCondition(false, true); // (isNot = false, sorted = true)
+ case NOT_IN:
+ return formatInCondition(true, true); // (isNot = true, sorted = true)
+ default: // operand order is kept for every other operator
+ return getFormattedCondition();
+ }
+ }
+
+ private String formatInCondition(boolean isNot, boolean sorted) { // renders "in ..." or "not in ..." over the effective operands
+ List effectiveOperands = getEffectiveOperands();
+ List formattedOperands = sorted
+ ? getSortedFormattedOperands(effectiveOperands) : getFormattedOperands(effectiveOperands);
+ String operandStr;
+ if (formattedOperands.size() == 1 && effectiveOperands.get(0) instanceof VariableReferenceOperand) { // a lone variable reference is emitted bare, without brackets
+ operandStr = formattedOperands.get(0);
+ } else { // everything else, including an empty list, becomes a bracketed comma-separated list
+ operandStr = "[" + String.join(",", formattedOperands) + "]";
+ }
+ return String.format("%sin %s", isNot ? "not " : "", operandStr);
+ }
+
+ private List getFormattedOperands(List operands) { // formatOperand() of each operand, in the given order
+ return operands.stream()
+ .map(StringOperand::formatOperand)
+ .collect(Collectors.toList());
+ }
+
+ private List getSortedFormattedOperands(List operands) { // formatOperand() of each operand, sorted in String natural order
+ return operands.stream()
+ .map(StringOperand::formatOperand)
+ .sorted() // sorts the already-formatted text, so quotes and the "$" prefix take part in the ordering
+ .collect(Collectors.toList());
+ }
+
+ private List getEffectiveOperands() { // the operands used for formatting: unresolvedOperands when non-null, otherwise operands
+ return unresolvedOperands != null ? unresolvedOperands : operands; // the three-argument constructor passes null, which selects operands
}
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java
index f3bd814..afed9f0 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringBasedConditionOperator.java
@@ -12,6 +12,9 @@
public enum StringBasedConditionOperator {
EQUALS,
+ NOT_EQUALS, // rendered as "!= X" by StringBasedCondition.getFormattedCondition
IN,
- MATCHES
+ NOT_IN, // rendered as "not in ..." by StringBasedCondition
+ MATCHES,
+ NOT_MATCHES // rendered as "not matches X" by StringBasedCondition.getFormattedCondition
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java
new file mode 100644
index 0000000..e121197
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/StringOperand.java
@@ -0,0 +1,28 @@
+/*
+ * StringOperand.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.io.Serializable;
+
+@EqualsAndHashCode
+@Getter
+public abstract class StringOperand implements Serializable { // abstract base for string-condition operands; subclasses decide how the text is rendered
+ private final String operand; // raw operand text; exposed through the Lombok-generated getOperand()
+
+ public StringOperand(final String operand) {
+ this.operand = operand;
+ }
+
+ public abstract String formatOperand(); // the operand as it should appear in a formatted condition
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java
new file mode 100644
index 0000000..5b03058
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/string/Tag.java
@@ -0,0 +1,46 @@
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string;
+
+import lombok.AllArgsConstructor;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+@AllArgsConstructor
+public class Tag implements Serializable { // key/value tag; both parts are stored exactly as given (possibly still quoted)
+    private final String key;
+    private final String value;
+
+    public static Map convertToStringMap(Map tags) { // null-safe: a null map yields an empty map
+        if (tags == null) {
+            return Collections.emptyMap();
+        }
+        return tags.entrySet().stream()
+            .collect(Collectors.toMap( // toMap throws IllegalStateException if two tags share the same unquoted key
+                entry -> entry.getValue().getKey(), // keyed by each tag's own unquoted key, not by the input map's key
+                entry -> entry.getValue().getValue()
+            ));
+    }
+
+    public String getKey() { // the key with one pair of surrounding double quotes removed, if present
+        return removeQuotes(this.key);
+    }
+
+    public String getValue() { // the value with one pair of surrounding double quotes removed, if present
+        return removeQuotes(this.value);
+    }
+
+    @Override
+    public String toString() { // uses the raw (still quoted) key and value, with a leading space
+        return String.format(" with %s = %s", key, value);
+    }
+
+    private String removeQuotes(String quotedString) {
+        // A lone '"' both starts and ends with a quote but has no pair to strip; require length >= 2.
+        if (quotedString.length() >= 2 && quotedString.startsWith("\"") && quotedString.endsWith("\"")) {
+            return quotedString.substring(1, quotedString.length() - 1);
+        }
+        return quotedString;
+    }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java
new file mode 100644
index 0000000..0bc31e8
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/model/condition/variable/VariableReferenceOperand.java
@@ -0,0 +1,31 @@
+/*
+ * VariableReferenceOperand.java
+ *
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable;
+
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+@Getter
+@EqualsAndHashCode(callSuper = true)
+public class VariableReferenceOperand extends StringOperand { // a string operand that refers to a variable by name instead of holding literal text
+ private final String variableName; // same value that is passed to the superclass as its operand text
+
+ public VariableReferenceOperand(String variableName) {
+ super(variableName);
+ this.variableName = variableName;
+ }
+
+ @Override
+ public String formatOperand() { // renders the reference as "$" followed by the variable name
+ return "$" + variableName;
+ }
+}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java
index d84bf8f..7ac2dc3 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParser.java
@@ -16,6 +16,7 @@
import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser;
import com.amazonaws.glue.ml.dataquality.dqdl.util.Either;
+import lombok.extern.slf4j.Slf4j;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
@@ -24,10 +25,12 @@
import java.util.List;
+@Slf4j
public class DQDLParser {
private static final String PARSING_ERROR_MESSAGE_PREFIX = "Parsing Error";
public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException {
+
CharStream input = CharStreams.fromString(dqdl);
DQDLErrorListener errorListener = new DQDLErrorListener();
@@ -41,15 +44,18 @@ public DQRuleset parse(String dqdl) throws InvalidDataQualityRulesetException {
parser.addErrorListener(errorListener);
DQDLParserListener listener = new DQDLParserListener(errorListener);
- ParseTreeWalker.DEFAULT.walk(listener, parser.document());
-
+ try {
+ ParseTreeWalker.DEFAULT.walk(listener, parser.document());
+ } catch (StringIndexOutOfBoundsException e) {
+ log.error(e.getMessage(), e);
+ throw new InvalidDataQualityRulesetException("Invalid DQDL.");
+ }
Either, DQRuleset> dqRulesetEither = listener.getParsedRuleset();
-
if (dqRulesetEither.isLeft()) {
throw new InvalidDataQualityRulesetException(generateExceptionMessage(dqRulesetEither.getLeft()));
- } else {
- return dqRulesetEither.getRight();
}
+ return dqRulesetEither.getRight();
+
}
private String generateExceptionMessage(List errorMessages) {
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java
index 657df11..3fa355d 100644
--- a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLParserListener.java
@@ -10,38 +10,71 @@
package com.amazonaws.glue.ml.dataquality.dqdl.parser;
+import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener;
+import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQAnalyzer;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule;
import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleLogicalOperator;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleParameterValue;
import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleType;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedConditionOperator;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.NullDateExpression;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.Duration;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationBasedConditionOperator;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.duration.DurationUnit;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.AtomicNumberOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.BinaryExpressionOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.FunctionCallOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NullNumericOperand;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedConditionOperator;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedConditionOperator;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeUnit;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Keyword;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.KeywordStringOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedConditionOperator;
-import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRule;
-import com.amazonaws.glue.ml.dataquality.dqdl.model.DQRuleset;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand;
import com.amazonaws.glue.ml.dataquality.dqdl.util.Either;
-import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageBaseListener;
-import com.amazonaws.glue.ml.dataquality.dqdl.DataQualityDefinitionLanguageParser;
-import org.antlr.v4.runtime.RuleContext;
-
+import org.antlr.v4.runtime.ParserRuleContext;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.LocalTime;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.time.zone.ZoneRulesException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
+import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.Tag.convertToStringMap;
+
public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListener {
private final DQDLErrorListener errorListener;
private final List errorMessages = new ArrayList<>();
@@ -50,6 +83,8 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene
private String primarySource;
private List additionalSources;
private final List dqRules = new ArrayList<>();
+ private final List dqAnalyzers = new ArrayList<>();
+ private final Map dqVariables = new HashMap<>();
private static final String METADATA_VERSION_KEY = "Version";
private static final Set ALLOWED_METADATA_KEYS;
@@ -57,6 +92,12 @@ public class DQDLParserListener extends DataQualityDefinitionLanguageBaseListene
private static final String PRIMARY_SOURCE_KEY = "Primary";
private static final String ADDITIONAL_SOURCES_KEY = "AdditionalDataSources";
private static final Set ALLOWED_SOURCES_KEYS;
+ private static final String THRESHOLD_KEY = "threshold";
+
+ private static final String MILITARY_TIME_FORMAT = "HH:mm";
+ private static final String AMPM_TIME_FORMAT = "h:mm a";
+
+ private static final int COMPOSITE_RULE_MAX_NESTING_DEPTH = 5;
static {
ALLOWED_METADATA_KEYS = new HashSet<>();
@@ -72,8 +113,13 @@ public DQDLParserListener(DQDLErrorListener errorListener) {
}
public Either, DQRuleset> getParsedRuleset() {
+ // Only add this error message if we did not walk the tree due to empty rules or analyzers sections.
+ if (errorMessages.isEmpty() && dqRules.isEmpty() && dqAnalyzers.isEmpty()) {
+ errorMessages.add("No rules or analyzers provided.");
+ }
+
if (errorMessages.isEmpty() && errorListener.getErrorMessages().isEmpty()) {
- return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules));
+ return Either.fromRight(new DQRuleset(metadata, primarySource, additionalSources, dqRules, dqAnalyzers));
} else {
List allErrorMessages = new ArrayList<>();
allErrorMessages.addAll(errorMessages);
@@ -85,8 +131,18 @@ public Either, DQRuleset> getParsedRuleset() {
@Override
public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ctx) {
- for (DataQualityDefinitionLanguageParser.PairContext pairContext
- : ctx.dictionary().pair()) {
+ // The logic below, just above the loop is a guard against an NPE caused by empty dictionaries.
+ // Need to investigate why dictionaryContext.pair() returns 1 element,
+ // which is an empty string, for an empty dictionary.
+ // We would not have this problem if dictionaryContext.pair() returned 0 entries in the list.
+ DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary();
+ List dictionaryErrors = validateDictionary(dictionaryContext);
+ if (!dictionaryErrors.isEmpty()) {
+ errorMessages.addAll(dictionaryErrors);
+ return;
+ }
+
+ for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) {
String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText()));
if (!ALLOWED_METADATA_KEYS.contains(key)) {
errorMessages.add("Unsupported key provided in Metadata section");
@@ -99,16 +155,15 @@ public void enterMetadata(DataQualityDefinitionLanguageParser.MetadataContext ct
}
@Override
- public void enterRules(DataQualityDefinitionLanguageParser.RulesContext ctx) {
- if (ctx.dqRules() == null) {
- errorMessages.add("No rules provided.");
+ public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) {
+ DataQualityDefinitionLanguageParser.DictionaryContext dictionaryContext = ctx.dictionary();
+ List dictionaryErrors = validateDictionary(dictionaryContext);
+ if (!dictionaryErrors.isEmpty()) {
+ errorMessages.addAll(dictionaryErrors);
+ return;
}
- }
- @Override
- public void enterDataSources(DataQualityDefinitionLanguageParser.DataSourcesContext ctx) {
- for (DataQualityDefinitionLanguageParser.PairContext pairContext
- : ctx.dictionary().pair()) {
+ for (DataQualityDefinitionLanguageParser.PairContext pairContext: dictionaryContext.pair()) {
String key = removeEscapes(removeQuotes(pairContext.QUOTED_STRING().getText()));
if (!ALLOWED_SOURCES_KEYS.contains(key)) {
@@ -144,44 +199,121 @@ public void enterDqRules(DataQualityDefinitionLanguageParser.DqRulesContext dqRu
return;
}
- for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc
- : dqRulesContext.topLevelRule()) {
- if (tlc.AND().size() > 0 || tlc.OR().size() > 0) {
- DQRuleLogicalOperator op = tlc.AND().size() > 0 ? DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR;
- List nestedRules = new ArrayList<>();
+ for (DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc: dqRulesContext.topLevelRule()) {
+ Either dqRuleEither = parseTopLevelRule(tlc, 0);
+ if (dqRuleEither.isLeft()) {
+ errorMessages.add(dqRuleEither.getLeft());
+ return;
+ } else {
+ dqRules.add(dqRuleEither.getRight());
+ }
+ }
+ }
- for (DataQualityDefinitionLanguageParser.DqRuleContext rc : tlc.dqRule()) {
- Either dqRuleEither = getDQRule(rc);
- if (dqRuleEither.isLeft()) {
- errorMessages.add(dqRuleEither.getLeft());
- return;
- } else {
- nestedRules.add(dqRuleEither.getRight());
- }
- }
+ /**
+ * Recursively converts a parsed top-level rule into a DQRule.
+ * Returns the rule on the right side of the Either, or an error message on the left.
+ * Only the AND/OR branch passes depth + 1 to nested rules; the parenthesized branch passes depth unchanged.
+ */
+ private Either parseTopLevelRule(DataQualityDefinitionLanguageParser.TopLevelRuleContext tlc,
+ int depth) {
+ if (tlc.LPAREN() != null && tlc.RPAREN() != null) {
+ // Parenthesized rule: unwrap the inner rule at the same depth.
+ return parseTopLevelRule(tlc.topLevelRule(0), depth);
+ } else if (tlc.AND() != null || tlc.OR() != null) {
+ // Composite rule: AND wins if present, otherwise OR; each operand is parsed one level deeper.
+ DQRuleLogicalOperator op = tlc.AND() != null ? DQRuleLogicalOperator.AND : DQRuleLogicalOperator.OR;
+ List> nestedRuleEitherList =
+ tlc.topLevelRule().stream().map(r -> parseTopLevelRule(r, depth + 1)).collect(Collectors.toList());
- dqRules.add(new DQRule("Composite", null, null, null, op, nestedRules));
- } else if (tlc.dqRule(0) != null) {
- Either dqRuleEither = getDQRule(tlc.dqRule(0));
- if (dqRuleEither.isLeft()) {
- errorMessages.add(dqRuleEither.getLeft());
- return;
+ List allErrorMessages = new ArrayList<>();
+ List allRules = new ArrayList<>();
+
+ // Split the nested results into error messages and successfully parsed rules.
+ nestedRuleEitherList.forEach(arg -> {
+ if (arg.isLeft()) {
+ allErrorMessages.add(arg.getLeft());
} else {
- dqRules.add(dqRuleEither.getRight());
+ allRules.add(arg.getRight());
}
+ });
+
+ if (allErrorMessages.isEmpty()) {
+ return Either.fromRight(
+ new DQRule("Composite", null, null, null, op, allRules)
+ );
} else {
- errorMessages.add("No valid rule found");
+ // Only the first nested error is reported to the caller.
+ return Either.fromLeft(allErrorMessages.get(0));
+ }
+ } else if (tlc.dqRule() != null) {
+ // Leaf rule: the nesting limit is checked here, once the full depth is known.
+ if (depth > COMPOSITE_RULE_MAX_NESTING_DEPTH) {
+ return Either.fromLeft(
+ String.format("Maximum nested expression depth of %s reached for composite rule",
+ COMPOSITE_RULE_MAX_NESTING_DEPTH));
+ } else {
+ return getDQRule(tlc.dqRule());
+ }
+ } else {
+ return Either.fromLeft("No valid rule found");
+ }
+ }
+
+ /**
+ * Converts every dqAnalyzer in the given context and appends it to dqAnalyzers.
+ * Does nothing when errors have already been recorded, and stops at the first analyzer that fails,
+ * recording its error message.
+ */
+ @Override
+ public void enterDqAnalyzers(DataQualityDefinitionLanguageParser.DqAnalyzersContext dqAnalyzersContext) {
+ if (!errorMessages.isEmpty()) {
+ return;
+ }
+
+ for (DataQualityDefinitionLanguageParser.DqAnalyzerContext dac: dqAnalyzersContext.dqAnalyzer()) {
+ Either dqAnalyzerEither = getDQAnalyzer(dac);
+ if (dqAnalyzerEither.isLeft()) {
+ errorMessages.add(dqAnalyzerEither.getLeft());
return;
+ } else {
+ dqAnalyzers.add(dqAnalyzerEither.getRight());
}
}
}
+ /**
+ * Registers a declared variable in dqVariables.
+ * Does nothing when errors have already been recorded. A name that starts with '.' or '_', a name that is
+ * already defined, or a declaration without an expression is reported through errorMessages.
+ * Only string and string-array expressions are converted here; any other expression is reported as a
+ * parse failure.
+ */
+ @Override
+ public void enterVariableDeclaration(DataQualityDefinitionLanguageParser.VariableDeclarationContext ctx) {
+ if (!errorMessages.isEmpty()) {
+ return;
+ }
+
+ String variableName = ctx.IDENTIFIER().getText();
+
+ if (variableName.startsWith(".") || variableName.startsWith("_")) {
+ errorMessages.add(String.format("Variable name '%s' cannot start with '.' or '_'", variableName));
+ return;
+ }
+
+ if (dqVariables.containsKey(variableName)) {
+ errorMessages.add("Variable '" + variableName + "' is already defined");
+ return;
+ }
+
+ DQVariable variable = null;
+ DataQualityDefinitionLanguageParser.ExpressionContext expr = ctx.expression();
+ if (expr == null) {
+ errorMessages.add(String.format("Missing value for variable '%s'", variableName));
+ return;
+ }
+
+ // The array form is checked first, then the single-value form.
+ if (expr.stringValuesArray() != null) {
+ List values = expr.stringValuesArray().stringValues().stream()
+ .map(this::processStringValues)
+ .collect(Collectors.toList());
+ variable = new DQVariable(variableName, DQVariable.VariableType.STRING_ARRAY, values);
+ } else if (expr.stringValues() != null) {
+ String value = processStringValues(expr.stringValues());
+ variable = new DQVariable(variableName, DQVariable.VariableType.STRING, value);
+ }
+
+ // variable stays null when the expression matched neither supported form.
+ if (variable != null) {
+ dqVariables.put(variableName, variable);
+ } else {
+ errorMessages.add(String.format("Failed to parse variable '%s'", variableName));
+ }
+ }
+
private Either getDQRule(
DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) {
String ruleType = dqRuleContext.ruleType().getText();
- List parameters = dqRuleContext.parameter().stream()
- .map(p -> p.getText().replaceAll("\"", ""))
- .collect(Collectors.toList());
+
+ List parameters = parseParameters(dqRuleContext.parameterWithConnectorWord());
Optional optionalDQRuleType = DQRuleType.getRuleType(ruleType, parameters.size());
@@ -191,19 +323,79 @@ private Either getDQRule(
DQRuleType dqRuleType = optionalDQRuleType.get();
+ if (dqRuleType.isAnalyzerOnly()) {
+ return Either.fromLeft(String.format("Analyzer Type: %s is not supported in rules section", ruleType));
+ }
+
Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters);
if (errorMessage.isPresent()) {
return Either.fromLeft(String.format(errorMessage.get() + ": %s", ruleType));
}
- Map parameterMap = dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters);
+ LinkedHashMap parameterMap =
+ dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters);
+
+ String whereClause = null;
+ if (dqRuleContext.whereClause() != null) {
+ if (dqRuleType.isWhereClauseSupported()) {
+ DataQualityDefinitionLanguageParser.WhereClauseContext ctx = dqRuleContext.whereClause();
+ if (ctx.quotedString().getText().isEmpty() || ctx.quotedString().getText().equals("\"\"")) {
+ return Either.fromLeft(
+ String.format("Empty where condition provided for rule type: %s", ruleType));
+ } else {
+ whereClause = removeQuotes(ctx.quotedString().getText());
+ }
+ } else {
+ return Either.fromLeft(String.format("Where clause is not supported for rule type: %s", ruleType));
+ }
+ }
+
+ Condition thresholdCondition = null;
+ Map tags = new HashMap<>();
+ List tagContexts =
+ (dqRuleContext.tagWithCondition() == null) ? new ArrayList<>() : dqRuleContext.tagWithCondition();
+ for (DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext : tagContexts) {
+ if (tagContext.stringBasedCondition() != null) {
+ //process plain string tag
+ final Either outcome = processStringTag(tagContext);
+ if (outcome.isLeft()) {
+ return Either.fromLeft(outcome.getLeft());
+ } else {
+ final Tag tag = outcome.getRight();
+ tags.put(tag.getKey(), tag);
+ }
+ } else if (tagContext.numberBasedCondition() != null) {
+ final String tagName = tagContext.tagValues().getText();
+ if (tagName.equalsIgnoreCase(THRESHOLD_KEY)) {
+ //process threshold tag
+ final Either outcome =
+ processThresholdTag(dqRuleType, thresholdCondition, tagContext, ruleType);
+ if (outcome.isLeft()) {
+ return Either.fromLeft(outcome.getLeft());
+ } else {
+ thresholdCondition = outcome.getRight();
+ }
+ } else {
+ //convert number tag into string tag
+ final Either outcome = processNumberTag(tagContext, tagName);
+ if (outcome.isLeft()) {
+ return Either.fromLeft(outcome.getLeft());
+ } else {
+ final Tag tag = outcome.getRight();
+ tags.put(tag.getKey(), tag);
+ }
+ }
+ } else {
+ return Either.fromLeft(String.format("Invalid tag provided for rule type: %s", ruleType));
+ }
+ }
Condition condition;
List> conditions = Arrays.stream(dqRuleType.getReturnType().split("\\|"))
- .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext))
- .collect(Collectors.toList());
+ .map(rt -> parseCondition(dqRuleType, rt, dqRuleContext, convertToStringMap(tags)))
+ .collect(Collectors.toList());
Optional> optionalCondition = conditions.stream().filter(Either::isRight).findFirst();
if (optionalCondition.isPresent()) {
@@ -214,50 +406,132 @@ private Either getDQRule(
}
} else {
Optional> optionalFailedCondition =
- conditions.stream().filter(Either::isLeft).findFirst();
+ conditions.stream().filter(Either::isLeft).findFirst();
if (optionalFailedCondition.isPresent()) {
return Either.fromLeft(optionalFailedCondition.get().getLeft());
} else {
return Either.fromLeft(
- String.format("Error while parsing condition for rule with rule type: %s", ruleType));
+ String.format("Error while parsing condition for rule with rule type: %s", ruleType));
}
}
- Condition thresholdCondition = null;
- if (dqRuleContext.withThresholdCondition() != null) {
- if (dqRuleType.isThresholdSupported()) {
- DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx =
- dqRuleContext.withThresholdCondition().numberBasedCondition();
-
- if (ctx == null) {
- return Either.fromLeft(
- String.format("Empty threshold condition provided for rule type: %s", ruleType));
- } else {
- Optional possibleCond =
- parseNumberBasedCondition(dqRuleContext.withThresholdCondition().numberBasedCondition());
- if (possibleCond.isPresent()) {
- thresholdCondition = possibleCond.get();
- } else {
- return Either.fromLeft(
- String.format("Unable to parse threshold condition provided for rule type: %s", ruleType));
- }
- }
+ return Either.fromRight(
+ DQRule.createFromParameterValueMapWithVariables(
+ dqRuleType, parameterMap, condition, thresholdCondition, whereClause, tags, dqVariables)
+ );
+ }
- } else {
- return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType));
+ /**
+ * Checks that a threshold tag is acceptable for the rule type and then parses it.
+ * thresholdCondition is the threshold the caller has parsed so far (null if none); a non-null value
+ * means this tag would be a second threshold, which is rejected.
+ * Returns the parsed condition on the right, or an error message on the left.
+ */
+ private Either processThresholdTag(DQRuleType dqRuleType,
+ Condition thresholdCondition,
+ DataQualityDefinitionLanguageParser
+ .TagWithConditionContext tagContext,
+ String ruleType) {
+ if (dqRuleType.isThresholdSupported()) {
+ if (thresholdCondition != null) {
+ return Either.fromLeft("Only one threshold condition at a time is supported.");
}
+ // Delegate the actual parsing to the two-argument overload.
+ return processThresholdTag(tagContext, ruleType);
+ } else {
+ return Either.fromLeft(String.format("Threshold condition not supported for rule type: %s", ruleType));
}
+ }
- return Either.fromRight(
- new DQRule(dqRuleType.getRuleTypeName(), parameterMap, condition,
- thresholdCondition, DQRuleLogicalOperator.AND, new ArrayList<>())
- );
+ /**
+ * Builds a Tag named tagName from a number-based tag condition.
+ * The condition must pass isTagValid; the tag value is the text of the first number in the condition.
+ * Returns the Tag on the right, or an error message on the left.
+ */
+ private Either processNumberTag(DataQualityDefinitionLanguageParser
+ .TagWithConditionContext tagContext,
+ String tagName) {
+ if (!isTagValid(tagContext.numberBasedCondition())) {
+ return Either.fromLeft("Number tags only support the equality operator.");
+ }
+ final List numberContexts =
+ tagContext.numberBasedCondition().number();
+ if (numberContexts != null && !numberContexts.isEmpty()) {
+ // The number is stored as its source text, not as a parsed numeric value.
+ final String tagValue = numberContexts.get(0).getText();
+ return Either.fromRight(new Tag(tagName, tagValue));
+ } else {
+ return Either.fromLeft(String.format("Error Parsing Tag %s", tagName));
+ }
+ }
+
+ /**
+ * Builds a Tag from a string-based tag condition.
+ * The condition must pass isTagValid. The tag key is the text of tagValues(); the tag value is the
+ * formatted first operand of the parsed string condition.
+ * Returns the Tag on the right, or an error message on the left.
+ */
+ private Either processStringTag(
+ DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext) {
+ if (!isTagValid(tagContext.stringBasedCondition())) {
+ return Either.fromLeft("String tags only support the equality operator.");
+ }
+ String tagKey = tagContext.tagValues().getText();
+ Optional valueCondition = parseStringBasedCondition(tagContext.stringBasedCondition());
+ if (valueCondition.isPresent()) {
+ StringBasedCondition stringCondition = (StringBasedCondition) valueCondition.get();
+ String tagValue = stringCondition.getOperands().get(0).formatOperand();
+ return Either.fromRight(new Tag(tagKey, tagValue));
+ } else {
+ return Either.fromLeft(String.format("Error while parsing tag: %s", tagKey));
+ }
+ }
+
+ /**
+ * Parses the number-based condition of a threshold tag.
+ * Returns the parsed condition on the right, or an error message naming ruleType on the left when
+ * parseNumberBasedCondition yields no condition.
+ */
+ private Either processThresholdTag(
+ DataQualityDefinitionLanguageParser.TagWithConditionContext tagContext, String ruleType) {
+ DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx =
+ tagContext.numberBasedCondition();
+ Optional possibleCond = parseNumberBasedCondition(ctx);
+ if (possibleCond.isPresent()) {
+ return Either.fromRight(possibleCond.get());
+ } else {
+ return Either.fromLeft(String.format(
+ "Unable to parse threshold condition provided for rule type: %s", ruleType));
+ }
+ }
+
+ /**
+ * Reports whether a tag condition is a plain equality: it has an EQUAL_TO token and no NEGATION token.
+ * Only string-based and number-based condition contexts can be valid; any other context (including null)
+ * yields false.
+ */
+ private boolean isTagValid(ParserRuleContext ctx) {
+ if (ctx instanceof DataQualityDefinitionLanguageParser.StringBasedConditionContext) {
+ final DataQualityDefinitionLanguageParser.StringBasedConditionContext stringCtx =
+ (DataQualityDefinitionLanguageParser.StringBasedConditionContext) ctx;
+ return stringCtx.EQUAL_TO() != null && stringCtx.NEGATION() == null;
+ } else if (ctx instanceof DataQualityDefinitionLanguageParser.NumberBasedConditionContext) {
+ final DataQualityDefinitionLanguageParser.NumberBasedConditionContext numberCtx =
+ (DataQualityDefinitionLanguageParser.NumberBasedConditionContext) ctx;
+ return numberCtx.EQUAL_TO() != null && numberCtx.NEGATION() == null;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Builds a DQAnalyzer from a parsed analyzer entry.
+ * The analyzer type is looked up through DQRuleType using its name and parameter count. Types that are
+ * unknown, whose return type is "BOOLEAN", or whose parameters fail verification are rejected.
+ * Returns the analyzer on the right, or an error message on the left.
+ */
+ private Either getDQAnalyzer(
+ DataQualityDefinitionLanguageParser.DqAnalyzerContext dqAnalyzerContext) {
+ String analyzerType = dqAnalyzerContext.analyzerType().getText();
+
+ List parameters = parseParameters(dqAnalyzerContext.parameterWithConnectorWord());
+
+ // We just use the DQ Rule names to validate what analyzer names to allow.
+ // This might change closer to re:Invent, but keeping it simple for now.
+ Optional optionalDQAnalyzerType = DQRuleType.getRuleType(analyzerType, parameters.size());
+
+ if (!optionalDQAnalyzerType.isPresent()) {
+ return Either.fromLeft(String.format("Analyzer Type: %s is not valid", analyzerType));
+ }
+
+ DQRuleType dqRuleType = optionalDQAnalyzerType.get();
+
+ // Rule types that return BOOLEAN cannot be used as analyzers.
+ if (dqRuleType.getReturnType().equals("BOOLEAN")) {
+ return Either.fromLeft(String.format("Analyzer Type: %s is not supported", analyzerType));
+ }
+
+ Optional errorMessage = dqRuleType.verifyParameters(dqRuleType.getParameters(), parameters);
+
+ if (errorMessage.isPresent()) {
+ return Either.fromLeft(String.format(errorMessage.get() + ": %s", analyzerType));
+ }
+
+ LinkedHashMap parameterMap =
+ dqRuleType.createParameterMap(dqRuleType.getParameters(), parameters);
+
+ return Either.fromRight(DQAnalyzer.createFromValueMap(analyzerType, parameterMap));
}
private Either parseCondition(
DQRuleType ruleType,
String returnType,
- DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext) {
+ DataQualityDefinitionLanguageParser.DqRuleContext dqRuleContext,
+ Map tags) {
Either response =
Either.fromLeft(String.format("Error parsing condition for return type: %s", returnType));
@@ -312,7 +586,7 @@ private Either parseCondition(
ruleType.getRuleTypeName()));
} else {
Optional possibleCond =
- parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition());
+ parseDateBasedCondition(dqRuleContext.condition().dateBasedCondition(), tags);
if (possibleCond.isPresent()) {
response = Either.fromRight(possibleCond.get());
@@ -336,6 +610,31 @@ private Either parseCondition(
}
break;
}
+ case "SIZE":
+ case "SIZE_ARRAY": {
+ DataQualityDefinitionLanguageParser.ConditionContext cx = dqRuleContext.condition();
+ if (cx == null || (cx.sizeBasedCondition() == null && cx.numberBasedCondition() == null)) {
+ return Either.fromLeft(
+ String.format("Unexpected condition for rule of type %s with size return type",
+ ruleType.getRuleTypeName()));
+ } else if (cx.sizeBasedCondition() != null) {
+ Optional possibleCond =
+ parseSizeBasedCondition(dqRuleContext.condition().sizeBasedCondition());
+
+ if (possibleCond.isPresent()) {
+ response = Either.fromRight(possibleCond.get());
+ }
+ } else if (cx.numberBasedCondition() != null) {
+ Optional possibleCond =
+ convertNumberToSizeCondition(
+ parseNumberBasedCondition(dqRuleContext.condition().numberBasedCondition()));
+
+ if (possibleCond.isPresent()) {
+ response = Either.fromRight(possibleCond.get());
+ }
+ }
+ break;
+ }
default:
break;
}
@@ -343,6 +642,24 @@ private Either parseCondition(
return response;
}
+ /**
+ * Re-expresses a number-based condition as a size-based condition whose operands are in bytes.
+ * Returns an empty Optional when the input is absent, is not a NumberBasedCondition, or contains an
+ * operand that is not an atomic whole number within the int range.
+ */
+ private Optional convertNumberToSizeCondition(Optional in) {
+ if (!in.isPresent() || !(in.get() instanceof NumberBasedCondition)) {
+ return Optional.empty();
+ }
+ NumberBasedCondition input = (NumberBasedCondition) in.get();
+ final String conditionAsString = input.getConditionAsString();
+ // The size operator is looked up by the name of the number operator.
+ final SizeBasedConditionOperator operator = SizeBasedConditionOperator.valueOf(input.getOperator().name());
+ final List operands = input.getOperands().stream()
+ .filter(x -> x instanceof AtomicNumberOperand)
+ .map(x -> Double.parseDouble(x.getOperand()))
+ // Keep whole numbers that fit in an int; anything else is dropped here and rejected below.
+ .filter(d -> d % 1 == 0 && d >= Integer.MIN_VALUE && d <= Integer.MAX_VALUE)
+ // Convert from the parsed double: Integer.parseInt on the raw text would throw for "5.0".
+ .map(d -> new Size(d.intValue(), SizeUnit.B))
+ .collect(Collectors.toList());
+ // A size mismatch means at least one operand was filtered out, so the condition is not convertible.
+ if (operands.size() != input.getOperands().size()) {
+ return Optional.empty();
+ }
+ return Optional.of(new SizeBasedCondition(conditionAsString, operator, operands));
+ }
+
private Optional parseNumberBasedCondition(
DataQualityDefinitionLanguageParser.NumberBasedConditionContext ctx) {
@@ -350,118 +667,283 @@ private Optional parseNumberBasedCondition(
Condition condition = null;
if (ctx.BETWEEN() != null && ctx.number().size() == 2) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.BETWEEN,
- Arrays.asList(ctx.number(0).getText(), ctx.number(1).getText()));
+ Optional operand1 = parseNumericOperand(ctx.number(0), false);
+ Optional operand2 = parseNumericOperand(ctx.number(1), false);
+
+ if (operand1.isPresent() && operand2.isPresent()) {
+ NumberBasedConditionOperator op = (ctx.NOT() != null) ?
+ NumberBasedConditionOperator.NOT_BETWEEN
+ : NumberBasedConditionOperator.BETWEEN;
+ condition = new NumberBasedCondition(exprStr, op, Arrays.asList(operand1.get(), operand2.get()));
+ }
} else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.number().size() == 1) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO,
- Collections.singletonList(ctx.number(0).getText()));
+ Optional operand = parseNumericOperand(ctx.number(0), false);
+ if (operand.isPresent()) {
+ condition = new NumberBasedCondition(
+ exprStr, NumberBasedConditionOperator.GREATER_THAN_EQUAL_TO,
+ Collections.singletonList(operand.get()));
+ }
} else if (ctx.GREATER_THAN() != null && ctx.number().size() == 1) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.GREATER_THAN,
- Collections.singletonList(ctx.number(0).getText()));
+ Optional operand = parseNumericOperand(ctx.number(0), false);
+ if (operand.isPresent()) {
+ condition = new NumberBasedCondition(
+ exprStr, NumberBasedConditionOperator.GREATER_THAN,
+ Collections.singletonList(operand.get()));
+ }
} else if (ctx.LESS_THAN() != null && ctx.number().size() == 1) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN,
- Collections.singletonList(ctx.number(0).getText()));
+ Optional operand = parseNumericOperand(ctx.number(0), false);
+ if (operand.isPresent()) {
+ condition = new NumberBasedCondition(
+ exprStr, NumberBasedConditionOperator.LESS_THAN,
+ Collections.singletonList(operand.get()));
+ }
} else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.number().size() == 1) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO,
- Collections.singletonList(ctx.number(0).getText()));
+ Optional operand = parseNumericOperand(ctx.number(0), false);
+ if (operand.isPresent()) {
+ condition = new NumberBasedCondition(
+ exprStr, NumberBasedConditionOperator.LESS_THAN_EQUAL_TO,
+ Collections.singletonList(operand.get()));
+ }
} else if (ctx.EQUAL_TO() != null && ctx.number().size() == 1) {
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.EQUALS,
- Collections.singletonList(ctx.number(0).getText()));
+ Optional operand = parseNumericOperand(ctx.number(0), false);
+ if (operand.isPresent()) {
+ NumberBasedConditionOperator op = (ctx.NEGATION() != null) ?
+ NumberBasedConditionOperator.NOT_EQUALS
+ : NumberBasedConditionOperator.EQUALS;
+ condition = new NumberBasedCondition(
+ exprStr, op, Collections.singletonList(operand.get()));
+ }
} else if (ctx.IN() != null && ctx.numberArray() != null && ctx.numberArray().number().size() > 0) {
- List numbers = ctx.numberArray().number().stream()
- .map(RuleContext::getText)
+ List> numbers = ctx.numberArray().number()
+ .stream()
+ .map(op -> parseNumericOperand(op, false))
.collect(Collectors.toList());
- condition = new NumberBasedCondition(exprStr, NumberBasedConditionOperator.IN, numbers);
+ if (numbers.stream().allMatch(Optional::isPresent)) {
+ NumberBasedConditionOperator op = (ctx.NOT() != null) ?
+ NumberBasedConditionOperator.NOT_IN
+ : NumberBasedConditionOperator.IN;
+ condition = new NumberBasedCondition(exprStr, op,
+ numbers.stream().map(Optional::get).collect(Collectors.toList()));
+ }
}
return Optional.ofNullable(condition);
}
+ /**
+ * Recursively converts a parsed number into a numeric operand.
+ * Handles, in this order: binary expressions, function calls, parenthesized numbers, atomic numbers
+ * and NULL. isParenthesized is only forwarded to BinaryExpressionOperand; the other operand kinds
+ * do not receive it.
+ * Returns an empty Optional when the context matches none of these or a nested operand fails to parse.
+ */
+ private Optional parseNumericOperand(
+ DataQualityDefinitionLanguageParser.NumberContext numberContext, boolean isParenthesized
+ ) {
+ if (numberContext.numberOp() != null) {
+ Optional operand1 = parseNumericOperand(numberContext.number(0), false);
+ Optional operand2 = parseNumericOperand(numberContext.number(1), false);
+ if (operand1.isPresent() && operand2.isPresent()) {
+ return Optional.of(
+ new BinaryExpressionOperand(
+ numberContext.getText(),
+ numberContext.numberOp().getText(),
+ operand1.get(), operand2.get(),
+ isParenthesized
+ )
+ );
+ } else {
+ return Optional.empty();
+ }
+ } else if (numberContext.functionCall() != null) {
+ DataQualityDefinitionLanguageParser.FunctionCallContext fcc = numberContext.functionCall();
+ String functionName = fcc.IDENTIFIER().getText();
+ List functionParameters = new ArrayList<>();
+
+ if (fcc.functionParameters() != null) {
+ List> parameters = fcc.functionParameters().number()
+ .stream()
+ .map(op -> parseNumericOperand(op, false))
+ .collect(Collectors.toList());
+
+ // If any parameter failed to parse, control falls through to the final Optional.empty().
+ if (parameters.stream().allMatch(Optional::isPresent)) {
+ functionParameters = parameters.stream().map(Optional::get).collect(Collectors.toList());
+ return Optional.of(
+ new FunctionCallOperand(fcc.getText(), functionName, functionParameters)
+ );
+ }
+ } else {
+ // No parameter function
+ return Optional.of(
+ new FunctionCallOperand(fcc.getText(), functionName, functionParameters)
+ );
+ }
+ } else if (numberContext.LPAREN() != null) {
+ // Parenthesized number: parse the inner number and mark it as parenthesized.
+ return parseNumericOperand(numberContext.number(0), true);
+ } else if (numberContext.atomicNumber() != null) {
+ return Optional.of(new AtomicNumberOperand(numberContext.getText()));
+ } else if (numberContext.NULL() != null) {
+ return Optional.of(new NullNumericOperand(numberContext.getText()));
+ }
+
+ return Optional.empty();
+ }
+
+ /**
+ * Converts a parsed string-based condition into a StringBasedCondition.
+ * Supports equality, IN and regex-match conditions together with their negated forms, and accepts a
+ * variable reference in place of literal values for equality and IN.
+ * Returns an empty Optional when the context matches no supported form or an operand cannot be built.
+ */
private Optional parseStringBasedCondition(
DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx
) {
String exprStr = ctx.getText();
Condition condition = null;
- if (ctx.EQUAL_TO() != null && ctx.quotedString() != null) {
- condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.EQUALS,
- Collections.singletonList(removeQuotes(ctx.quotedString().QUOTED_STRING().getText())));
- } else if (ctx.IN() != null &&
- ctx.quotedStringArray() != null &&
- ctx.quotedStringArray().quotedString().size() > 0) {
- condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.IN,
- ctx.quotedStringArray().quotedString().stream()
- .map(s -> removeQuotes(removeEscapes(s.getText())))
- .collect(Collectors.toList())
- );
+ if (ctx.EQUAL_TO() != null) {
+ // A NEGATION token alongside EQUAL_TO selects NOT_EQUALS.
+ StringBasedConditionOperator op = (ctx.NEGATION() != null) ?
+ StringBasedConditionOperator.NOT_EQUALS
+ : StringBasedConditionOperator.EQUALS;
+
+ StringOperand operand;
+ if (ctx.variableDereference() != null) {
+ operand = new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText());
+ } else if (ctx.stringValues() != null) {
+ Optional parsedOperand = parseStringOperand(ctx, Optional.of(ctx.stringValues()), op);
+ if (!parsedOperand.isPresent()) {
+ return Optional.empty();
+ }
+ operand = parsedOperand.get();
+ } else {
+ return Optional.empty();
+ }
+
+ condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand));
+ } else if (ctx.IN() != null) {
+ // A NOT token alongside IN selects NOT_IN.
+ StringBasedConditionOperator op = (ctx.NOT() != null) ?
+ StringBasedConditionOperator.NOT_IN
+ : StringBasedConditionOperator.IN;
+
+ List operands;
+ if (ctx.variableDereference() != null) {
+ // A variable reference stands in for the whole list as a single operand.
+ operands = Collections.singletonList(
+ new VariableReferenceOperand(ctx.variableDereference().IDENTIFIER().getText()));
+ } else if (ctx.stringValuesArray() != null && ctx.stringValuesArray().stringValues().size() > 0) {
+ // Values that do not yield an operand are filtered out rather than failing the whole condition.
+ operands = ctx.stringValuesArray().stringValues()
+ .stream()
+ .map(s -> parseStringOperand(ctx, Optional.of(s), op))
+ .filter(Optional::isPresent)
+ .map(Optional::get)
+ .collect(Collectors.toList());
+ } else {
+ return Optional.empty();
+ }
+
+ // The condition is only built when at least one operand remains.
+ if (!operands.isEmpty()) {
+ condition = new StringBasedCondition(exprStr, op, operands);
+ }
} else if (ctx.matchesRegexCondition() != null) {
- condition = new StringBasedCondition(exprStr, StringBasedConditionOperator.MATCHES,
- Collections.singletonList(removeQuotes(ctx.matchesRegexCondition().quotedString().getText())));
+ StringBasedConditionOperator op = (ctx.NOT() != null) ?
+ StringBasedConditionOperator.NOT_MATCHES
+ : StringBasedConditionOperator.MATCHES;
+ Optional operand = parseStringOperand(ctx, Optional.ofNullable(ctx.stringValues()), op);
+ if (operand.isPresent()) {
+ condition = new StringBasedCondition(exprStr, op, Collections.singletonList(operand.get()));
+ }
}
return Optional.ofNullable(condition);
}
+ /**
+ * Builds a single string operand for the given operator.
+ * For equality and IN operators the operand is a KeywordStringOperand when parseKeyword returns a
+ * keyword, otherwise a QuotedStringOperand built from the quoted string; the IN variants also apply
+ * removeEscapes. For the regex-match operators the quoted string is read from
+ * ctx.matchesRegexCondition() and stringValuesContext is not consulted.
+ * Returns an empty Optional for any other operator.
+ */
+ private Optional parseStringOperand(
+ DataQualityDefinitionLanguageParser.StringBasedConditionContext ctx,
+ Optional
+ stringValuesContext, StringBasedConditionOperator op) {
+
+ switch (op) {
+ case NOT_EQUALS:
+ case EQUALS:
+ // stringValuesContext.get() is called unconditionally here, so callers must pass a present value.
+ Keyword keyword = parseKeyword(stringValuesContext.get());
+ if (keyword == null) {
+ return Optional.of(new QuotedStringOperand(
+ removeQuotes(stringValuesContext.get().quotedString().getText())));
+ } else {
+ return Optional.of(new KeywordStringOperand(keyword));
+ }
+ case NOT_IN:
+ case IN:
+ // Reuses the keyword local declared in the EQUALS case above (same switch scope).
+ keyword = parseKeyword(stringValuesContext.get());
+ if (keyword == null) {
+ return Optional.of(new QuotedStringOperand(
+ removeQuotes(removeEscapes(stringValuesContext.get().quotedString().getText()))));
+ } else {
+ return Optional.of(new KeywordStringOperand(keyword));
+ }
+ case MATCHES:
+ case NOT_MATCHES:
+ return Optional.of(new QuotedStringOperand(
+ removeQuotes(ctx.matchesRegexCondition().quotedString().getText())));
+ default:
+ return Optional.empty();
+ }
+ }
+
private Optional parseDateBasedCondition(
- DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx) {
+ DataQualityDefinitionLanguageParser.DateBasedConditionContext ctx, Map tags) {
String exprStr = ctx.getText();
Condition condition = null;
if (ctx.BETWEEN() != null && ctx.dateExpression().size() == 2) {
- Optional lower = parseDateExpression(ctx.dateExpression(0));
- Optional upper = parseDateExpression(ctx.dateExpression(1));
+ Optional lower = parseDateExpression(ctx.dateExpression(0), tags);
+ Optional upper = parseDateExpression(ctx.dateExpression(1), tags);
if (lower.isPresent() && upper.isPresent()) {
+ DateBasedConditionOperator op = (ctx.NOT() != null) ?
+ DateBasedConditionOperator.NOT_BETWEEN
+ : DateBasedConditionOperator.BETWEEN;
condition = new DateBasedCondition(
- exprStr, DateBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get())
+ exprStr, op, Arrays.asList(lower.get(), upper.get())
);
}
} else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) {
- Optional operand = parseDateExpression(ctx.dateExpression(0));
+ Optional operand = parseDateExpression(ctx.dateExpression(0), tags);
if (operand.isPresent()) {
condition = new DateBasedCondition(
exprStr, DateBasedConditionOperator.GREATER_THAN_EQUAL_TO, Collections.singletonList(operand.get())
);
}
} else if (ctx.GREATER_THAN() != null && ctx.dateExpression().size() == 1) {
- Optional operand = parseDateExpression(ctx.dateExpression(0));
+ Optional operand = parseDateExpression(ctx.dateExpression(0), tags);
if (operand.isPresent()) {
condition = new DateBasedCondition(
exprStr, DateBasedConditionOperator.GREATER_THAN, Collections.singletonList(operand.get())
);
}
} else if (ctx.LESS_THAN() != null && ctx.dateExpression().size() == 1) {
- Optional operand = parseDateExpression(ctx.dateExpression(0));
+ Optional operand = parseDateExpression(ctx.dateExpression(0), tags);
if (operand.isPresent()) {
condition = new DateBasedCondition(
exprStr, DateBasedConditionOperator.LESS_THAN, Collections.singletonList(operand.get())
);
}
} else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.dateExpression().size() == 1) {
- Optional operand = parseDateExpression(ctx.dateExpression(0));
+ Optional operand = parseDateExpression(ctx.dateExpression(0), tags);
if (operand.isPresent()) {
condition = new DateBasedCondition(
exprStr, DateBasedConditionOperator.LESS_THAN_EQUAL_TO, Collections.singletonList(operand.get())
);
}
} else if (ctx.EQUAL_TO() != null && ctx.dateExpression().size() == 1) {
- Optional operand = parseDateExpression(ctx.dateExpression(0));
+ Optional operand = parseDateExpression(ctx.dateExpression(0), tags);
if (operand.isPresent()) {
+ DateBasedConditionOperator op = (ctx.NEGATION() != null) ?
+ DateBasedConditionOperator.NOT_EQUALS
+ : DateBasedConditionOperator.EQUALS;
condition = new DateBasedCondition(
- exprStr, DateBasedConditionOperator.EQUALS, Collections.singletonList(operand.get())
+ exprStr, op, Collections.singletonList(operand.get())
);
}
} else if (ctx.IN() != null &&
ctx.dateExpressionArray() != null &&
ctx.dateExpressionArray().dateExpression().size() > 0) {
List> expressions = ctx.dateExpressionArray().dateExpression().stream()
- .map(this::parseDateExpression)
+ .map(x -> parseDateExpression(x, tags))
.collect(Collectors.toList());
-
if (expressions.stream().allMatch(Optional::isPresent)) {
+ DateBasedConditionOperator op = (ctx.NOT() != null) ?
+ DateBasedConditionOperator.NOT_IN
+ : DateBasedConditionOperator.IN;
condition = new DateBasedCondition(
- exprStr, DateBasedConditionOperator.IN,
+ exprStr, op,
expressions.stream().map(Optional::get).collect(Collectors.toList())
);
}
@@ -481,8 +963,11 @@ private Optional parseDurationBasedCondition(
Optional lower = parseDuration(ctx.durationExpression(0));
Optional upper = parseDuration(ctx.durationExpression(1));
if (lower.isPresent() && upper.isPresent()) {
+ DurationBasedConditionOperator op = (ctx.NOT() != null) ?
+ DurationBasedConditionOperator.NOT_BETWEEN
+ : DurationBasedConditionOperator.BETWEEN;
condition = new DurationBasedCondition(
- exprStr, DurationBasedConditionOperator.BETWEEN, Arrays.asList(lower.get(), upper.get())
+ exprStr, op, Arrays.asList(lower.get(), upper.get())
);
}
} else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.durationExpression().size() == 1) {
@@ -520,8 +1005,11 @@ private Optional parseDurationBasedCondition(
} else if (ctx.EQUAL_TO() != null && ctx.durationExpression().size() == 1) {
Optional operand = parseDuration(ctx.durationExpression(0));
if (operand.isPresent()) {
+ DurationBasedConditionOperator op = (ctx.NEGATION() != null) ?
+ DurationBasedConditionOperator.NOT_EQUALS
+ : DurationBasedConditionOperator.EQUALS;
condition = new DurationBasedCondition(
- exprStr, DurationBasedConditionOperator.EQUALS,
+ exprStr, op,
Collections.singletonList(operand.get())
);
}
@@ -534,8 +1022,11 @@ private Optional parseDurationBasedCondition(
.collect(Collectors.toList());
if (durations.stream().allMatch(Optional::isPresent)) {
+ DurationBasedConditionOperator op = (ctx.NOT() != null) ?
+ DurationBasedConditionOperator.NOT_IN
+ : DurationBasedConditionOperator.IN;
condition = new DurationBasedCondition(
- exprStr, DurationBasedConditionOperator.IN,
+ exprStr, op,
durations.stream().map(Optional::get).collect(Collectors.toList())
);
}
@@ -544,8 +1035,91 @@ private Optional parseDurationBasedCondition(
return Optional.ofNullable(condition);
}
+    /**
+     * Builds a {@link SizeBasedCondition} from a parsed size-condition node.
+     *
+     * Recognised forms, checked in this order:
+     * (not) between two sizes; {@code >=}, {@code >}, {@code <}, {@code <=} against one size;
+     * {@code =} / {@code !=} against one size; (not) in a non-empty array of sizes.
+     *
+     * @param ctx parse-tree node for the size-based condition
+     * @return the condition, or {@link Optional#empty()} when no form matches or any
+     *         size operand could not be parsed
+     */
+    private Optional<Condition> parseSizeBasedCondition(
+        DataQualityDefinitionLanguageParser.SizeBasedConditionContext ctx
+    ) {
+
+        String exprStr = ctx.getText();
+        Condition condition = null;
+
+        if (ctx.BETWEEN() != null && ctx.sizeExpression().size() == 2) {
+            Optional<Size> lower = parseSize(ctx.sizeExpression(0));
+            Optional<Size> upper = parseSize(ctx.sizeExpression(1));
+            if (lower.isPresent() && upper.isPresent()) {
+                SizeBasedConditionOperator op = (ctx.NOT() != null) ?
+                    SizeBasedConditionOperator.NOT_BETWEEN
+                    : SizeBasedConditionOperator.BETWEEN;
+                condition = new SizeBasedCondition(
+                    exprStr, op, Arrays.asList(lower.get(), upper.get())
+                );
+            }
+        } else if (ctx.GREATER_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) {
+            condition = parseSingleSizeCondition(
+                ctx, exprStr, SizeBasedConditionOperator.GREATER_THAN_EQUAL_TO);
+        } else if (ctx.GREATER_THAN() != null && ctx.sizeExpression().size() == 1) {
+            condition = parseSingleSizeCondition(
+                ctx, exprStr, SizeBasedConditionOperator.GREATER_THAN);
+        } else if (ctx.LESS_THAN() != null && ctx.sizeExpression().size() == 1) {
+            condition = parseSingleSizeCondition(
+                ctx, exprStr, SizeBasedConditionOperator.LESS_THAN);
+        } else if (ctx.LESS_THAN_EQUAL_TO() != null && ctx.sizeExpression().size() == 1) {
+            condition = parseSingleSizeCondition(
+                ctx, exprStr, SizeBasedConditionOperator.LESS_THAN_EQUAL_TO);
+        } else if (ctx.EQUAL_TO() != null && ctx.sizeExpression().size() == 1) {
+            // A NEGATION token next to '=' turns the comparison into "!=".
+            SizeBasedConditionOperator op = (ctx.NEGATION() != null) ?
+                SizeBasedConditionOperator.NOT_EQUALS
+                : SizeBasedConditionOperator.EQUALS;
+            condition = parseSingleSizeCondition(ctx, exprStr, op);
+        } else if (ctx.IN() != null &&
+            ctx.sizeExpressionArray() != null &&
+            ctx.sizeExpressionArray().sizeExpression().size() > 0) {
+
+            List<Optional<Size>> sizes = ctx.sizeExpressionArray().sizeExpression().stream()
+                .map(this::parseSize)
+                .collect(Collectors.toList());
+
+            // Only build the condition when every array element parsed successfully.
+            if (sizes.stream().allMatch(Optional::isPresent)) {
+                SizeBasedConditionOperator op = (ctx.NOT() != null) ?
+                    SizeBasedConditionOperator.NOT_IN
+                    : SizeBasedConditionOperator.IN;
+                condition = new SizeBasedCondition(
+                    exprStr, op,
+                    sizes.stream().map(Optional::get).collect(Collectors.toList())
+                );
+            }
+        }
+
+        return Optional.ofNullable(condition);
+    }
+
+    /**
+     * Builds a one-operand size condition from the first size expression of {@code ctx}.
+     *
+     * @return the condition, or {@code null} when the operand could not be parsed
+     */
+    private Condition parseSingleSizeCondition(
+        DataQualityDefinitionLanguageParser.SizeBasedConditionContext ctx,
+        String exprStr,
+        SizeBasedConditionOperator op
+    ) {
+        Optional<Size> operand = parseSize(ctx.sizeExpression(0));
+        if (!operand.isPresent()) {
+            return null;
+        }
+        return new SizeBasedCondition(exprStr, op, Collections.singletonList(operand.get()));
+    }
+
private Optional parseDateExpression(
- DataQualityDefinitionLanguageParser.DateExpressionContext ctx) {
+ DataQualityDefinitionLanguageParser.DateExpressionContext ctx, Map tags) {
if (ctx.durationExpression() != null) {
Optional duration = parseDuration(ctx.durationExpression());
return duration.map(value -> new DateExpression.CurrentDateExpression(
@@ -556,11 +1130,41 @@ private Optional parseDateExpression(
));
} else if (ctx.dateNow() != null) {
return Optional.of(new DateExpression.CurrentDate());
+ } else if (ctx.NULL() != null) {
+ return Optional.of(new NullDateExpression());
+ } else if (ctx.timeExpression() != null) {
+ final String time = removeQuotes(ctx.timeExpression().MIL_TIME() != null
+ ? ctx.timeExpression().MIL_TIME().getText()
+ : ctx.timeExpression().TIME().getText());
+ final String pattern = ctx.timeExpression().MIL_TIME() != null
+ ? MILITARY_TIME_FORMAT
+ : AMPM_TIME_FORMAT;
+ final String timeZone = tags.getOrDefault("timeZone", "UTC");
+ return parseTime(time, pattern, timeZone);
} else {
return Optional.of(new DateExpression.StaticDate(removeQuotes(ctx.DATE().getText())));
}
}
+    /**
+     * Converts a wall-clock time literal into a UTC date-time expression anchored on
+     * the current date of the requested time zone.
+     *
+     * @param in       the time text, already stripped of quotes
+     * @param pattern  the {@link DateTimeFormatter} pattern used to parse {@code in}
+     * @param timeZone the zone id in which {@code in} is interpreted
+     * @return a static date-time expression holding the UTC local date-time and the original
+     *         text; {@link Optional#empty()} (with a message added to {@code errorMessages})
+     *         when the time text or the zone id is invalid
+     */
+    private Optional<DateExpression> parseTime(final String in, final String pattern, final String timeZone) {
+        try {
+            final ZoneId zoneId = ZoneId.of(timeZone); // https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html
+            final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern);
+            final LocalTime time = LocalTime.parse(in, formatter);
+            // "Today" must be read in the requested zone: around midnight the JVM default
+            // zone can be on a different calendar day than the zone the time refers to.
+            final LocalDate today = LocalDate.now(zoneId);
+            final LocalDateTime localDateTime = LocalDateTime.of(today, time);
+            final ZonedDateTime zonedDateTime = localDateTime.atZone(zoneId);
+            final ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC);
+            return Optional.of(new DateExpression.StaticDateTime(utcTime.toLocalDateTime(), in));
+        } catch (final DateTimeParseException e) {
+            errorMessages.add(String.format("Error Parsing Date: %s. %s.", in, e.getMessage()));
+            return Optional.empty();
+        } catch (final java.time.DateTimeException e) {
+            // ZoneId.of throws ZoneRulesException for an unknown region but a plain
+            // DateTimeException for a malformed id; both are reported as zone errors.
+            errorMessages.add(String.format("Error Parsing Time Zone: %s. %s.", timeZone, e.getMessage()));
+            return Optional.empty();
+        }
+    }
+
private Optional parseDuration(
DataQualityDefinitionLanguageParser.DurationExpressionContext ctx) {
int amount = Integer.parseInt(ctx.INT() != null ? ctx.INT().getText() : ctx.DIGIT().getText());
@@ -572,6 +1176,17 @@ private Optional parseDuration(
}
}
+ private Optional parseSize(
+ DataQualityDefinitionLanguageParser.SizeExpressionContext ctx) {
+ int amount = Integer.parseInt(ctx.INT() != null ? ctx.INT().getText() : ctx.DIGIT().getText());
+ if (ctx.sizeUnit().exception != null) {
+ return Optional.empty();
+ } else {
+ SizeUnit unit = SizeUnit.valueOf(ctx.sizeUnit().getText().toUpperCase());
+ return Optional.of(new Size(amount, unit));
+ }
+ }
+
private String removeQuotes(String quotedString) {
if (quotedString.startsWith("\"") && quotedString.endsWith("\"")) {
quotedString = quotedString.substring(1);
@@ -584,4 +1199,69 @@ private String removeEscapes(String stringWithEscapes) {
stringWithEscapes = stringWithEscapes.replaceAll("\\\\(.)", "$1");
return stringWithEscapes;
}
+
+    /**
+     * Converts each parsed parameter node into a {@link DQRuleParameterValue}, in order.
+     *
+     * @param parameters parameter nodes, possibly {@code null}
+     * @return the converted values; a new empty list when {@code parameters} is {@code null}
+     */
+    private List<DQRuleParameterValue> parseParameters(
+        List<DataQualityDefinitionLanguageParser.ParameterWithConnectorWordContext> parameters) {
+        if (parameters == null) {
+            return new ArrayList<>();
+        }
+        return parameters.stream().map(this::parseParameter).collect(Collectors.toList());
+    }
+
+ private DQRuleParameterValue parseParameter(
+ DataQualityDefinitionLanguageParser.ParameterWithConnectorWordContext pc) {
+ String connectorWord = pc.connectorWord() == null ? "" : pc.connectorWord().getText();
+
+ if (pc.parameter().QUOTED_STRING() != null) {
+ return new DQRuleParameterValue(
+ removeQuotes(pc.parameter().QUOTED_STRING().getText()), true, connectorWord);
+ } else if (pc.parameter().IDENTIFIER() != null) {
+ return new DQRuleParameterValue(
+ pc.parameter().IDENTIFIER().getText(), false, connectorWord);
+ } else {
+ return new DQRuleParameterValue(pc.parameter().getText(), true, connectorWord);
+ }
+ }
+
+    /**
+     * Checks that a dictionary node contains at least one non-blank pair.
+     *
+     * @param dc parse-tree node for the dictionary
+     * @return a list with the single message "Empty dictionary provided" when the node has no
+     *         pairs, or exactly one pair whose text is empty; otherwise an empty list
+     */
+    private List<String> validateDictionary(DataQualityDefinitionLanguageParser.DictionaryContext dc) {
+        List<String> dictionaryErrors = new ArrayList<>();
+        // A missing or zero-length pair list is just as empty as a single blank pair.
+        boolean noPairs = dc.pair() == null || dc.pair().isEmpty();
+        boolean singleBlankPair = !noPairs
+            && dc.pair().size() == 1
+            && dc.pair().get(0).getText().isEmpty();
+        if (noPairs || singleBlankPair) {
+            dictionaryErrors.add("Empty dictionary provided");
+        }
+        return dictionaryErrors;
+    }
+
+    /**
+     * Maps the text of a string-values node to a {@link Keyword}, if it is one.
+     *
+     * The upper-cased node text is first checked against the {@link Keyword} constants. If it
+     * names one, the zero-argument accessor of the same name on the context object is invoked
+     * reflectively, and the keyword is returned only when that accessor yields a non-null result.
+     *
+     * @param stringValuesContext parse-tree node to inspect
+     * @return the matching keyword, or {@code null} when the text is not a keyword, the accessor
+     *         returns {@code null}, or the reflective call fails (the failure message is added
+     *         to {@code errorMessages})
+     */
+    private Keyword parseKeyword(
+        DataQualityDefinitionLanguageParser.StringValuesContext stringValuesContext) {
+        Keyword keyword = null;
+        try {
+            // Locale.ROOT keeps the mapping stable: under e.g. a Turkish default locale,
+            // "whitespaces_only" would upper-case to a dotted capital I and never match.
+            String operand = stringValuesContext.getText().toUpperCase(java.util.Locale.ROOT);
+            if (isValidEnumValue(operand)) {
+                Method method = stringValuesContext.getClass().getMethod(operand);
+                Object result = method.invoke(stringValuesContext);
+                if (result != null) {
+                    keyword = Keyword.valueOf(operand);
+                }
+            }
+        } catch (IllegalArgumentException | IllegalAccessException | NoSuchMethodException |
+                 InvocationTargetException e) {
+            errorMessages.add(e.getMessage());
+        }
+        return keyword;
+    }
+
+    /**
+     * Tells whether {@code value} is exactly the name of a {@link Keyword} constant.
+     */
+    private boolean isValidEnumValue(String value) {
+        boolean isKeywordName = true;
+        try {
+            Enum.valueOf(Keyword.class, value);
+        } catch (IllegalArgumentException notAKeyword) {
+            // valueOf rejects any name that is not a declared constant.
+            isKeywordName = false;
+        }
+        return isKeywordName;
+    }
+
+    /**
+     * Returns the text of a string-values node, with surrounding quotes removed when the
+     * node holds a quoted string.
+     */
+    private String processStringValues(DataQualityDefinitionLanguageParser.StringValuesContext sv) {
+        return (sv.quotedString() == null)
+            ? sv.getText()
+            : removeQuotes(sv.quotedString().getText());
+    }
+
}
diff --git a/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java
new file mode 100644
index 0000000..16a7178
--- /dev/null
+++ b/src/com/amazonaws/glue/ml/dataquality/dqdl/parser/DQDLVariableResolver.java
@@ -0,0 +1,59 @@
+package com.amazonaws.glue.ml.dataquality.dqdl.parser;
+
+import com.amazonaws.glue.ml.dataquality.dqdl.model.DQVariable;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.Condition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.QuotedStringOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.variable.VariableReferenceOperand;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public final class DQDLVariableResolver {
+
+ // Private constructor to prevent instantiation
+ private DQDLVariableResolver() {
+ throw new AssertionError("Utility class should not be instantiated");
+ }
+
+ public static Condition resolveVariablesInCondition(Condition condition, Map variables,
+ Map usedVars) {
+ if (!(condition instanceof StringBasedCondition)) {
+ return condition;
+ }
+
+ StringBasedCondition stringCondition = (StringBasedCondition) condition;
+ List resolvedOperands = new ArrayList<>();
+
+ for (StringOperand operand : stringCondition.getOperands()) {
+ if (operand instanceof VariableReferenceOperand) {
+ String varName = operand.getOperand();
+ DQVariable variable = variables.get(varName);
+ if (variable != null) {
+ usedVars.put(varName, variable);
+ Object value = variable.getValue();
+ if (value instanceof List) {
+ for (Object listItem : (List>) value) {
+ resolvedOperands.add(new QuotedStringOperand(listItem.toString()));
+ }
+ } else {
+ resolvedOperands.add(new QuotedStringOperand(value.toString()));
+ }
+ } else {
+ resolvedOperands.add(operand);
+ }
+ } else {
+ resolvedOperands.add(operand);
+ }
+ }
+
+ return new StringBasedCondition(
+ stringCondition.getConditionAsString(),
+ stringCondition.getOperator(),
+ resolvedOperands,
+ stringCondition.getOperands()
+ );
+ }
+}
diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java
new file mode 100644
index 0000000..174144a
--- /dev/null
+++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQAnalyzerTest.java
@@ -0,0 +1,89 @@
+/*
+ * DQAnalyzerTest.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import com.amazonaws.glue.ml.dataquality.dqdl.exception.InvalidDataQualityRulesetException;
+import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/**
+ * Parses rulesets that contain an Analyzers section and checks the resulting analyzers.
+ */
+public class DQAnalyzerTest {
+    DQDLParser parser = new DQDLParser();
+
+    /** A single Completeness analyzer keeps its type and its one column parameter. */
+    @Test
+    void test_singleAnalyzer() {
+        final String columnName = "colA";
+        final String dqdl = String.format(
+            "Rules = [ IsComplete \"%s\" ] Analyzers = [ Completeness \"%s\" ]", columnName, columnName);
+
+        try {
+            final DQAnalyzer firstAnalyzer = parser.parse(dqdl).getAnalyzers().get(0);
+            assertEquals("Completeness", firstAnalyzer.getRuleType());
+            assertEquals(1, firstAnalyzer.getParameters().size());
+            assertTrue(firstAnalyzer.getParameters().containsValue(columnName));
+        } catch (InvalidDataQualityRulesetException e) {
+            fail(e.getMessage());
+        }
+    }
+
+    /** Every raw analyzer must survive a parse / toString round trip unchanged. */
+    @ParameterizedTest
+    @MethodSource("provideRawAnalyzers")
+    void test_analyzerParsingAndGeneratingWithParser(String analyzer) {
+        final String dqdl = String.format("Rules = [ IsComplete \"colA\" ] Analyzers = [ %s ]", analyzer);
+        try {
+            final DQRuleset parsed = parser.parse(dqdl);
+            assertEquals(1, parsed.getRules().size());
+            assertEquals(1, parsed.getAnalyzers().size());
+
+            final String regenerated = parsed.getAnalyzers().get(0).toString();
+            assertEquals(analyzer, regenerated);
+        } catch (InvalidDataQualityRulesetException e) {
+            fail(e.getMessage());
+        }
+    }
+
+    /** Raw analyzer strings, one test invocation per entry. */
+    private static Stream provideRawAnalyzers() {
+        return Stream.of(
+            "RowCount",
+            "RowCountMatch \"reference\"",
+            "Completeness \"col_1\"",
+            "ColumnCount",
+            "ColumnCorrelation \"col_1\" \"col_2\"",
+            "Uniqueness \"col_1\"",
+            "Sum \"col_A-B.C\"",
+            "Mean \"col_A-B.CD\"",
+            "StandardDeviation \"col_A-B.CD\"",
+            "Entropy \"col_A-B.CD\"",
+            "DistinctValuesCount \"col_A-B.CD\"",
+            "UniqueValueRatio \"col_A-B.CD\"",
+            "ReferentialIntegrity \"col-A\" \"reference.col-A1\"",
+            "ReferentialIntegrity \"col-A,col-B\" \"reference.{col-A1,col-A2}\"",
+            "DatasetMatch \"reference\" \"ID1,ID2\"",
+            "DatasetMatch \"reference\" \"ID1,ID2\" \"colA,colB,colC\"",
+            "DatasetMatch \"reference\" \"ID1->ID11,ID2->ID22\" \"colA->colAA\"",
+            "SchemaMatch \"ref-1\"",
+            "AggregateMatch \"sum(col-A)\" \"sum(colB)\"",
+            "AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\"",
+            "AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\"",
+            "AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\"",
+            "CustomSql \"select count(*) from primary\"",
+            "AllStatistics \"id\""
+        ).map(Arguments::of);
+    }
+}
diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java
new file mode 100644
index 0000000..1e50325
--- /dev/null
+++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleParameterValueTest.java
@@ -0,0 +1,80 @@
+/*
+ * DQRuleParameterValueTest.java
+ *
+ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * PROPRIETARY/CONFIDENTIAL
+ *
+ * Use is subject to license terms.
+ */
+
+package com.amazonaws.glue.ml.dataquality.dqdl.model;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Covers the constructors, toString rendering and equality of {@link DQRuleParameterValue}.
+ */
+public class DQRuleParameterValueTest {
+
+    private static final String COLUMN = "col-A";
+    private static final String CONNECTOR = "of";
+
+    /** The one-argument constructor yields an unquoted value without a connector word. */
+    @Test
+    public void test_constructorWithValueArg() {
+        final DQRuleParameterValue parameter = new DQRuleParameterValue(COLUMN);
+        assertEquals(COLUMN, parameter.getValue());
+        assertFalse(parameter.isQuoted());
+        assertTrue(parameter.getConnectorWord().isEmpty());
+    }
+
+    /** The two-argument constructor keeps the quoted flag and leaves the connector word empty. */
+    @Test
+    public void test_constructorWithValueAndIsQuotedArgs() {
+        final DQRuleParameterValue parameter = new DQRuleParameterValue(COLUMN, true);
+        assertEquals(COLUMN, parameter.getValue());
+        assertTrue(parameter.isQuoted());
+        assertTrue(parameter.getConnectorWord().isEmpty());
+    }
+
+    /** Unquoted, no connector word: toString is the bare value. */
+    @Test
+    public void test_parameterValueToStringWithNoConnectorWordAndNoQuotes() {
+        final DQRuleParameterValue parameter = new DQRuleParameterValue(COLUMN, false, "");
+        assertEquals(COLUMN, parameter.toString());
+    }
+
+    /** Unquoted with connector word: toString is "connector value". */
+    @Test
+    public void test_parameterValueToStringWithConnectorWordAndNoQuotes() {
+        final DQRuleParameterValue parameter = new DQRuleParameterValue(COLUMN, false, CONNECTOR);
+        final String expected = CONNECTOR + " " + COLUMN;
+        assertEquals(expected, parameter.toString());
+    }
+
+    /** Quoted with connector word: toString wraps the value in double quotes. */
+    @Test
+    public void test_parameterValueToStringWithConnectorWordAndWithQuotes() {
+        final DQRuleParameterValue parameter = new DQRuleParameterValue(COLUMN, true, CONNECTOR);
+        final String expected = CONNECTOR + " \"" + COLUMN + "\"";
+        assertEquals(expected, parameter.toString());
+    }
+
+    /** Two distinct instances built from the same arguments compare equal. */
+    @Test
+    public void test_equalsAndHashCode() {
+        final DQRuleParameterValue first = new DQRuleParameterValue(COLUMN, true, CONNECTOR);
+        final DQRuleParameterValue second = new DQRuleParameterValue(COLUMN, true, CONNECTOR);
+
+        assertNotSame(first, second);
+        assertEquals(first, second);
+    }
+}
diff --git a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java
index 98aeed5..3139102 100644
--- a/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java
+++ b/tst/com/amazonaws/glue/ml/dataquality/dqdl/model/DQRuleTest.java
@@ -15,7 +15,10 @@
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.date.DateExpression;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumberBasedCondition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.Size;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.size.SizeBasedCondition;
import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringBasedCondition;
+import com.amazonaws.glue.ml.dataquality.dqdl.model.condition.string.StringOperand;
import com.amazonaws.glue.ml.dataquality.dqdl.parser.DQDLParser;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@@ -29,15 +32,22 @@
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static com.amazonaws.glue.ml.dataquality.dqdl.model.condition.number.NumericOperandTest.testEvaluator;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -62,76 +72,153 @@ void test_ruleParsingAndGeneratingWithParser(String rule) {
}
}
+ @ParameterizedTest
+ @MethodSource("provideRawRules")
+ void test_rulesEqualWhenRepresentationsEqual(String ruleStringRepr) {
+ try {
+ DQRule rule1 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0);
+ DQRule rule2 = parser.parse("Rules = [ " + ruleStringRepr + " ]").getRules().get(0);
+
+ assertEquals(rule1, rule2);
+ assertTrue(rule1.equals(rule2));
+ assertEquals(rule1.hashCode(), rule2.hashCode());
+ assertNotSame(rule1, rule2);
+ } catch (InvalidDataQualityRulesetException e) {
+ fail(e.getMessage());
+ }
+ }
+
private static Stream provideRawRules() {
return Stream.of(
- // Arguments.of("JobStatus = \"SUCCEEDED\""),
- // Arguments.of("JobStatus in [\"SUCCEEDED\",\"READY\"]"),
- // Arguments.of("JobDuration between 10 and 1000"),
- // Arguments.of("JobDuration between -10 and 1000"),
- // Arguments.of("FileCount between 10 and 100"),
- // Arguments.of("FileCount between -10000 and -1000"),
Arguments.of("IsPrimaryKey \"colA\""),
Arguments.of("IsPrimaryKey \"colA\" \"colB\""),
+ Arguments.of("IsPrimaryKey colA \"col B\""),
Arguments.of("IsPrimaryKey \"colA\" \"colB\" \"colC\""),
+ Arguments.of("IsPrimaryKey \"colA\" where \"colA > 100\""),
Arguments.of("RowCount = 100"),
+ Arguments.of("RowCount != 100"),
Arguments.of("RowCount = -100"),
+ Arguments.of("RowCount = 100 where \"colA > 100\""),
+ Arguments.of("RowCount between (0.9 * average(last(10))) and 1.1 * average(last(10))"),
+ Arguments.of("RowCount not between (0.9 * average(last(10))) and 1.1 * average(last(10))"),
Arguments.of("RowCountMatch \"reference\" = 1.0"),
Arguments.of("RowCountMatch \"reference\" >= 0.95"),
Arguments.of("RowCountMatch \"reference\" between 0.8 and 0.98"),
Arguments.of("Completeness \"col_1\" between 0.5 and 0.8"),
+ Arguments.of("Completeness of col_1 between 0.5 and 0.8"),
+ Arguments.of("Completeness of col_1 not between 0.5 and 0.8"),
+ Arguments.of("Completeness \"col_1\" between 0.5 and 0.8 where \"col-A > 100\""),
Arguments.of("IsComplete \"col_1\""),
+ Arguments.of("IsComplete \"col_1\" where \"col-A > 100\""),
Arguments.of("Completeness \"col_1\" between -0.5 and -0.4"),
+ Arguments.of("Completeness \"col_1\" between (0.9 * avg(last(10))) and (1.1 * avg(last(10)))"),
Arguments.of("ColumnDataType \"col_1\" = \"String\""),
+ Arguments.of("ColumnDataType \"col_1\" != \"String\""),
+ Arguments.of("ColumnDataType \"col_2\" = \"Integer\""),
Arguments.of("ColumnDataType \"col_1\" = \"String\" with threshold between 0.4 and 0.8"),
Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"]"),
Arguments.of("ColumnDataType \"col_1\" in [\"Date\",\"String\"] with threshold > 0.9"),
+ Arguments.of("ColumnDataType \"col_1\" = \"String\" where \"col-A > 100\""),
Arguments.of("ColumnNamesMatchPattern \"aws_.*_[a-zA-Z0-9]+\""),
Arguments.of("ColumnExists \"load_dt\""),
Arguments.of("ColumnCount >= 100"),
+ Arguments.of("ColumnCount = avg(std(last(10)))"),
+ Arguments.of("ColumnCount != avg(std(last(10)))"),
+ Arguments.of("ColumnCount = avg(std(last(percentile(1,2,3))))"),
Arguments.of("ColumnCount > -100.123456"),
Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8"),
+ Arguments.of("ColumnCorrelation of col_1 col_2 between 0.4 and 0.8"),
+ Arguments.of("ColumnCorrelation of col_1 and \"col abc\" between 0.4 and 0.8"),
Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between -0.44444 and 0.888888"),
+ Arguments.of("ColumnCorrelation \"col_1\" \"col_2\" between 0.4 and 0.8 where \"col-A > 100\""),
Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2"),
+ Arguments.of("Uniqueness \"col_1\" between 0.1 and 0.2 where \"col-A > 100\""),
Arguments.of("IsUnique \"col_1\""),
+ Arguments.of("IsUnique \"col_1\" where \"col-A > 100\""),
Arguments.of("Uniqueness \"col_1\" between -0.00000001 and 0.00000000000002"),
Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\""),
+ Arguments.of("ColumnValues \"col_1\" between \"2022-06-01\" and \"2022-06-30\" where \"col-A > 100\""),
Arguments.of("ColumnValues \"load_dt\" > (now() - 1 days)"),
Arguments.of("ColumnValues \"order-id\" in [1,2,3,4]"),
+ Arguments.of("ColumnValues \"order-id\" in [1,2,3,4,NULL]"),
+ Arguments.of("ColumnValues \"order-id\" not in [1,2,3,4]"),
Arguments.of("ColumnValues \"order-id\" in [\"1\",\"2\",\"3\",\"4\"]"),
+ Arguments.of("ColumnValues \"order-id\" not in [\"1\",\"2\",\"3\",\"4\"]"),
+ Arguments.of("ColumnValues \"col-A\" < (now() + 4 minutes)"),
+ Arguments.of("ColumnValues \"col-A\" < (now() - 25 minutes)"),
+ Arguments.of("ColumnValues \"col-A\" > \"9:30 AM\""),
+ Arguments.of("ColumnValues \"col-A\" > \"9:30 PM\""),
+ Arguments.of("ColumnValues \"col-A\" > \"19:30\""),
+ Arguments.of("ColumnValues \"col-A\" between \"9:00 AM\" and \"21:50\""),
Arguments.of("Sum \"col_A-B.C\" > 100.0"),
Arguments.of("Sum \"col_A-B.C\" > -100.0"),
+ Arguments.of("Sum \"col_A-B.C\" > -100.0 where \"col-A > 100\""),
Arguments.of("Mean \"col_A-B.CD\" between 10 and 20"),
Arguments.of("Mean \"col_A-B.CD\" between -20 and -10"),
+ Arguments.of("Mean \"col_A-B.CD\" between -20 and -10 where \"col-A > 100\""),
Arguments.of("StandardDeviation \"col_A-B.CD\" <= 10.0"),
Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0"),
+ Arguments.of("StandardDeviation \"col_A-B.CD\" <= -10000.0 where \"col-A > 100\""),
Arguments.of("Entropy \"col_A-B.CD\" <= 10.0"),
Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30"),
+ Arguments.of("Entropy \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""),
Arguments.of("DistinctValuesCount \"col_A-B.CD\" > 1000"),
Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30"),
+ Arguments.of("DistinctValuesCount \"col_A-B.CD\" between 10 and 30 where \"col-A > 100\""),
Arguments.of("UniqueValueRatio \"col_A-B.CD\" < 0.5"),
Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5"),
+ Arguments.of("UniqueValueRatio \"col_A-B.CD\" between 0.1 and 0.5 where \"col-A > 100\""),
Arguments.of("ColumnLength \"col_A-B.CD\" < 10"),
Arguments.of("ColumnLength \"col_A-B.CD\" >= 100"),
+ Arguments.of("ColumnLength \"col_A-B.CD\" >= 100 where \"col-A > 100\""),
Arguments.of("ColumnValues \"col-A\" matches \"[a-zA-Z0-9]*\""),
+ Arguments.of("ColumnValues \"col-A\" not matches \"[a-zA-Z0-9]*\""),
Arguments.of("ColumnValues \"col-A\" >= now()"),
Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and now()"),
+ Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and now()"),
Arguments.of("ColumnValues \"col-A\" between now() and (now() + 3 hours)"),
Arguments.of("ColumnValues \"col-A\" < (now() + 4 days)"),
Arguments.of("ColumnValues \"col-A\" = (now() - 3 hours)"),
+ Arguments.of("ColumnValues \"col-A\" != (now() - 3 hours)"),
Arguments.of("ColumnValues \"col-A\" in [now(),(now() - 3 hours),now(),(now() + 4 days)]"),
+ Arguments.of("ColumnValues \"col-A\" not in [now(),(now() - 3 hours),now(),(now() + 4 days)]"),
Arguments.of("ColumnValues \"col-A\" between (now() - 3 hours) and (now() + 14 days)"),
+ Arguments.of("ColumnValues \"col-A\" not between (now() - 3 hours) and (now() + 14 days)"),
Arguments.of("ColumnValues \"col-A\" matches \"[a-z]*\" with threshold <= 0.4"),
+ Arguments.of("ColumnValues \"col-A\" not matches \"[a-z]*\" with threshold <= 0.4"),
Arguments.of("ColumnValues \"col-A\" in [\"A\",\"B\"] with threshold <= 0.4"),
Arguments.of("ColumnValues \"col-A\" in [1,2,3] with threshold > 0.98"),
Arguments.of("ColumnValues \"col-A\" = \"A\" with threshold > 0.98"),
+ Arguments.of("ColumnValues \"col-A\" = NULL"),
+ Arguments.of("ColumnValues \"col-A\" = EMPTY"),
+ Arguments.of("ColumnValues \"col-A\" = WHITESPACES_ONLY"),
+ Arguments.of("ColumnValues \"col-A\" != NULL"),
+ Arguments.of("ColumnValues \"col-A\" != EMPTY"),
+ Arguments.of("ColumnValues \"col-A\" != WHITESPACES_ONLY"),
+ Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"),
+ Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL]"),
+ Arguments.of("ColumnValues \"col-A\" not in [\"a\",NULL]"),
+ Arguments.of("ColumnValues \"col-A\" in [\"a\",NULL,EMPTY,WHITESPACES_ONLY]"),
+ Arguments.of("ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]"),
+ Arguments.of("(ColumnValues \"col-A\" not in [NULL,EMPTY,WHITESPACES_ONLY]) OR (ColumnValues \"col-B\" != WHITESPACES_ONLY)"),
+ Arguments.of("(ColumnValues \"col-A\" in [NULL,EMPTY,WHITESPACES_ONLY]) AND (ColumnValues \"col-B\" != WHITESPACES_ONLY)"),
Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold between 0.4 and 0.8"),
+ Arguments.of("ColumnValues \"col-A\" <= 0.4 with threshold not between 0.4 and 0.8"),
Arguments.of("ColumnValues \"col-A\" > 0.4 with threshold > 0.4"),
Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\"] with threshold > 0.98"),
+ Arguments.of("ColumnValues \"col-A\" = NULL"),
+ Arguments.of("ColumnValues \"col-A\" != NULL"),
+ Arguments.of("ColumnValues \"col-A\" in [NULL]"),
+ Arguments.of("ColumnValues \"col-A\" in [\"2022-01-01\",NULL] with threshold > 0.98"),
+ Arguments.of("ColumnValues \"col-A\" not in [\"2022-01-01\",NULL] with threshold > 0.98"),
Arguments.of("ColumnValues \"col-A\" = 1 with threshold > 0.98"),
Arguments.of("ColumnValues \"col-A\" = \"2022-01-01\" with threshold > 0.98"),
Arguments.of("DataFreshness \"col-A\" <= 3 days"),
- Arguments.of("DataFreshness \"col-A\" > 30 hours"),
+ Arguments.of("DataFreshness \"col-A\" > 3 minutes"),
+ Arguments.of("DataFreshness \"col-A\" > 90 minutes"),
Arguments.of("DataFreshness \"col-A\" between 2 days and 4 days"),
+ Arguments.of("DataFreshness \"col-A\" between 2 minutes and 4 minutes"),
+ Arguments.of("DataFreshness \"col-A\" <= 3 days where \"col-A > 100\""),
Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" between 0.4 and 0.6"),
Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" > 0.98"),
Arguments.of("ReferentialIntegrity \"col-A\" \"reference.col-A1\" = 0.99"),
@@ -146,12 +233,237 @@ private static Stream provideRawRules() {
Arguments.of("AggregateMatch \"sum(col-A)\" \"sum(reference.colA)\" > 0.1"),
Arguments.of("AggregateMatch \"avg(col-A)\" \"avg(reference.colA)\" between 0.8 and 0.9"),
Arguments.of("AggregateMatch \"SUM(col-A)\" \"SUM(reference.colA)\" >= 0.95"),
- Arguments.of( "CustomSql \"select count(*) from primary\" > 0"),
- Arguments.of( "CustomSql \"select col-A from primary\""),
- Arguments.of( "CustomSql \"select col-A from primary\" with threshold > 0.5")
+ Arguments.of("CustomSql \"select count(*) from primary\" > 0"),
+ Arguments.of("CustomSql \"select col-A from primary\""),
+ Arguments.of("CustomSql \"select col-A from primary\" with threshold > 0.5"),
+ Arguments.of("DetectAnomalies \"RowCount\""),
+ Arguments.of("DetectAnomalies of RowCount"),
+ Arguments.of("DetectAnomalies of Completeness of \"colA\""),
+ Arguments.of("DetectAnomalies of ColumnCorrelation of \"colA\" and \"colB\""),
+ Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\"]"),
+ Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"]"),
+ Arguments.of("FileMatch in [\"hashList\",\"hashList\"]"),
+ Arguments.of("FileMatch \"S3://PATH\" in [\"hashList\",\"hashList\"] with hashAlgorithm = \"MD5\""),
+ Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\" with randomTagThing = \"@sampom\""),
+ Arguments.of("FileMatch \"S3://PATH1\" in [\"a\"] with tag1 = \"sampom\" with tag2 = \"pomsam\""),
+ Arguments.of("FileMatch \"S3://PATH1\" \"S3://PATH2\""),
+ Arguments.of("FileUniqueness \"S3://PATH1\" >= 0.9"),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"2023-02-07\" and \"2024-07-15\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > (now() - 3 days)"),
+ Arguments.of("FileUniqueness \"S3://PATH\" > 0.9"),
+ Arguments.of("FileUniqueness > 0.5"),
+ Arguments.of("FileSize between 1 B and 1 GB"),
+ Arguments.of("FileSize not between 50 GB and 1 TB"),
+ Arguments.of("FileSize > 5 B"),
+ Arguments.of("FileSize >= 5 KB"),
+ Arguments.of("FileSize < 5 MB"),
+ Arguments.of("FileSize <= 5 GB"),
+ Arguments.of("FileSize = 5 TB"),
+ Arguments.of("FileSize != 5 B"),
+ Arguments.of("FileSize in [5 B]"),
+ Arguments.of("FileSize not in [500 KB,150 GB]"),
+ Arguments.of("(RowCount > 0) OR (IsComplete \"colA\") OR (IsUnique \"colA\")"),
+ Arguments.of("(RowCount > 0) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"),
+ Arguments.of("((RowCount > 0) AND (IsComplete \"colB\")) OR ((IsComplete \"colA\") AND (IsUnique \"colA\"))"),
+ Arguments.of("FileFreshness \"S3://PATH\" > (now() - 30 minutes)"),
+ Arguments.of("FileFreshness \"S3://PATH\" > (now() + 45 minutes)"),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 AM\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"9:30 PM\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"09:30\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"13:30\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\""),
+ Arguments.of("FileFreshness \"S3://PATH\" > \"21:45\" with timeZone = \"America/New_York\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 PM\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"9:30 AM\" and \"9:30 AM\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"09:30\" and \"21:45\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between (now() - 2 hours) and \"21:45\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between (now() + 5 minutes) and \"21:45\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and \"21:45\""),
+ Arguments.of("FileFreshness \"S3://PATH\" between \"2024-01-01\" and (now() + 10 minutes)")
);
}
+    @Test
+    void test_TagFormatting() throws Exception { // each tag value must print back exactly as written, quoted or not
+        final String rule = "Rules = [ " +
+            "FileFreshness > \"9:30 AM\" with recentFiles = 1, " +
+            "FileFreshness > \"9:30 AM\" with recentFiles = \"1\", " +
+            "FileFreshness > \"9:30 AM\" with matchFileName = \"True\", " +
+            "FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\" " +
+            "]";
+        List<DQRule> rules = parser.parse(rule).getRules();
+        assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = 1", rules.get(0).toString());
+        assertEquals("FileFreshness > \"9:30 AM\" with recentFiles = \"1\"", rules.get(1).toString());
+        assertEquals("FileFreshness > \"9:30 AM\" with matchFileName = \"True\"", rules.get(2).toString());
+        assertEquals("FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\"", rules.get(3).toString());
+    }
+
+    @Test
+    void test_Timezone() throws Exception { // a timeZone tag shifts the evaluated time; without one the literal is kept
+        String rule = "Rules = [ FileFreshness > \"9:30 AM\" with timeZone = \"America/New_York\", FileFreshness > \"19:30\" with timeZone = \"Asia/Dubai\", FileFreshness > \"9:30 AM\" ]";
+        List<DQRule> rules = parser.parse(rule).getRules();
+        DateBasedCondition c1 = (DateBasedCondition) rules.get(0).getCondition();
+        DateBasedCondition c2 = (DateBasedCondition) rules.get(1).getCondition();
+        DateBasedCondition c3 = (DateBasedCondition) rules.get(2).getCondition();
+        // NOTE(review): the date prefix is formatted in the JVM default time zone, and 14:30 matches a UTC-5 offset
+        // for America/New_York (UTC-4 applies during DST) - confirm these expectations hold on every date and host.
+        String todayStr = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
+        assertEquals(todayStr + "T14:30", c1.getOperands().get(0).getEvaluatedExpression().toString());
+        assertEquals(todayStr + "T15:30", c2.getOperands().get(0).getEvaluatedExpression().toString());
+        assertEquals(todayStr + "T09:30", c3.getOperands().get(0).getEvaluatedExpression().toString());
+    }
+
+    @Test
+    void test_AMPM_Parsing() throws Exception {
+        // A 12-hour ("9:15 AM") and a 24-hour ("21:45") time literal are mixed in a single between-condition.
+        final String ruleset = "Rules = [ FileFreshness \"S3://PATH\" between \"9:15 AM\" and \"21:45\" ]";
+        final DQRule freshnessRule = parser.parse(ruleset).getRules().get(0);
+        final DateBasedCondition window = (DateBasedCondition) freshnessRule.getCondition();
+        final DateExpression lowerBound = window.getOperands().get(0);
+        final DateExpression upperBound = window.getOperands().get(1);
+        // Neither literal carries a date, so the evaluated values are expected to start with today's date.
+        final String datePrefix = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
+        assertEquals("FileFreshness", freshnessRule.getRuleType());
+        assertEquals("between \"9:15 AM\" and \"21:45\"", window.getFormattedCondition());
+        assertEquals("\"9:15 AM\"", lowerBound.getFormattedExpression());
+        assertEquals("\"21:45\"", upperBound.getFormattedExpression());
+        assertEquals(datePrefix + "T09:15", lowerBound.getEvaluatedExpression().toString());
+        assertEquals(datePrefix + "T21:45", upperBound.getEvaluatedExpression().toString());
+    }
+
+    @Test
+    void test_sizeConditionParsing() throws Exception {
+        List<String> unitList = Arrays.asList("B", "KB", "MB", "GB", "TB");
+        for (String unit : unitList) { // every unit keyword must come back unchanged on the parsed operand
+            String rule = String.format("Rules = [ FileSize = 2 %s ]", unit);
+            DQRule parsedRule = parser.parse(rule).getRules().get(0);
+            assertEquals("FileSize", parsedRule.getRuleType());
+            SizeBasedCondition c = (SizeBasedCondition) parsedRule.getCondition();
+            assertEquals(unit, c.getOperands().get(0).getUnit().name());
+        }
+        // A size written without a unit is expected to be reported with the byte unit ("B").
+        String defaultByte = "Rules = [ FileSize > 2, FileSize in [3,4,5,6] ]";
+        List<DQRule> rules = parser.parse(defaultByte).getRules();
+        DQRule parsedRuleNoUnit0 = rules.get(0);
+        DQRule parsedRuleNoUnit1 = rules.get(1);
+        assertEquals("FileSize", parsedRuleNoUnit0.getRuleType());
+        assertEquals("FileSize", parsedRuleNoUnit1.getRuleType());
+        SizeBasedCondition c0 = (SizeBasedCondition) parsedRuleNoUnit0.getCondition();
+        SizeBasedCondition c1 = (SizeBasedCondition) parsedRuleNoUnit1.getCondition();
+        assertEquals("B", c0.getOperands().get(0).getUnit().name());
+        for (Size size : c1.getOperands()) {
+            assertEquals("B", size.getUnit().name());
+        }
+    }
+
+    @Test
+    void test_fileFileFreshnessParsing() throws Exception {
+        String fileRules = "Rules = [ " +
+            "FileFreshness \"S3://path\" between \"2023-02-07\" and \"2024-07-15\", " +
+            "FileFreshness \"S3://path\" > (now() - 3 days), " +
+            "FileFreshness \"S3://path\" < (now() - 4 days), " +
+            "FileFreshness between \"2023-02-07\" and \"2024-07-15\", " +
+            "FileFreshness > (now() + 35 minutes), " +
+            "FileFreshness <= (now() - 35 minutes), " +
+            "FileFreshness = (now() + 70 minutes) " +
+            "]";
+        DQRuleset dqRuleset = parser.parse(fileRules);
+        List<DQRule> ruleList = dqRuleset.getRules();
+        // Rules 0-2 name an explicit path; rules 3-6 omit it and only carry a date condition.
+        DQRule rule0 = ruleList.get(0);
+        DateBasedCondition c0 = (DateBasedCondition) rule0.getCondition();
+        assertEquals("FileFreshness", rule0.getRuleType());
+        assertEquals("S3://path", rule0.getParameters().get("DataPath"));
+        assertEquals("2023-02-07", removeQuotes(c0.getOperands().get(0).getFormattedExpression()));
+        assertEquals("2024-07-15", removeQuotes(c0.getOperands().get(1).getFormattedExpression()));
+
+        DQRule rule1 = ruleList.get(1);
+        DateBasedCondition c1 = (DateBasedCondition) rule1.getCondition();
+        assertEquals("FileFreshness", rule1.getRuleType());
+        assertEquals("S3://path", rule1.getParameters().get("DataPath"));
+        assertEquals("GREATER_THAN", c1.getOperator().toString());
+        assertEquals("(now() - 3 days)", c1.getOperands().get(0).getFormattedExpression());
+
+        DQRule rule2 = ruleList.get(2);
+        DateBasedCondition c2 = (DateBasedCondition) rule2.getCondition();
+        assertEquals("FileFreshness", rule2.getRuleType());
+        assertEquals("S3://path", rule2.getParameters().get("DataPath"));
+        assertEquals("LESS_THAN", c2.getOperator().toString());
+        assertEquals("(now() - 4 days)", c2.getOperands().get(0).getFormattedExpression());
+
+        DQRule rule3 = ruleList.get(3);
+        DateBasedCondition c3 = (DateBasedCondition) rule3.getCondition();
+        assertEquals("FileFreshness", rule3.getRuleType());
+        assertFalse(rule3.getParameters().containsKey("DataPath"));
+        assertEquals("2023-02-07", removeQuotes(c3.getOperands().get(0).getFormattedExpression()));
+        assertEquals("2024-07-15", removeQuotes(c3.getOperands().get(1).getFormattedExpression()));
+
+        DQRule rule4 = ruleList.get(4);
+        DateBasedCondition c4 = (DateBasedCondition) rule4.getCondition();
+        assertEquals("(now() + 35 minutes)", c4.getOperands().get(0).getFormattedExpression());
+
+        DQRule rule5 = ruleList.get(5);
+        DateBasedCondition c5 = (DateBasedCondition) rule5.getCondition();
+        assertEquals("(now() - 35 minutes)", c5.getOperands().get(0).getFormattedExpression());
+
+        DQRule rule6 = ruleList.get(6);
+        DateBasedCondition c6 = (DateBasedCondition) rule6.getCondition();
+        assertEquals("(now() + 70 minutes)", c6.getOperands().get(0).getFormattedExpression());
+    }
+
+    @Test
+    void test_checksumRuleParsing() throws Exception {
+        String fileRules = "Rules = [ " +
+            "FileMatch in [\"exampleHash\"] with hashAlgorithm = \"MD5\" with dataFrame = \"true\" ," +
+            "FileMatch \"s3://sampom-bucket2/\" in [\"exampleHash2\"] with hashAlgorithm = \"SHA-256\" ," +
+            "FileMatch \"s3://sampom-bucket3/\" in [\"exampleHash3\"] ," +
+            "FileMatch in [\"exampleHash4\"] with dataFrame = \"true\"" +
+            "]";
+        DQRuleset dqRuleset = parser.parse(fileRules);
+        List<DQRule> ruleList = dqRuleset.getRules();
+        // Rule 0 has no path and two chained tags; rules 1 and 2 carry an explicit path.
+        DQRule rule0 = ruleList.get(0);
+        assertEquals("FileMatch", rule0.getRuleType());
+        assertEquals("exampleHash", ((StringBasedCondition) rule0.getCondition()).getOperands().get(0).getOperand());
+        assertEquals("MD5", rule0.getTags().get("hashAlgorithm"));
+        assertEquals("true", rule0.getTags().get("dataFrame"));
+
+        DQRule rule1 = ruleList.get(1);
+        assertEquals("FileMatch", rule1.getRuleType());
+        assertEquals("s3://sampom-bucket2/", rule1.getParameters().get("DataPath"));
+        assertEquals("exampleHash2", ((StringBasedCondition) rule1.getCondition()).getOperands().get(0).getOperand());
+        assertEquals("SHA-256", rule1.getTags().get("hashAlgorithm"));
+
+        DQRule rule2 = ruleList.get(2);
+        assertEquals("FileMatch", rule2.getRuleType());
+        assertEquals("s3://sampom-bucket3/", rule2.getParameters().get("DataPath"));
+        assertEquals("exampleHash3", ((StringBasedCondition) rule2.getCondition()).getOperands().get(0).getOperand());
+
+        DQRule rule3 = ruleList.get(3);
+        assertEquals("FileMatch", rule3.getRuleType());
+        assertEquals("exampleHash4", ((StringBasedCondition) rule3.getCondition()).getOperands().get(0).getOperand());
+    }
+
+    @Test
+    void test_fileMatchRuleParsing() throws Exception {
+        String fileRules = "Rules = [ " +
+            "FileMatch \"s3://sampom-bucket1/\" \"s3://sampom-bucket2/\"," +
+            "FileMatch \"s3://sampom-bucket1/file1.json\" \"s3://sampom-bucket2/file2.json\"" +
+            "]";
+        DQRuleset dqRuleset = parser.parse(fileRules);
+        List<DQRule> ruleList = dqRuleset.getRules();
+        // Two quoted paths are expected under the DataPath and CompareDataPath parameters, in that order.
+        DQRule rule0 = ruleList.get(0);
+        assertEquals("FileMatch", rule0.getRuleType());
+        assertEquals("s3://sampom-bucket1/", rule0.getParameters().get("DataPath"));
+        assertEquals("s3://sampom-bucket2/", rule0.getParameters().get("CompareDataPath"));
+
+        DQRule rule1 = ruleList.get(1);
+        assertEquals("FileMatch", rule1.getRuleType()); // check the second rule itself, not rule0 again
+        assertEquals("s3://sampom-bucket1/file1.json", rule1.getParameters().get("DataPath"));
+        assertEquals("s3://sampom-bucket2/file2.json", rule1.getParameters().get("CompareDataPath"));
+    }
+
@Test
void test_toStringIgnoresSpacesOnlyThreshold() {
Map parameters = new HashMap<>();
@@ -190,10 +502,9 @@ void test_setExpressionContainsRuleContainingRule() throws InvalidDataQualityRul
assertEquals(1, dqRuleset.getRules().size());
DQRule dqRule = dqRuleset.getRules().get(0);
assertEquals(StringBasedCondition.class, dqRule.getCondition().getClass());
+ List