Skip to content

Commit

Permalink
Merge pull request jaybaird#17 from whylabs/dev/felipe/patterns
Browse files: browse the repository at this point in the history
improve patterns and update test
  • Loading branch information
FelipeAdachi authored May 10, 2023
2 parents 3159f8b + 0d7ffc6 commit c3605ab
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 10 deletions.
6 changes: 3 additions & 3 deletions langkit/pattern_groups.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
},
{
"expressions": [
"\\b(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\\d{3})\\d{11})\\b"
"\\b(?:\\d[ -]*?){13,16}\\b"
],
"name": "credit card number"
},
{
"expressions": [
"\\b(?!000|.+0{4})(?:\\d{9}|\\d{3}-\\d{2}-\\d{4})\\b"
"(?!(\\d){3}(-| |)\\1{2}\\2\\1{4})(?!666|000|9\\d{2})(\\b\\d{3}(-| |)(?!00)\\d{2}\\4(?!0{4})\\d{4}\\b)"
],
"name": "SSN"
},
Expand All @@ -25,7 +25,7 @@
},
{
"expressions": [
"\\b\\d{1,8}\\b[\\s\\S]{10,100}?\\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VT|WA|WI|WV|WY)\\b\\s\\d{5}\\b"
"\\b\\d+[ ](?:[A-Za-z0-9.-]+[ ]?)+(Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St|Sq)\\b"
],
"name": "mailing address"
}
Expand Down
26 changes: 19 additions & 7 deletions langkit/tests/test_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,22 @@ def ptt_df():
df = pd.DataFrame(
{
"input": [
"address: 123 Main St Anytown, NY 12345",
"address: 123 Main St.",
"2255 140th Ave NE",
"535 Bellevue Sq",
"15220 SE 37th St",
"[email protected]",
"my phone is +1 309-404-7587",
"credit card 4556205848969759",
"credit card 3851-6256-0926-7271",
"Visa Card Number: 4929 5423 7528 1067 \nExpiration Date: 03/24 \nCVV: 348",
"622202049892743 - this is a credit card number",
"my ssn is 856-45-6789",
"ssn - 702-02-9921",
"ssn is 702 02 9921",
"702029921 (SSN)",
"no patterns here.",
],
"output": ["a", "b", "c", "d", "e"],
}
)
return df
Expand Down Expand Up @@ -55,13 +64,16 @@ def test_ptt(ptt_df, user_defined_json):
fi_input_list = result.view().to_pandas()[
"udf/has_patterns:frequent_items/frequent_strings"
]["input"]
fi_output_list = result.view().to_pandas()[
"udf/has_patterns:frequent_items/frequent_strings"
]["output"]
if not user_defined_json:
group_names = {"phone number", "email address", "SSN", "mailing address", "credit card number"}
group_names = {
"",
"credit card number",
"email address",
"SSN",
"phone number",
"mailing address",
}
else:
group_names = {"custom_group", ""}

assert set([x.value for x in fi_input_list]) == group_names
assert set([x.value for x in fi_output_list]) == {""}

0 comments on commit c3605ab

Please sign in to comment.