From 80ba7d6b50ea0c44c0730290f06d4ddff1d297a7 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Tue, 17 Sep 2024 12:04:45 +0100 Subject: [PATCH] feat: enhance field name cleaning The is_field_valid function was never actually being used but half of what it was trying to do was implemented elsewhere. This commit moves the other part of this functionality to the existing field name cleaner and removes the is_field_valid function. --- splitgill/diffing.py | 14 +++++++++++++- splitgill/indexing/fields.py | 15 --------------- tests/indexing/test_fields.py | 9 --------- tests/test_diffing.py | 6 ++++++ 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/splitgill/diffing.py b/splitgill/diffing.py index fb1a658..26a4d6f 100644 --- a/splitgill/diffing.py +++ b/splitgill/diffing.py @@ -102,11 +102,23 @@ def prepare_field_name(name: Any) -> str: - convert the name to a str, MongoDB only accepts str keys in objects - remove any control characters from the str - replace . with _ as Elasticsearch doesn't like dots in keys + - replace ^ with _ as this is a reserved character we use in parsed field names + + If after cleaning, the field name is an empty string, we return an underscore. :param name: the field name :return: a clean str field name """ - return invalid_key_char_regex.sub("", str(name)).replace(".", "_").strip() + clean_name = ( + invalid_key_char_regex.sub("", str(name)) + .replace(".", "_") + .replace("^", "_") + .strip() + ) + # if this results in the empty string, replace with an underscore + if not clean_name: + clean_name = "_" + return clean_name class DiffOp(NamedTuple): diff --git a/splitgill/indexing/fields.py b/splitgill/indexing/fields.py index 46884d2..f0d15a7 100644 --- a/splitgill/indexing/fields.py +++ b/splitgill/indexing/fields.py @@ -8,21 +8,6 @@ from strenum import LowercaseStrEnum, StrEnum -def is_field_valid(name: str) -> bool: - """ - Defines a valid field in user provided data. - - :param name: the field name - :return: True if the field is valid, False otherwise - """ - # todo: should we disallow fields that have a __ in them for elasticsearch-dsl? - # ^ is used in parsed type field names, and we use . in the parsed and data type - # fields. Additionally, using dots in names is probably going to result in confusing - # outcomes for users given elasticsearch will interpret them as flattened objects - # and expand them out - return name and "^" not in name and "." not in name - - class DocumentField(LowercaseStrEnum): """ Enum representing the fields used in the indexed documents. diff --git a/tests/indexing/test_fields.py b/tests/indexing/test_fields.py index fc1ee78..78b8f80 100644 --- a/tests/indexing/test_fields.py +++ b/tests/indexing/test_fields.py @@ -6,7 +6,6 @@ from splitgill.indexing.fields import ( ParsedType, parsed_path, - is_field_valid, DocumentField, DataType, DataField, @@ -14,14 +13,6 @@ ) -def test_is_field_valid(): - assert is_field_valid("egg") - assert is_field_valid("_egg") - assert not is_field_valid("") - assert not is_field_valid("egg^beans") - assert not is_field_valid("egg.beans") - - @pytest.mark.parametrize("parsed_type", ParsedType) def test_parsed_path(parsed_type: ParsedType): field = "a.field.in.the.record" diff --git a/tests/test_diffing.py b/tests/test_diffing.py index dc13c9e..a5213df 100644 --- a/tests/test_diffing.py +++ b/tests/test_diffing.py @@ -132,6 +132,12 @@ def test_prepare_field_name(): assert prepare_field_name(".x.y.z.1.2.") == "_x_y_z_1_2_" # a mix of horrors assert prepare_field_name("\nx.\ty\r \x07fowien") == "x_y fowien" + # a ^ + assert prepare_field_name("x^y") == "x_y" + # an empty name + assert prepare_field_name("") == "_" + # a name which becomes empty after removing all the junk + assert prepare_field_name(" \t \x07 ") == "_" class TestDiff: