diff --git a/splitgill/diffing.py b/splitgill/diffing.py index fb1a658..26a4d6f 100644 --- a/splitgill/diffing.py +++ b/splitgill/diffing.py @@ -102,11 +102,23 @@ def prepare_field_name(name: Any) -> str: - convert the name to a str, MongoDB only accepts str keys in objects - remove any control characters from the str - replace . with _ as Elasticsearch doesn't like dots in keys + - replace ^ with _ as this is a reserved character we use in parsed field names + + If after cleaning, the field name is an empty string, we return an underscore. :param name: the field name :return: a clean str field name """ - return invalid_key_char_regex.sub("", str(name)).replace(".", "_").strip() + clean_name = ( + invalid_key_char_regex.sub("", str(name)) + .replace(".", "_") + .replace("^", "_") + .strip() + ) + # if this results in the empty string, replace with an underscore + if not clean_name: + clean_name = "_" + return clean_name class DiffOp(NamedTuple): diff --git a/splitgill/indexing/fields.py b/splitgill/indexing/fields.py index 46884d2..f0d15a7 100644 --- a/splitgill/indexing/fields.py +++ b/splitgill/indexing/fields.py @@ -8,21 +8,6 @@ from strenum import LowercaseStrEnum, StrEnum -def is_field_valid(name: str) -> bool: - """ - Defines a valid field in user provided data. - - :param name: the field name - :return: True if the field is valid, False otherwise - """ - # todo: should we disallow fields that have a __ in them for elasticsearch-dsl? - # ^ is used in parsed type field names, and we use . in the parsed and data type - # fields. Additionally, using dots in names is probably going to result in confusing - # outcomes for users given elasticsearch will interpret them as flattened objects - # and expand them out - return name and "^" not in name and "." not in name - - class DocumentField(LowercaseStrEnum): """ Enum representing the fields used in the indexed documents. diff --git a/tests/indexing/test_fields.py b/tests/indexing/test_fields.py index fc1ee78..78b8f80 100644 --- a/tests/indexing/test_fields.py +++ b/tests/indexing/test_fields.py @@ -6,7 +6,6 @@ from splitgill.indexing.fields import ( ParsedType, parsed_path, - is_field_valid, DocumentField, DataType, DataField, @@ -14,14 +13,6 @@ ) -def test_is_field_valid(): - assert is_field_valid("egg") - assert is_field_valid("_egg") - assert not is_field_valid("") - assert not is_field_valid("egg^beans") - assert not is_field_valid("egg.beans") - - @pytest.mark.parametrize("parsed_type", ParsedType) def test_parsed_path(parsed_type: ParsedType): field = "a.field.in.the.record" diff --git a/tests/test_diffing.py b/tests/test_diffing.py index dc13c9e..a5213df 100644 --- a/tests/test_diffing.py +++ b/tests/test_diffing.py @@ -132,6 +132,12 @@ def test_prepare_field_name(): assert prepare_field_name(".x.y.z.1.2.") == "_x_y_z_1_2_" # a mix of horrors assert prepare_field_name("\nx.\ty\r \x07fowien") == "x_y fowien" + # a ^ + assert prepare_field_name("x^y") == "x_y" + # an empty name + assert prepare_field_name("") == "_" + # a name which becomes empty after removing all the junk + assert prepare_field_name(" \t \x07 ") == "_" class TestDiff: