Skip to content

Commit

Permalink
feat: enhance field name cleaning
Browse files Browse the repository at this point in the history
The is_field_valid function was never actually being used but half of what it was trying to do was implemented elsewhere. This commit moves the other part of this functionality to the existing field name cleaner and removes the is_field_valid function.
  • Loading branch information
jrdh committed Sep 17, 2024
1 parent c62cad2 commit 80ba7d6
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 25 deletions.
14 changes: 13 additions & 1 deletion splitgill/diffing.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,23 @@ def prepare_field_name(name: Any) -> str:
- convert the name to a str, MongoDB only accepts str keys in objects
- remove any control characters from the str
- replace . with _ as Elasticsearch doesn't like dots in keys
- replace ^ with _ as this is a reserved character we use in parsed field names
If after cleaning, the field name is an empty string, we return an underscore.
:param name: the field name
:return: a clean str field name
"""
return invalid_key_char_regex.sub("", str(name)).replace(".", "_").strip()
clean_name = (
invalid_key_char_regex.sub("", str(name))
.replace(".", "_")
.replace("^", "_")
.strip()
)
# if this results in the empty string, replace with an underscore
if not clean_name:
clean_name = "_"
return clean_name


class DiffOp(NamedTuple):
Expand Down
15 changes: 0 additions & 15 deletions splitgill/indexing/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,6 @@
from strenum import LowercaseStrEnum, StrEnum


def is_field_valid(name: str) -> bool:
"""
Defines a valid field in user provided data.
:param name: the field name
:return: True if the field is valid, False otherwise
"""
# todo: should we disallow fields that have a __ in them for elasticsearch-dsl?
# ^ is used in parsed type field names, and we use . in the parsed and data type
# fields. Additionally, using dots in names is probably going to result in confusing
# outcomes for users given elasticsearch will interpret them as flattened objects
# and expand them out
return name and "^" not in name and "." not in name


class DocumentField(LowercaseStrEnum):
"""
Enum representing the fields used in the indexed documents.
Expand Down
9 changes: 0 additions & 9 deletions tests/indexing/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,13 @@
from splitgill.indexing.fields import (
ParsedType,
parsed_path,
is_field_valid,
DocumentField,
DataType,
DataField,
ParsedField,
)


def test_is_field_valid():
assert is_field_valid("egg")
assert is_field_valid("_egg")
assert not is_field_valid("")
assert not is_field_valid("egg^beans")
assert not is_field_valid("egg.beans")


@pytest.mark.parametrize("parsed_type", ParsedType)
def test_parsed_path(parsed_type: ParsedType):
field = "a.field.in.the.record"
Expand Down
6 changes: 6 additions & 0 deletions tests/test_diffing.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ def test_prepare_field_name():
assert prepare_field_name(".x.y.z.1.2.") == "_x_y_z_1_2_"
# a mix of horrors
assert prepare_field_name("\nx.\ty\r \x07fowien") == "x_y fowien"
# a ^
assert prepare_field_name("x^y") == "x_y"
# an empty name
assert prepare_field_name("") == "_"
# a name which becomes empty after removing all the junk
assert prepare_field_name(" \t \x07 ") == "_"


class TestDiff:
Expand Down

0 comments on commit 80ba7d6

Please sign in to comment.