From dbeb0640779d9921aa6d2cef05aa14abdd86f718 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 7 Nov 2024 17:27:53 +0100 Subject: [PATCH] tests for the xml reader (#96) * check that `xml.remove_includes` works * add checks for `extract_includes`, `normalize` and `schema_paths` * only wrap in `StringIO` where necessary * avoid duplicating schema paths * refactor the schemas * add tests for `open_schemas` * sort the namespaces before stripping them as prefixes * add a test for `read_xml` --- safe_rcm/product/utils.py | 4 +- safe_rcm/tests/test_xml.py | 299 +++++++++++++++++++++++++++++++++++++ safe_rcm/xml.py | 7 +- 3 files changed, 306 insertions(+), 4 deletions(-) create mode 100644 safe_rcm/tests/test_xml.py diff --git a/safe_rcm/product/utils.py b/safe_rcm/product/utils.py index 31020f3..9bd61a4 100644 --- a/safe_rcm/product/utils.py +++ b/safe_rcm/product/utils.py @@ -26,7 +26,9 @@ def strip_namespaces(name, namespaces): trimmed : str The string without prefix and without leading colon. """ - funcs = [flip(str.removeprefix, ns) for ns in namespaces] + funcs = [ + flip(str.removeprefix, ns) for ns in sorted(namespaces, key=len, reverse=True) + ] return pipe(name, *funcs).lstrip(":") diff --git a/safe_rcm/tests/test_xml.py b/safe_rcm/tests/test_xml.py new file mode 100644 index 0000000..060e3d6 --- /dev/null +++ b/safe_rcm/tests/test_xml.py @@ -0,0 +1,299 @@ +import collections +import textwrap + +import fsspec +import pytest + +from safe_rcm import xml + + +def dedent(text): + return textwrap.dedent(text.removeprefix("\n").rstrip()) + + +schemas = [ + dedent( + """ + + + + """ + ), + dedent( + """ + + + + + """ + ), + dedent( + """ + + + + + + """ + ), +] + + +Container = collections.namedtuple("SchemaSetup", ["mapper", "path", "expected"]) +SchemaProperties = collections.namedtuple( + "SchemaProperties", ["root_elements", "simple_types", "complex_types"] +) + + +@pytest.fixture(params=enumerate(schemas)) +def schema_setup(request): + schema_index, schema = request.param + + mapper = fsspec.get_mapper("memory") + mapper["schemas/root.xsd"] = schema.encode() + mapper["schemas/schema1.xsd"] = dedent( + """ + + + + + + """ + ).encode() + mapper["schemas/schema2.xsd"] = dedent( + """ + + + + + + """ + ).encode() + mapper["schemas/schema3.xsd"] = dedent( + """ + + + + + + + + + + + """ + ).encode() + mapper["schemas/schema4.xsd"] = dedent( + """ + + + + + + + + + + """ + ).encode() + + return schema_index, mapper + + +@pytest.fixture +def schema_paths_setup(schema_setup): + schema_index, mapper = schema_setup + + expected = [ + ["schemas/root.xsd"], + ["schemas/root.xsd", "schemas/schema2.xsd", "schemas/schema4.xsd"], + [ + "schemas/root.xsd", + "schemas/schema1.xsd", + "schemas/schema2.xsd", + "schemas/schema3.xsd", + "schemas/schema4.xsd", + ], + ] + + return Container(mapper, "schemas/root.xsd", expected[schema_index]) + + +@pytest.fixture +def schema_content_setup(schema_setup): + schema_index, mapper = schema_setup + + count_type = {"name": "count", "type": "simple", "base_type": "integer"} + manifest_type = {"name": "manifest", "type": "complex"} + + manifest_element = {"name": "manifest", "type": manifest_type} + count_element = {"name": "count", "type": count_type} + expected = [ + SchemaProperties([], [], []), + SchemaProperties([count_element], [count_type], []), + SchemaProperties( + [manifest_element, count_element], [count_type], [manifest_type] + ), + ] + + return Container(mapper, "schemas/root.xsd", expected[schema_index]) + + +@pytest.fixture(params=["data.xml", "data/file.xml"]) +def data_file_setup(request): + path = request.param + mapper = fsspec.get_mapper("memory") + + mapper["schemas/root.xsd"] = dedent( + """ + + + + + + + + + + + + + """ + ).encode() + mapper["schemas/schema1.xsd"] = dedent( + """ + + + + + + + + + + + """ + ).encode() + mapper["schemas/schema2.xsd"] = dedent( + """ + + + + + + + + + + """ + ).encode() + + schema_path = "schemas/root.xsd" if "/" not in path else "../schemas/root.xsd" + mapper[path] = dedent( + f""" + + + + 1 + 2 + + 3 + + """ + ).encode() + + expected = { + "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "@xsi:schemaLocation": f"schema {schema_path}", + "summary": {"quantity_a": 1, "quantity_b": 2}, + "count": 3, + } + + return Container(mapper, path, expected) + + +def convert_type(t): + def strip_namespace(name): + return name.split("}", maxsplit=1)[1] + + if hasattr(t, "content"): + # complex type + return {"name": t.name, "type": "complex"} + elif hasattr(t, "base_type"): + # simple type, only restriction + return { + "name": t.name, + "base_type": strip_namespace(t.base_type.name), + "type": "simple", + } + + +def convert_element(el): + return {"name": el.name, "type": convert_type(el.type)} + + +def extract_schema_properties(schema): + return SchemaProperties( + [convert_element(v) for v in schema.root_elements], + [convert_type(v) for v in schema.simple_types], + [convert_type(v) for v in schema.complex_types], + ) + + +def test_remove_includes(): + expected = schemas[0] + actual = xml.remove_includes(schemas[1]) + + assert actual == expected + + +@pytest.mark.parametrize( + ["schema", "expected"], + ( + (schemas[0], []), + (schemas[1], ["schema2.xsd"]), + (schemas[2], ["schema1.xsd", "schema2.xsd"]), + ), +) +def test_extract_includes(schema, expected): + actual = xml.extract_includes(schema) + + assert actual == expected + + +@pytest.mark.parametrize( + ["root", "path", "expected"], + ( + ("", "file.xml", "file.xml"), + ("/root", "file.xml", "/root/file.xml"), + ("/root", "/other_root/file.xml", "/other_root/file.xml"), + ), +) +def test_normalize(root, path, expected): + actual = xml.normalize(root, path) + + assert actual == expected + + +def test_schema_paths(schema_paths_setup): + actual = xml.schema_paths(schema_paths_setup.mapper, schema_paths_setup.path) + + expected = schema_paths_setup.expected + + assert actual == expected + + +def test_open_schemas(schema_content_setup): + container = schema_content_setup + actual = xml.open_schema(container.mapper, container.path) + expected = container.expected + + assert extract_schema_properties(actual) == expected + + +def test_read_xml(data_file_setup): + container = data_file_setup + + actual = xml.read_xml(container.mapper, container.path) + + assert actual == container.expected diff --git a/safe_rcm/xml.py b/safe_rcm/xml.py index c311079..3a9308a 100644 --- a/safe_rcm/xml.py +++ b/safe_rcm/xml.py @@ -11,7 +11,7 @@ def remove_includes(text): - return io.StringIO(include_re.sub("", text)) + return include_re.sub("", text) def extract_includes(text): @@ -30,7 +30,8 @@ def schema_paths(mapper, root_schema): visited = [] while unvisited: path = unvisited.popleft() - visited.append(path) + if path not in visited: + visited.append(path) text = mapper[path].decode() includes = extract_includes(text) @@ -63,7 +64,7 @@ def open_schema(mapper, schema): The opened schema object """ paths = schema_paths(mapper, schema) - preprocessed = [remove_includes(mapper[p].decode()) for p in paths] + preprocessed = [io.StringIO(remove_includes(mapper[p].decode())) for p in paths] return xmlschema.XMLSchema(preprocessed)