Skip to content

Commit

Permalink
Merge pull request #614 from flairNLP/implement-xpath-search-for-lds
Browse files Browse the repository at this point in the history
Implement XPath queries for LDs
  • Loading branch information
MaxDall authored Sep 17, 2024
2 parents 0f771e4 + 89ad182 commit dc1ec9b
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 2 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ dependencies = [
"tqdm>=4.66, <5",
"fastwarc>=0.14, <1",
"chardet>=5.2, <6",
"dill>=0.3, <1"
"dill>=0.3, <1",
"dict2xml>=1.7.6, <2",
"xmltodict>=0.13.0, <1",
]

[project.urls]
Expand Down
84 changes: 84 additions & 0 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, fields
from itertools import chain
from typing import (
Any,
Collection,
Expand All @@ -15,9 +17,15 @@
overload,
)

import lxml.etree
import more_itertools
import xmltodict
from dict2xml import dict2xml
from lxml.etree import XPath, tostring
from typing_extensions import Self, TypeAlias

from fundus.utils.serialization import replace_keys_in_nested_dict

LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]]

_sentinel = object()
Expand All @@ -41,6 +49,7 @@ class LinkedDataMapping:
"""

__UNKNOWN_TYPE__ = "UNKNOWN_TYPE"
# Characters of LD keys that are replaced before the keys are used as XML element names,
# mapped to the placeholder written into the XML instead. Each placeholder spells the
# Unicode code point of the character it stands for (":" -> U+003A, "*" -> U+002A, "@" -> U+0040).
__xml_transformation_table__ = {":": "U003A", "*": "U002A", "@": "U0040"}

def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
for ld in lds:
Expand All @@ -49,6 +58,7 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
self.add_ld(nested)
else:
self.add_ld(ld)
self.__xml: Optional[lxml.etree._Element] = None

def serialize(self) -> Dict[str, Any]:
    """Return the instance attributes of this mapping as a plain dictionary.

    Attributes whose name contains a double underscore are left out.

    Returns:
        A dictionary mapping the remaining attribute names to their values.
    """
    serialized: Dict[str, Any] = {}
    for name, content in self.__dict__.items():
        if "__" in name:
            continue
        serialized[name] = content
    return serialized
Expand Down Expand Up @@ -92,6 +102,80 @@ def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Opt
tmp = nxt
return tmp

def __as_xml__(self) -> lxml.etree._Element:
    """Return the XML representation of this mapping, building it on first use.

    The serialized content is wrapped in a <linkedData> root element. Before the conversion,
    every character listed in __xml_transformation_table__ is replaced by its placeholder in
    all keys. The parsed tree is stored on the instance and handed out again on later calls.

    Returns:
        The root element of the XML tree
    """
    table = self.__xml_transformation_table__
    replaceable = re.compile("|".join(re.escape(character) for character in table.keys()))

    def encode_key(key: str) -> str:
        return replaceable.sub(lambda hit: table[hit.group(0)], key)

    # Reuse the tree built by an earlier call
    if self.__xml is not None:
        return self.__xml

    wrapped = {"linkedData": self.serialize()}
    markup = dict2xml(replace_keys_in_nested_dict(wrapped, encode_key))
    self.__xml = lxml.etree.fromstring(markup)
    return self.__xml

def xpath_search(self, query: XPath) -> List[Any]:
    """Search through LD using XPath expressions

    Internally, the content of the LinkedDataMapping is converted to XML and then
    evaluated with an XPath expression <query>.

    To search for keys including invalid XML characters, use Unicode representations instead:
    I.e. to search for the key "_16:9" write "//_16U003A9"
    For all available transformations see LinkedDataMapping.__xml_transformation_table__

    Note that values will be converted to strings, i.e. True -> 'True', 1 -> '1'

    Examples:
        LinkedDataMapping = {
            "b": {
                "key": value1,
            },
            "c": {
                "key": value2,
            },
        }

        LinkedDataMapping.xpath_search(XPath("//key"))
        >> [value1, value2]

        LinkedDataMapping.xpath_search(XPath("//b/key"))
        >> [value1]

    Args:
        query: A XPath expression

    Returns:
        An ordered list of search results. Element matches are returned as strings (leaf values)
        or dictionaries (nested values) with the original key characters restored. String matches
        (e.g. from text()) are returned as they are, and an expression evaluating to a single
        scalar (e.g. count(...)) yields a one-element list.
    """
    # Imported locally so the block does not depend on a module-level import being present.
    from xml.sax.saxutils import escape

    table = self.__xml_transformation_table__
    reversed_table = {placeholder: character for character, placeholder in table.items()}
    pattern = re.compile("|".join(map(re.escape, reversed_table)))

    def to_original_characters(text: str) -> str:
        return pattern.sub(lambda match: reversed_table[match.group(0)], text)

    def inner_xml(n: lxml.etree._Element) -> str:
        # lxml hands out .text and .tail unescaped, whereas tostring() returns escaped markup.
        # Text and tails have to be escaped again, otherwise values containing '&', '<' or '>'
        # produce malformed XML. The tail of <n> itself lies outside the element and is skipped.
        chunks = [escape(n.text or "")]
        for child in n:
            chunks.append(tostring(child, with_tail=False, encoding=str))
            chunks.append(escape(child.tail or ""))
        return "".join(chunks)

    nodes = query(self.__as_xml__())

    if not isinstance(nodes, list):
        # Expressions like count(...) or string(...) evaluate to one scalar instead of a node list
        return [nodes]

    results: List[Any] = []

    for node in nodes:
        if isinstance(node, str):
            # text() and attribute selections yield strings rather than elements
            results.append(str(node))
            continue
        parsed = xmltodict.parse("<result>" + inner_xml(node) + "</result>")
        results.extend(replace_keys_in_nested_dict(parsed, to_original_characters).values())

    return results

def bf_search(self, key: str, depth: Optional[int] = None, default: Optional[_T] = None) -> Union[Any, _T]:
"""
This is a classic BF search on the nested dicts representing the JSON-LD. <key> specifies the dict key to
Expand Down
29 changes: 28 additions & 1 deletion src/fundus/utils/serialization.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,41 @@
import json
from typing import Dict, Sequence, Union
from typing import Any, Callable, Dict, Sequence, TypeVar, Union

from typing_extensions import TypeAlias

JSONVal: TypeAlias = Union[None, bool, str, float, int, Sequence["JSONVal"], Dict[str, "JSONVal"]]

_T = TypeVar("_T")


def is_jsonable(x: object) -> bool:
    """Check whether <x> can be serialized with json.dumps.

    Args:
        x: The object to check

    Returns:
        True if json.dumps(x) succeeds, False otherwise
    """
    try:
        json.dumps(x)
    except (TypeError, OverflowError, ValueError, RecursionError):
        # TypeError: unsupported type; ValueError: circular reference;
        # RecursionError: structure nested too deeply for the encoder
        return False
    return True


def replace_keys_in_nested_dict(data: Dict[str, _T], transformation: Callable[[str], str]) -> Dict[str, _T]:
    """Build a copy of a nested dictionary in which every key went through <transformation>.

    Dictionaries are rebuilt with transformed keys at every nesting level, including those
    found inside lists. All other values are carried over unchanged.

    Args:
        data: The dictionary to transform
        transformation: The function applied to each key

    Returns:
        The transformed dictionary
    """

    def rebuild(node) -> Any:
        # Anything that is neither a dict nor a list is a leaf and stays untouched
        if isinstance(node, dict):
            renamed = {}
            for key, value in node.items():
                new_key = transformation(key)
                renamed[new_key] = rebuild(value)
            return renamed
        if isinstance(node, list):
            return list(map(rebuild, node))
        return node

    transformed = {}
    for key, value in data.items():
        new_key = transformation(key)
        transformed[new_key] = rebuild(value)
    return transformed
24 changes: 24 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Any, Dict, List

from lxml.etree import XPath

from fundus.parser.data import LinkedDataMapping

# Sample linked data shared by the tests below: two entries carrying an "@type" and one entry
# without it. The second entry contains the key "_:*@", which is queried in test_xpath_search
# through its encoded form "_U003AU002AU0040".
lds: List[Dict[str, Any]] = [
{"@type": "Example1", "value": 1, "data": {"inner": "value", "nested": {"dict": True}}},
{"@type": "Example2", "value": 2, "_:*@": "Howdy"},
{"this": "should", "be": {"of": "type", "__UNKNOWN_TYPE__": True, "value": 3}},
]


class TestLinkedDataMapping:
    """Tests covering the construction and the XPath search of LinkedDataMapping."""

    def test_constructor(self):
        # Building a mapping from the sample LDs must not raise
        LinkedDataMapping(lds)

    def test_xpath_search(self):
        mapping = LinkedDataMapping(lds)

        # Pairs of XPath expression and the result list expected for it
        expectations = [
            ("//value", ["1", "2", "3"]),
            ("//UNKNOWN_TYPE//value", ["3"]),
            ("//_U003AU002AU0040", ["Howdy"]),
            ("//dict", ["True"]),
            ("//Example2", [{"@type": "Example2", "value": "2", "_:*@": "Howdy"}]),
        ]

        for expression, expected in expectations:
            assert mapping.xpath_search(XPath(expression)) == expected

0 comments on commit dc1ec9b

Please sign in to comment.