Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change the order of local tags #26

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion bsmetadata/metadata_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,13 @@ class HtmlProcessor(MetadataProcessor):
def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Return the opening and closing tag strings for one local HTML metadata entry.

    Args:
        metadata_attrs: Metadata entry whose ``"value"`` is a dict holding the tag
            name under ``"tag"`` and, under ``"attrs"``, two parallel lists:
            ``"attr"`` (attribute names) and ``"value"`` (attribute values).

    Returns:
        A ``(start, end)`` pair such as ``('<a href:"x">', '</a>')``. Attributes
        are rendered as ``name:"value"``, separated by single spaces, and appear
        only in the opening tag.

    Raises:
        KeyError: If ``"value"``, ``"tag"``, ``"attrs"``, ``"attr"`` or the inner
            ``"value"`` key is missing.
    """
    # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>".
    # Example: An <b>apple</b> is an edible fruit.
    html_value = metadata_attrs["value"]
    tag = html_value["tag"]
    attrs = html_value["attrs"]

    # Names and values are stored as parallel lists; zip pairs them positionally.
    attributes = " ".join(f'{attr}:"{value}"' for attr, value in zip(attrs["attr"], attrs["value"]))
    if attributes:
        # Separate the attribute list from the tag name only when there is one.
        attributes = " " + attributes
    return f"<{tag}{attributes}>", f"</{tag}>"


class UrlProcessor(MetadataProcessor):
Expand Down
81 changes: 81 additions & 0 deletions tests/test_metadata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from transformers import GPT2TokenizerFast

from bsmetadata.input_pipeline import DataConfig
from bsmetadata.metadata_processors import PROCESSORS, HtmlProcessor, MetadataProcessor
from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor
from bsmetadata.metadata_utils import (
add_local_metadata_to_text,
Expand Down Expand Up @@ -57,6 +58,76 @@ def setUp(self) -> None:
{"key": "url", "type": "global", "value": "callto:RickAndMorty/Year%202021/"},
],
},
{
"id": "0004",
"text": "useless text The Walking Dead (season 8)\n",
"metadata": [
{
"char_start_idx": 13,
"value": {
"tag": "h1",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 40,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 0,
"value": {"tag": "a", "attrs": {"attr": [], "value": []}},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "a",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {"tag": "i", "attrs": {"attr": [], "value": []}},
"char_end_idx": 29,
"key": "html",
"type": "local",
},
],
},
]

def test_chunks(self):
Expand Down Expand Up @@ -133,6 +204,16 @@ def test_add_no_metadata_and_chunk_examples(self):
for example in mapped_ds:
self.assertTrue(all(not x for x in example["metadata_mask"]))

def test_add_html_tags(self):
cfg = DataConfig()
cfg.metadata_list = ["html"]
PROCESSORS["html"] = HtmlProcessor

text1, mask1 = add_local_metadata_to_text(self.examples[3], cfg)
target_text = '<a>useless text </a><div><a><div><div></div></div></a></div><h1><i>The Walking Dead</i> (season 8)</h1>\n'
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before the change proposed in this PR, the result was:

<a>useless text </div></a></div></a></div><h1><i><div><a><div><div>The Walking Dead</i> (season 8)</h1>


self.assertEqual(text1, target_text)

def test_add_metadata_and_chunk_examples(self):
cfg = DataConfig()
cfg.metadata_list = ["url", "timestamp", "html", "entity"]
Expand Down