Skip to content

Commit

Permalink
Merge branch 'tensorlakeai:main' into donut_focused
Browse files (browse the repository at this point in the history)
  • Loading branch information
Ak-Gautam authored Feb 1, 2024
2 parents 4aaea88 + 11fced6 commit 360580b
Show file tree
Hide file tree
Showing 6 changed files with 22,872 additions and 4,363 deletions.
39 changes: 26 additions & 13 deletions web-extractors/wikipedia/test_wikipedia
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,9 @@
import json
import unittest

from pathlib import Path
from typing import List, Type
from typing import List

from indexify_extractor_sdk.base_extractor import Content, Extractor
from parameterized import parameterized
from indexify_extractor_sdk.base_extractor import Content, Feature
from wikipedia import WikipediaExtractor


Expand All
@@ -14,20 +13,34 @@ class TestWikipediaExtractor(unittest.TestCase):
self.html_content = self._get_html_content()

def _get_html_content(self) -> List[Content]:
import os

dirname = os.path.dirname(__file__)
file_name = "Stephen_Curry.html"
path = str(Path(__file__).parent) + "/utils/" + file_name
with open(path, "r") as f:
file_path = os.path.join(dirname, "utils/", file_name)

with open(file_path, "rb") as f:
data = f.read()

content = Content.from_text(text=data, labels={"filename": file_name})
return Content(
data=data,
content_type="text/html",
feature=Feature.metadata({"filename": file_name}),
)

def test_wikipedia_extraction(self):
extracted_content = WikipediaExtractor().run_sample_input()
extracted_features = json.loads(extracted_content[0].feature.value)

self.assertIsNotNone(extracted_content[0], "No content extracted")

return content
self.assertEqual(extracted_features["filename"],
"Stephen_Curry.html",
"Filename not correctly extracted")

@parameterized.expand([WikipediaExtractor()])
def test_wikipedia_extraction(self, wikipedia_extractor: Type[Extractor]):
extracted_content = wikipedia_extractor.extract(self.html_content)
self.assertIsNotNone(extracted_content[0])
self.assertIsNotNone(extracted_content[0].labels)
self.assertEqual(extracted_features["title"],
"Stephen Curry",
"Title not correctly extracted")


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 360580b

Please sign in to comment.