Skip to content

Commit

Permalink
🐛 text instead of inner text
Browse files: browse the repository at this point in the history
  • Loading branch information
asim-shrestha committed Nov 5, 2024
1 parent 2ea1fb8 commit 31548e8
Show file tree
Hide file tree
Showing 7 changed files with 13 additions and 13 deletions.
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.44.0"
version = "0.44.1"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
2 changes: 1 addition & 1 deletion core/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ async def capture_html(
:return: HTMLMetadata containing download URL, HTML content and inner text.
:raises ValueError: If the specified selector doesn't match any element.
"""
html, inner_text = await self._get_html(selector, exclude_selectors or [])
html, text = await self._get_html(selector, exclude_selectors or [])

downloads = await self._notify_observers(
method="on_download",
Expand All @@ -286,7 +286,7 @@ async def capture_html(
"url": downloads[0]["url"],
"filename": downloads[0]["filename"],
"html": html,
"inner_text": inner_text,
"text": text,
}

async def _get_html(
Expand All @@ -303,9 +303,9 @@ async def _get_html(
for selector in exclude_selectors:
for element_to_remove in soup.select(selector):
element_to_remove.decompose()
inner_text = soup.get_text(separator="\n", strip=True)
text = soup.get_text(separator="\n", strip=True)

return str(soup), inner_text
return str(soup), text

async def capture_pdf(
self,
Expand Down
2 changes: 1 addition & 1 deletion sdk/harambe/observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class DownloadMeta(TypedDict):

class HTMLMetadata(DownloadMeta):
html: str
inner_text: str
text: str


@runtime_checkable
Expand Down
2 changes: 1 addition & 1 deletion sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-sdk"
version = "0.44.0"
version = "0.44.1"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
6 changes: 3 additions & 3 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
doc_data = observer.data[0]
assert "<table" in doc_data["html"]
assert "<tbody" in doc_data["html"]
assert "Apple" in doc_data["inner_text"]
assert "Apple" in doc_data["text"]

# Verify download fields all available
assert doc_data["url"]
Expand All @@ -472,8 +472,8 @@ async def scraper(sdk: SDK, *args, **kwargs):
table_data = observer.data[1]
assert "<tbody" in doc_data["html"]
assert "<thead" not in table_data["html"]
assert "Price" not in table_data["inner_text"]
assert "Apple" in table_data["inner_text"]
assert "Price" not in table_data["text"]
assert "Apple" in table_data["text"]
print(doc_data)


Expand Down
4 changes: 2 additions & 2 deletions sdk/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 31548e8

Please sign in to comment.