# url.py
# Source: fork of aws-samples/bedrock-claude-chat.
import logging
import urllib.error
import urllib.request
from typing import Literal
from embedding.loaders.base import BaseLoader, Document
from embedding.loaders.playwright import (
DelayUnstructuredHtmlEvaluator,
PlaywrightURLLoader,
)
from embedding.loaders.unstructured import UnstructuredURLLoader
from embedding.loaders.youtube import YoutubeLoaderWithLangDetection, _parse_video_id
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Delay, in seconds, to wait for the page to be rendered by JavaScript.
# Passed as `delay_sec` to DelayUnstructuredHtmlEvaluator in get_loader().
DELAY_SEC = 2
def get_loader(loader_type: str, urls: list[str]) -> BaseLoader:
    """Build the loader registered for ``loader_type`` over ``urls``.

    Only the requested loader is constructed; the other loader classes are
    never instantiated for this call.

    Args:
        loader_type: One of ``"web"``, ``"unstructured"`` or ``"youtube"``.
        urls: URLs handed to the selected loader.

    Returns:
        The loader instance selected by ``loader_type``.

    Raises:
        KeyError: If ``loader_type`` is not one of the known loader types.
    """
    # Map each type to a zero-argument factory instead of a ready-made
    # instance, so a call does not build (and immediately discard) the two
    # loaders it was not asked for. The name also no longer shadows the
    # builtin ``map``.
    factories = {
        "web": lambda: PlaywrightURLLoader(
            urls=urls, evaluator=DelayUnstructuredHtmlEvaluator(delay_sec=DELAY_SEC)
        ),
        "unstructured": lambda: UnstructuredURLLoader(urls, request_timeout=30),
        "youtube": lambda: YoutubeLoaderWithLangDetection(urls),
    }
    return factories[loader_type]()
def check_content_type(url) -> Literal["web", "unstructured", "youtube"]:
    """Decide which loader category should handle ``url``.

    Args:
        url: The URL to classify.

    Returns:
        ``"youtube"`` when ``_parse_video_id(url)`` is truthy; otherwise
        ``"web"`` when a HEAD request reports a ``Content-Type`` containing
        ``text/html``; otherwise ``"unstructured"``. Any exception raised
        while performing the HEAD request is logged as a warning and also
        yields ``"unstructured"``.
    """
    if _parse_video_id(url):
        return "youtube"

    # Using urllib.request instead of requests to avoid 403
    # Ref: https://stackoverflow.com/questions/74446830/how-to-fix-403-forbidden-errors-with-python-requests-even-with-user-agent-head
    head_request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
            "Accept": "*/*",
            "Accept-Language": "*",
        },
        method="HEAD",
    )

    try:
        with urllib.request.urlopen(head_request, timeout=30) as resp:
            # Lower-cased so the media-type comparison below is
            # case-insensitive.
            reported_type = resp.headers.get("Content-Type", "").lower()
    except Exception as e:
        # Best effort: a failed probe falls back to the unstructured loader.
        logger.warning(
            f"Failed to get content type of {url}: {e}. Use unstructured to load."
        )
        return "unstructured"

    return "web" if "text/html" in reported_type else "unstructured"
def group_urls_by_content_type(urls: list[str]) -> dict:
    """Bucket ``urls`` by the category ``check_content_type`` assigns them.

    Args:
        urls: URLs to classify.

    Returns:
        A dict with the keys ``"web"``, ``"unstructured"`` and ``"youtube"``
        (in that order), each mapping to the list of URLs of that category
        in their input order. Categories with no URLs map to an empty list.
    """
    buckets: dict = {kind: [] for kind in ("web", "unstructured", "youtube")}
    for candidate in urls:
        buckets[check_content_type(candidate)].append(candidate)
    return buckets
class UrlLoader(BaseLoader):
    """Loads documents from a list of URLs.

    The URLs are grouped with ``group_urls_by_content_type`` and each
    non-empty group is loaded by the loader ``get_loader`` returns for it.
    """

    def __init__(self, urls: list[str]):
        """Store the URLs to load.

        Args:
            urls: URLs whose content should be loaded.
        """
        self._urls = urls

    def load(self) -> list[Document]:
        """Load every URL and return the combined documents.

        Returns:
            The documents produced by each category's loader, concatenated
            in the order the categories appear in the grouping dict.
        """
        categorized_urls = group_urls_by_content_type(self._urls)
        logger.info(f"URLs are categorized as: {categorized_urls}")

        res: list[Document] = []
        for loader_type, urls in categorized_urls.items():
            if not urls:
                # No URLs of this kind: do not build or run a loader for an
                # empty list.
                continue
            res.extend(get_loader(loader_type, urls).load())
        return res