# Web-Scrapper.py
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse

def descargar_assets(url, carpeta_destino):
    # Parse the start page with BeautifulSoup
    soup = obtener_soup(url)
    if soup is None:
        return
    # Print the URL of the page currently being processed
    print(f"Downloading from: {url}")
    # Get the page name without extensions or special characters
    nombre_pagina = obtener_nombre_pagina(url)
    # Create the folder structure to store the assets
    carpeta_pagina = os.path.join(carpeta_destino, nombre_pagina)
    carpeta_imagenes = os.path.join(carpeta_pagina, "Imagenes")
    carpeta_videos = os.path.join(carpeta_pagina, "Videos")
    carpeta_js = os.path.join(carpeta_pagina, "JS")
    # Create the folders if they don't exist
    os.makedirs(carpeta_imagenes, exist_ok=True)
    os.makedirs(carpeta_videos, exist_ok=True)
    os.makedirs(carpeta_js, exist_ok=True)
    # Download and save the image, video and JavaScript assets of the start page
    descargar_assets_pagina(url, soup, carpeta_imagenes, carpeta_videos, carpeta_js)
    # Find related links on the start page
    enlaces_relacionados = obtener_enlaces_relacionados(url, soup)
    for enlace in enlaces_relacionados:
        # Skip links that have already been visited
        if enlace not in visitados:
            # Download and save the assets of each related link
            soup_enlace = obtener_soup(enlace)
            if soup_enlace:
                # Print the URL of the related link
                print(f"Following link: {enlace}")
                descargar_assets_pagina(enlace, soup_enlace, carpeta_imagenes, carpeta_videos, carpeta_js)
            # Add the link to the visited set
            visitados.add(enlace)
    print("Asset download completed.")

def obtener_soup(url):
    # Make the HTTP request to get the content of the page
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    else:
        print(f"Error downloading the page: {response.status_code}")
        return None

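# Optional variant (a minimal sketch, not part of the original script): the same
# fetch with a request timeout and a User-Agent header, which some sites expect.
# The function name obtener_soup_robusto is hypothetical.
def obtener_soup_robusto(url, timeout=10):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; Web-Scrapper)"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the page: {e}")
        return None
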
def descargar_assets_pagina(url, soup, carpeta_imagenes, carpeta_videos, carpeta_js):
    # Download and save image, video and JavaScript assets referenced by the page
    for tag in soup.find_all(["img", "source", "script"]):
        if "src" not in tag.attrs:
            continue
        asset_url = urljoin(url, tag["src"])
        if not is_valid_url(asset_url):
            continue
        extension = os.path.splitext(asset_url)[1]
        if extension in [".jpg", ".jpeg", ".png"]:
            asset_type = "Image"
            carpeta_destino = carpeta_imagenes
        elif extension == ".mp4":
            asset_type = "Video"
            carpeta_destino = carpeta_videos
        elif extension == ".js":
            asset_type = "JavaScript file"
            carpeta_destino = carpeta_js
        else:
            continue
        asset_name = asset_url.split("/")[-1]
        asset_name = clean_filename(asset_name)  # Clean the file name
        asset_path = os.path.join(carpeta_destino, asset_name)
        # Download the asset and write it to disk
        asset_response = requests.get(asset_url)
        with open(asset_path, "wb") as f:
            f.write(asset_response.content)
        print(f"{asset_type} downloaded: {asset_name}")

def obtener_enlaces_relacionados(url, soup):
    # Collect internal (relative) links found on the page
    enlaces_relacionados = set()
    dominio_base = obtener_dominio_base(url)
    for a in soup.find_all('a'):
        if 'href' in a.attrs:
            href = a['href']
            # Skip anchors, absolute URLs and mailto links
            if not href.startswith('#') and not href.startswith('http') and not href.startswith('mailto'):
                enlace_completo = urljoin(dominio_base, href)
                enlaces_relacionados.add(enlace_completo)
    return enlaces_relacionados

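# Optional variant (a sketch): the original only follows relative links; this version
# also follows absolute links that stay on the same domain. The function name
# obtener_enlaces_mismo_dominio is hypothetical.
def obtener_enlaces_mismo_dominio(url, soup):
    dominio_base = obtener_dominio_base(url)
    enlaces = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('#') or href.startswith('mailto:'):
            continue
        enlace_completo = urljoin(url, href)
        if enlace_completo.startswith(dominio_base):
            enlaces.add(enlace_completo)
    return enlaces
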
def descargar_asset(url, carpeta_destino):
    # Standalone helper: download a single file into the destination folder
    nombre_archivo = url.split("/")[-1]  # Get the file name from the URL
    ruta_guardado = os.path.join(carpeta_destino, nombre_archivo)
    # Download the file
    response = requests.get(url)
    if response.status_code == 200:
        with open(ruta_guardado, "wb") as archivo:
            archivo.write(response.content)
        print(f"File downloaded successfully: {ruta_guardado}")
    else:
        print(f"Error downloading the file: {response.status_code}")

def obtener_nombre_pagina(url):
    # Remove disallowed characters for folder names
    caracteres_no_permitidos = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']
    nombre_pagina = url
    for caracter in caracteres_no_permitidos:
        nombre_pagina = nombre_pagina.replace(caracter, '')
    return nombre_pagina

def obtener_dominio_base(url):
    # Return the scheme and host part of the URL
    parsed_url = urlparse(url)
    dominio_base = parsed_url.scheme + "://" + parsed_url.netloc
    return dominio_base

def is_valid_url(url):
    """
    Check whether a URL is reachable (a HEAD request returns 200).
    """
    try:
        response = requests.head(url)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False

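# Optional variant (a sketch): is_valid_url issues an extra HEAD request per asset;
# this cheaper check only inspects the URL scheme and lets the later GET report errors.
# The function name is_probably_downloadable is hypothetical.
def is_probably_downloadable(url):
    return urlparse(url).scheme in ("http", "https")
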
def clean_filename(filename):
    """
    Clean a file name by removing characters that are illegal in file names.
    """
    invalid_chars = r'<>:"/\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '')
    return filename

# Set to store the visited pages
visitados = set()

# Usage
if __name__ == "__main__":
    url_pagina = input("URL: ")
    carpeta_destino = "Assets"
    descargar_assets(url_pagina, carpeta_destino)

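# Example run (hypothetical URL): entering https://example.com would create
#   Assets/httpsexample.com/Imagenes  for .jpg / .jpeg / .png files
#   Assets/httpsexample.com/Videos    for .mp4 files
#   Assets/httpsexample.com/JS        for .js files
# where the folder name "httpsexample.com" is the URL with the characters
# \ / : * ? " < > | removed by obtener_nombre_pagina.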