Skip to content

Commit

Permalink
Merge pull request #85 from ADORSYS-GIS/50-implement-scraping-logic
Browse files Browse the repository at this point in the history
Enhance scraping logic to scrape both images and the number of text characters
  • Loading branch information
NkwaTambe authored Nov 25, 2023
2 parents b795a7f + e024e10 commit 00a5f57
Showing 1 changed file with 33 additions and 0 deletions.
33 changes: 33 additions & 0 deletions scraping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from urllib.error import HTTPError, URLError

def scrape_website(url, timeout=10):
    """Download *url* and collect its paragraph text and image sources.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A tuple ``(extracted_data, total_characters)`` on success, where
        ``extracted_data`` is a list holding the text of every ``<p>``
        element followed by the ``src`` of every ``<img>`` element that has
        one, and ``total_characters`` is the combined length of the
        paragraph texts. Returns ``None`` if the request or the parsing
        fails; the error is printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError so they are reported below.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        paragraphs = [element.text for element in soup.find_all('p')]
        # src=True skips <img> tags that carry no src attribute.
        image_sources = [img['src'] for img in soup.find_all('img', src=True)]
        total_characters = sum(len(text) for text in paragraphs)

        return paragraphs + image_sources, total_characters

    except requests.HTTPError as e:
        print("HTTP error occurred:")
        print(e)
    except requests.RequestException as e:
        # Covers connection failures and timeouts, among others.
        print("An error occurred:")
        print(e)
    except Exception as e:
        print(f"Error scraping website: {url}")
        # Report the cause too; otherwise the failure is undiagnosable.
        print(e)

    # Every failure path ends here; callers must check for None.
    return None


0 comments on commit 00a5f57

Please sign in to comment.