haaretz_scrape.py
from bs4 import BeautifulSoup
import requests
import json
from modules import group_dates
import logging
from modules.extensions import search_sort, update_algolia

HAARETZ_SITEMAP = "https://www.haaretz.co.il/sitemap.xml"

def open_json(fname: str) -> dict:
    """
    Opens a JSON file and returns its contents as a dictionary.

    Args:
        fname (str): The file name or path.

    Returns:
        dict: The contents of the JSON file as a dictionary.
    """
    with open(fname, "r") as file:
        j_dict = json.load(file)
    logging.info(f"Loaded {fname}")
    return j_dict

def save_json(fname: str, data: dict) -> None:
    """
    Saves a dictionary as JSON to a file.

    Args:
        fname (str): The file name or path.
        data (dict): The dictionary to be saved.

    Returns:
        None
    """
    with open(fname, "w") as file:
        json.dump(data, file, indent=4)
    logging.info(f"Saved {fname}")
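
# Illustrative usage of the two JSON helpers (assumes a local "data/" directory
# exists; the file name is a placeholder, not one the script actually writes):
#     save_json("data/example.json", {"key": "value"})
#     loaded = open_json("data/example.json")  # -> {"key": "value"}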

def fetch_sitemap():
    """
    Fetches the Haaretz sitemap and returns a list of URLs.

    Returns:
        list: A list of URLs from the Haaretz sitemap.
    """
    response = requests.get(HAARETZ_SITEMAP)  # Fetch the sitemap index
    soup = BeautifulSoup(response.content, "xml")  # Parse the XML content
    # Extract the URLs from the sitemap
    sitemap = [loc.text for loc in soup.find_all("loc")]
    return sitemap
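
# Note: the <loc> entries of the top-level sitemap are themselves sitemap URLs
# (apparently one per month), not article URLs. fetch_urls() and fetch_months()
# below request each of these sub-sitemaps to collect the individual article URLs.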

def save_sitemap_to_json():
    """
    Saves the Haaretz sitemap to a JSON file.

    Returns:
        None
    """
    sitemap = fetch_sitemap()
    with open("data/haaretz_sitemap.json", "w") as file:
        json.dump(sitemap, file, indent=4)

def fetch_urls():
    """
    Fetches article URLs from each monthly sub-sitemap and groups them by month.

    Returns:
        dict: A dictionary mapping each monthly sitemap URL to its magazine article URLs.
    """
    sitemap = fetch_sitemap()  # Fetch the sitemap index
    mag_urls = open_json("data/mag_urls.json")  # Load existing URLs
    for month in sitemap:  # Iterate over the monthly sub-sitemaps
        if month in mag_urls:  # Skip months that have already been processed
            print(f"Skipping: {month}")
            continue
        response = requests.get(month)
        print(f"Fetching URLs from: {month}")
        print(f"Response Code: {response.status_code}")
        soup = BeautifulSoup(response.content, "xml")
        urls = [loc.text for loc in soup.find_all("loc")]
        site_relevant_urls = []
        for url in urls:
            if url.startswith("https://www.haaretz.co.il/magazine/"):
                print(f"URL: {url}")
                site_relevant_urls.append(url)
        mag_urls[month] = site_relevant_urls
        save_json("data/mag_urls.json", mag_urls)  # Persist progress after each month
    group_dates.main()
    return mag_urls
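
# Illustrative shape of the resulting data/mag_urls.json (the URLs are
# placeholders, not real entries):
# {
#     "https://www.haaretz.co.il/sitemap-<month>.xml": [
#         "https://www.haaretz.co.il/magazine/<article-path>",
#         "..."
#     ]
# }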

def add_titles():
    """
    Fetches titles for articles and adds them to the 'titled_urls' dictionary.

    This function iterates over the 'grouped_articles' dictionary, which contains
    articles grouped by month and date. For each date, it fetches the titles for
    the corresponding URLs and adds them to the 'titled_urls' dictionary.
    The 'titled_urls' dictionary is then saved to a JSON file.

    Parameters:
        None

    Returns:
        None
    """
    grouped_articles = open_json("data/grouped_articles.json")
    titled_urls = open_json("data/titled_urls.json")  # Load existing titled URLs
    if titled_urls:
        print("Loaded existing titled URLs.")
    for month, dates in grouped_articles.items():
        for date, urls in dates.items():
            if month in titled_urls and date in titled_urls[month]:
                print(f"Skipping already processed date: {date} in {month}")
                continue
            titled_urls.setdefault(month, {})[date] = {}
            print(f"Fetching titles for: {date}")
            for url in urls:
                title = get_title(url)
                titled_urls[month][date][url] = title if title else "No title found"
            print(f"Processed {date} for {month}")
            save_json("data/titled_urls.json", titled_urls)  # Persist progress per date
    print("Titles added and saved.")

def get_title(url: str) -> str:
    """
    Fetches the title of a web page given its URL.

    Args:
        url (str): The URL of the web page.

    Returns:
        str: The title of the web page, or None if an error occurred.
    """
    title = None
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        title = soup.find("title").text
        print(f"Response: {response}")
        print(f"Title: {title}")
        print(f"URL: {url}")
    except Exception as e:
        # Report the error and fall through to return None; re-requesting the
        # page here could itself fail and mask the original error.
        print(f"Error fetching title for {url}: {e}")
    return title
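
# Illustrative usage (the URL is a placeholder, not a real article):
#     title = get_title("https://www.haaretz.co.il/magazine/<article-path>")
#     # -> the page's <title> text, or None if the request or parsing failed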

def fetch_months():
    """
    Fetches the articles grouped by month.

    Returns:
        dict: A dictionary containing the articles grouped by month.
    """
    urls = fetch_sitemap()
    articles = {}
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "xml")
        article = [loc.text for loc in soup.find_all("loc")]
        articles[url] = article
    return articles
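
# Note: unlike fetch_urls(), this collects every <loc> entry from each monthly
# sub-sitemap without filtering to the /magazine/ section, and it does not
# persist anything to disk.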

def main():
    """
    Main function to fetch URLs, add titles, and perform search and sort operations.

    Returns:
        None
    """
    print("Fetching URLs...")
    fetch_urls()
    print("Adding titles...")
    add_titles()
    print("Organizing articles for search DB...")
    search_sort.main()
    print("Updating Algolia...")
    update_algolia.main()
    print("Algolia updated successfully.")

if __name__ == "__main__":
    main()  # main() already calls add_titles(), so no separate call is needed here