main.py
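This script polls a Bureau of Meteorology flood warning page every 60 seconds, extracts the river height reading for Goolmangar Ck at Nimbin, appends each new reading to a CSV file, and launches a companion Flask app (nimbinriver.py) in a separate process to serve the data.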
# Required libraries
import requests
from bs4 import BeautifulSoup
import datetime
import time
import csv
import os
import subprocess
from zoneinfo import ZoneInfo  # Python 3.9+; resolves Sydney DST correctly

# Constants
URL = "http://www.bom.gov.au/cgi-bin/wrap_fwo.pl?IDN60140.html"
SCRAPING_INTERVAL = 60  # in seconds
CSV_FILE = 'river_height_data.csv'
SYDNEY_TZ = ZoneInfo('Australia/Sydney')

# Check if the CSV file already exists; if not, create it with headers
if not os.path.isfile(CSV_FILE):
    with open(CSV_FILE, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Station Name", "Date", "Time", "Height"])

# Keep track of the last height to avoid duplicate rows
last_height = None

def scrape_river_data(last_height):
    """Scrape the latest reading and append it to the CSV if the height changed."""
    # Set up a user-agent to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Perform the web request with the user-agent (and a timeout so a hung
    # connection cannot stall the polling loop)
    response = requests.get(URL, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the 'Goolmangar Ck at Nimbin' row. We assume the page structure is
    # <table> elements containing <tr> rows of <td> cells.
    target_text = 'Goolmangar Ck at Nimbin'
    tables = soup.find_all('table')
    goolmangar_row = None
    for tbl in tables:
        for row in tbl.find_all('tr'):
            cells = row.find_all('td')
            if cells and target_text in cells[0].text.strip():
                goolmangar_row = cells
                break
        if goolmangar_row:
            break

    if goolmangar_row is None:
        print(f"Could not find data for {target_text}.")
        # Return None placeholders so the caller can still unpack five values,
        # passing last_height through to maintain state
        return None, None, None, None, last_height

    # Extract the station name (first column) and height (third column);
    # timestamp the reading in Sydney time so date and time stay consistent
    station_name = goolmangar_row[0].text.strip()
    sydney_now = datetime.datetime.now(SYDNEY_TZ)
    date_str = sydney_now.strftime('%Y-%m-%d')
    time_str = sydney_now.strftime('%H:%M')
    height_str = goolmangar_row[2].text.strip()

    # Only write to the CSV file if the height has changed since the last reading
    if last_height is None or last_height != height_str:
        with open(CSV_FILE, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([station_name, date_str, time_str, height_str])
        last_height = height_str  # Update the last recorded height

    return station_name, date_str, time_str, height_str, last_height

# Start the Flask app from nimbinriver.py in a separate process
subprocess.Popen(['python', 'nimbinriver.py'])

# Call the scrape_river_data function periodically
while True:
    station_name, date, scraped_time, height, last_height = scrape_river_data(last_height)
    if station_name is None or date is None or scraped_time is None or height is None:
        print("Failed to scrape data. Retrying after delay...")
    else:
        print(f"Data scraped for {station_name} on {date} at {scraped_time}: River height is {height}.")
    time.sleep(SCRAPING_INTERVAL)
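
The script launches a companion Flask app from nimbinriver.py, which is not included here. A minimal sketch of what such an app might look like, assuming it only needs to serve the scraped CSV readings as JSON; the route and port are illustrative assumptions, not taken from the original:

# nimbinriver.py -- hypothetical sketch; the real app is not shown in this repo view.
# Assumes the app simply exposes the readings recorded by main.py.
import csv

from flask import Flask, jsonify

CSV_FILE = 'river_height_data.csv'

app = Flask(__name__)

@app.route('/')
def readings():
    # Read every recorded row from the CSV written by main.py
    with open(CSV_FILE, newline='') as file:
        rows = list(csv.DictReader(file))
    return jsonify(rows)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

With both files in the same directory, running python main.py creates the CSV if needed, starts the Flask process, and begins polling the BOM page every 60 seconds.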