parse.py
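
"""Parse locally archived cave pages into a JSON Lines file.

For each subdirectory of CAVES_DIR, extract the details table and image
metadata from the saved HTML page and append one JSON object per cave to
OUTPUT_FILE. Progress and problems are logged to the console and to a
timestamped file under LOG_DIR.
"""
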
import os
import json
import re
import logging
from bs4 import BeautifulSoup
from datetime import datetime

# Configuration
LOG_LEVEL = logging.INFO
LOG_DIR = "logs"
CAVES_DIR = "caves"
OUTPUT_FILE = "caves.jsonl"
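
# Expected on-disk layout, inferred from the lookups in parse_cave_directory
# (the file names are whatever the download step saved; treat them as assumptions):
#
#   caves/
#       <cave_id>/
#           page_web_archive.html         # or page.html as a fallback
#           metadata_<image_id>.json      # one per image
#           image_<image_id>_zoom_10.jpg  # one per image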


def setup_logging():
    """Configure logging with both file and console handlers."""
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filename = os.path.join(LOG_DIR, f"parse_{timestamp}.log")
    logging.basicConfig(
        level=LOG_LEVEL,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.FileHandler(log_filename, encoding="utf-8"), logging.StreamHandler()],
    )
    logging.info("Logging setup complete.")


def parse_cave_directory(cave_path):
    """Parse a single cave directory and return its data as a dict, or None on failure."""
    data = {}

    # Prefer page_web_archive.html; fall back to page.html
    page_html_path = os.path.join(cave_path, "page_web_archive.html")
    if not os.path.exists(page_html_path):
        page_html_path = os.path.join(cave_path, "page.html")
    if not os.path.exists(page_html_path):
        logging.warning(f"Neither 'page_web_archive.html' nor 'page.html' found in {cave_path}")
        return None

    try:
        with open(page_html_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    except Exception as e:
        logging.error(f"Error reading HTML file in {cave_path}: {e}")
        return None

    soup = BeautifulSoup(html_content, "html.parser")
    table = soup.find("table", id="tableDetails1")
    if not table:
        logging.warning(f"Table with id 'tableDetails1' not found in {page_html_path}")
        return None

    # Extract key information from the table
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) == 2:
            field_name = tds[0].get_text(strip=True)
            field_value = tds[1].get_text(strip=True)
            # Special case: 'Długość [m]' and 'w tym szacowane [m]' share one
            # row, so get_text() concatenates their labels into a single string
            if field_name == "Długość [m]w tym szacowane [m]":
                # Extract the two values from the nested divs
                divs = tds[1].find_all("div")
                if len(divs) >= 2:
                    data["Długość [m]"] = divs[0].get_text(strip=True)
                    data["w tym szacowane [m]"] = divs[1].get_text(strip=True)
                else:
                    data["Długość [m]"] = field_value
            else:
                # Strip the trailing colon from the field name
                field_name = field_name.rstrip(":")
                data[field_name] = field_value

    # Collect image information from showImageInfo(...) links
    images = []
    image_links = soup.find_all("a", onclick=re.compile(r"showImageInfo\(\d+\)"))
    for a_tag in image_links:
        onclick_attr = a_tag["onclick"]
        match = re.search(r"showImageInfo\((\d+)\)", onclick_attr)
        if match:
            image_id = match.group(1)
            image_data = {}
            metadata_file = os.path.join(cave_path, f"metadata_{image_id}.json")
            image_file = os.path.join(cave_path, f"image_{image_id}_zoom_10.jpg")

            # Load the metadata file if it exists
            if os.path.exists(metadata_file):
                try:
                    with open(metadata_file, "r", encoding="utf-8") as mf:
                        metadata = json.load(mf)
                    image_data["metadata"] = metadata
                except Exception as e:
                    logging.error(f"Error reading metadata file {metadata_file}: {e}")
                    image_data["metadata"] = None
            else:
                logging.warning(f"Metadata file {metadata_file} not found")
                image_data["metadata"] = None

            # Add the relative path to the image if it exists
            if os.path.exists(image_file):
                image_data["image_path"] = os.path.relpath(image_file)
            else:
                logging.warning(f"Image file {image_file} not found")
                image_data["image_path"] = None

            # Use the link text as the image description
            image_description = a_tag.get_text(strip=True)
            image_data["description"] = image_description
            images.append(image_data)

    data["images"] = images

    # The cave ID is the directory name
    cave_id = os.path.basename(cave_path)
    data["cave_id"] = cave_id
    return data
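
# Shape of one output record, for orientation. Keys other than "images" and
# "cave_id" come straight from the page's details table, so the Polish labels
# and values below are illustrative examples, not a fixed schema:
#
#   {"Nazwa": "...", "Długość [m]": "320", "w tym szacowane [m]": "15",
#    "images": [{"metadata": {...},
#                "image_path": "caves/J.123/image_4_zoom_10.jpg",
#                "description": "Plan jaskini"}],
#    "cave_id": "J.123"}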


def main():
    setup_logging()
    logging.info("Starting parsing of caves data.")
    output_path = OUTPUT_FILE
    with open(output_path, "w", encoding="utf-8") as out_f:
        caves_processed = 0
        caves_skipped = 0
        for cave_dir in sorted(os.listdir(CAVES_DIR)):
            cave_path = os.path.join(CAVES_DIR, cave_dir)
            if os.path.isdir(cave_path):
                logging.info(f"Processing cave {cave_dir}")
                cave_data = parse_cave_directory(cave_path)
                if cave_data:
                    out_f.write(json.dumps(cave_data, ensure_ascii=False) + "\n")
                    caves_processed += 1
                else:
                    logging.warning(f"Skipping cave {cave_dir} due to missing or invalid data.")
                    caves_skipped += 1
    logging.info(f"Processing complete. Caves processed: {caves_processed}, Caves skipped: {caves_skipped}")


if __name__ == "__main__":
    main()
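
# Typical run, assuming the caves/ directory sits next to this script:
#   python parse.py
# Results land in caves.jsonl; a timestamped log is written under logs/.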