# web-info-scraping.py
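"""Scrape per-patient clinical records from the iCTCF resource
(http://ictcf.biocuckoo.cn) into a single CSV file.

For each patient page the script collects the overview table (hospital, age,
gender, ...) and the lab-finding tables (keyed by test abbreviation), then
appends one row per patient to scrape/data.csv. Requires Selenium with a
local chromedriver.
"""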
import csv
import os
from urllib.error import HTTPError  # for the (currently disabled) image download step

import wget  # for the (currently disabled) image download step
from bs4 import BeautifulSoup
from selenium import webdriver

start_index = 1
output_directory = 'scrape'
URL = 'http://ictcf.biocuckoo.cn/Resource.php'  # resource listing (patient ids are read from scrape/ below)
domain = 'http://ictcf.biocuckoo.cn/'
driver = webdriver.Chrome('/Users/darylfung/chromedriver')  # local chromedriver binary

headers_filename = 'scrape/headers.txt'
csv_filename = os.path.join(output_directory, 'data.csv')

# Fixed overview columns; False marks a value that has not been scraped yet.
all_headers = {'patient id': False, 'hospital': False, 'age': False, 'gender': False,
               'body temperature': False, 'underlying disease': False, 'is_covid': False,
               'is_ct': False, 'morbidity': False, 'mortality': False}
other_info_header_set = False
def get_overview_info(bsoup):
    """Parse the overview table (hospital, age, gender, ...) into a copy of all_headers."""
    overview_table = bsoup.find('table', {"class": "array1"})
    table_rows = overview_table.findAll('tr')[1:]  # skip the header row
    all_info = all_headers.copy()
    for table_row in table_rows:
        # Drop the last two characters of the label (its ': ' suffix) before matching.
        row_label = table_row.find('td', {'class': 'tablabel'}).text[:-2].lower()
        row_content = table_row.find('td', {'class': 'content'}).text
        if 'hospital' in row_label:
            all_info['hospital'] = row_content
        elif 'age' in row_label:
            all_info['age'] = row_content
        elif 'gender' in row_label:
            all_info['gender'] = row_content
        elif 'temperature' in row_label:
            # 'body temperature' is declared in all_headers but was never filled in;
            # this branch assumes the row's label contains 'temperature'.
            all_info['body temperature'] = row_content
        elif 'underlying' in row_label:
            all_info['underlying disease'] = row_content
        elif 'sars-cov-2' in row_label:
            all_info['is_covid'] = row_content
        elif 'computed' in row_label:
            all_info['is_ct'] = row_content
        elif 'morbidity' in row_label:
            all_info['morbidity'] = row_content
        elif 'mortality' in row_label:
            all_info['mortality'] = row_content
    return all_info
def get_other_info(bsoup, all_info):
    """Parse the remaining tables (lab findings), keyed by each test's abbreviation."""
    other_tables = bsoup.findAll('table')[1:]
    for other_table in other_tables:
        row_infos = other_table.findAll('tr')[1:]  # skip the header row
        for row_info in row_infos:
            each_infos = row_info.findAll('td')
            name_abbreviation = each_infos[1].text
            value = each_infos[2].text
            all_info[name_abbreviation] = value
    return all_info
domain_url = 'http://ictcf.biocuckoo.cn/view.php?id='


def get_headers(patients):
    """Fill all_headers, either from a cached headers.txt or by crawling every patient page."""
    try:
        with open(headers_filename, 'r') as f:
            headers = f.read()
        for header in headers.split(","):
            if header not in all_headers:
                all_headers[header] = False
        return
    except FileNotFoundError:
        print("header file not found, generating headers")

    # No cached header file: visit every patient page once to collect the full
    # set of lab-finding abbreviations, so the CSV columns cover all patients.
    for patient in patients:
        if 'Patient' not in patient:
            continue
        driver.get(domain_url + patient)
        bsoup = BeautifulSoup(driver.page_source, 'html.parser')
        other_tables = bsoup.findAll('table')[1:]
        for other_table in other_tables:
            row_infos = other_table.findAll('tr')[1:]
            for row_info in row_infos:
                each_infos = row_info.findAll('td')
                name_abbreviation = each_infos[1].text
                if name_abbreviation not in all_headers:
                    all_headers[name_abbreviation] = False
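    # Assumption: headers.txt is a cache of this header list (it is only ever
    # read above), so write the generated headers back out so the next run can
    # skip the crawl.
    with open(headers_filename, 'w') as f:
        f.write(",".join(all_headers.keys()))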
# Patient ids are taken from the entries already saved under scrape/
# (assumed to be named like 'Patient 1152').
all_patients = os.listdir('scrape/')
all_patients = sorted(all_patients)
get_headers(all_patients)

# Convert the header dictionary to a list and write it as the CSV's first row.
# Note the 'a' (append) mode: re-running adds another header row to an existing file.
headers = list(all_headers.keys())
with open(csv_filename, 'a') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
# Resume an interrupted run: skip ahead until 'Patient 1152', whose predecessors
# are assumed to be in data.csv already.
start = False
for patient in all_patients:
    if 'Patient' not in patient:
        continue
    if patient == 'Patient 1152':
        start = True
    if start:
        driver.get(domain_url + patient)
        bsoup = BeautifulSoup(driver.page_source, 'html.parser')
        all_info = get_overview_info(bsoup)
        all_info = get_other_info(bsoup, all_info)
        all_info['patient id'] = patient
        # Values follow the dict's insertion order, which matches the header row
        # as long as every patient shares the same set of fields.
        with open(csv_filename, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(list(all_info.values()))
        # save image
        # patient_path = os.path.join(output_directory, patient)
        # os.makedirs(patient_path, exist_ok=True)
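        # A hedged sketch of the download step the comments above gesture at;
        # the image URL layout ('CT/<patient>.jpg') is a guess, not taken from
        # the site, which is why this stays disabled like the lines above:
        # try:
        #     wget.download(domain + 'CT/' + patient + '.jpg', out=patient_path)
        # except HTTPError:
        #     print('no image found for', patient)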