from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Section titles exactly as they appear on the (Romanian) product pages.
DETAILS = "Detalii produs"              # "Product details"
ACTIVE_SUBSTANCES = "Substanţe active"  # "Active substances"
RISK_AND_SECURITY = "Fraze de risc şi siguranţă"  # "Risk and safety phrases"
USAGES = "Utilizări"                    # "Uses"
MRL = "MRL"                             # maximum residue limits

def get_product_json(browser, url):
    """Scrape a product page and return its sections as a dict.

    `browser` is an already-initialised Selenium WebDriver; `url` is the
    product page to load. Rows whose cells cannot be found are skipped.
    """
    # %%
    product_json = {
        DETAILS: {},
        ACTIVE_SUBSTANCES: {},
        RISK_AND_SECURITY: {},
        USAGES: {},
        MRL: {}
    }

    browser.get(url)
    main_table = browser.find_element(By.XPATH, "//table[1]")
    product_table = main_table.find_element(By.XPATH, ".//table[2]")

    # %%
    # Details
    details = {}
    details_table = product_table.find_element(By.XPATH, ".//table[1]")
    tr_elements = details_table.find_elements(By.TAG_NAME, "tr")
    for tr in tr_elements:
        try:
            name = tr.find_element(By.XPATH, ".//td[1]").text.replace(":", "").strip()
            value = tr.find_element(By.XPATH, ".//td[2]").text.strip()
            details[name] = value
        except NoSuchElementException:
            print("Some rows from the DETAILS section have been skipped.")
            continue
    product_json[DETAILS] = details
    # print(details)
    # %%
    # Active substances
    active_substances = {}
    active_substances_table = product_table.find_element(By.XPATH, ".//table[2]")
    tr_elements = active_substances_table.find_elements(By.TAG_NAME, "tr")
    # The second row holds the column headers; data rows follow it.
    header = tr_elements[1] if len(tr_elements) > 1 else None
    if header:
        h1 = header.find_element(By.XPATH, ".//td[1]").text
        h2 = header.find_element(By.XPATH, ".//td[2]").text
        h3 = header.find_element(By.XPATH, ".//td[3]").text
        for idx, tr in enumerate(tr_elements[2:]):
            try:
                substance_name = tr.find_element(By.XPATH, ".//td[1]").text
                value = tr.find_element(By.XPATH, ".//td[2]").text
                um = tr.find_element(By.XPATH, ".//td[3]").text
                active_substances[str(idx)] = {
                    h1: substance_name,
                    h2: value,
                    h3: um
                }
            except NoSuchElementException:
                print("Some rows from the ACTIVE SUBSTANCES section have been skipped.")
                continue
    product_json[ACTIVE_SUBSTANCES] = active_substances
    # print(active_substances)
    # %%
    # Risk and security
    risk_and_security = {}
    risk_and_security_table = product_table.find_element(By.XPATH, ".//table[3]")
    tr_elements = risk_and_security_table.find_elements(By.TAG_NAME, "tr")
    header = tr_elements[1] if len(tr_elements) > 1 else None
    if header:
        h1 = header.find_element(By.XPATH, ".//td[1]").text
        h2 = header.find_element(By.XPATH, ".//td[2]").text
        for idx, tr in enumerate(tr_elements[2:]):
            try:
                phrase = tr.find_element(By.XPATH, ".//td[1]").text
                category = tr.find_element(By.XPATH, ".//td[2]").text
                risk_and_security[str(idx)] = {
                    h1: phrase,
                    h2: category
                }
            except NoSuchElementException:
                print("Some rows from the RISK AND SECURITY section have been skipped.")
                continue
    product_json[RISK_AND_SECURITY] = risk_and_security
    # print(risk_and_security)
    # %%
    # Usages
    usages = {}
    usages_table = product_table.find_element(By.XPATH, ".//table[4]")
    tr_elements = usages_table.find_elements(By.TAG_NAME, "tr")
    header = tr_elements[1] if len(tr_elements) > 1 else None
    if header:
        h1 = header.find_element(By.XPATH, ".//td[1]").text
        h2 = header.find_element(By.XPATH, ".//td[2]").text
        h3 = header.find_element(By.XPATH, ".//td[3]").text
        h4 = header.find_element(By.XPATH, ".//td[4]").text
        h5 = header.find_element(By.XPATH, ".//td[5]").text
        h6 = header.find_element(By.XPATH, ".//td[6]").text
        for idx, tr in enumerate(tr_elements[2:]):
            try:
                crop = tr.find_element(By.XPATH, ".//td[1]").text
                harmful_agent = tr.find_element(By.XPATH, ".//td[2]").text
                name = tr.find_element(By.XPATH, ".//td[3]").text
                dose = tr.find_element(By.XPATH, ".//td[4]").text
                waiting_time = tr.find_element(By.XPATH, ".//td[5]").text
                no_treatments = tr.find_element(By.XPATH, ".//td[6]").text
                usages[str(idx)] = {
                    h1: crop,
                    h2: harmful_agent,
                    h3: name,
                    h4: dose,
                    h5: waiting_time,
                    h6: no_treatments
                }
            except NoSuchElementException:
                print("Some rows from the USAGES section have been skipped.")
                continue
    product_json[USAGES] = usages
    # print(usages)
    # %%
    # MRL
    mrl = {}
    mrl_table = product_table.find_element(By.XPATH, ".//table[5]")
    tr_elements = mrl_table.find_elements(By.TAG_NAME, "tr")
    header = tr_elements[1] if len(tr_elements) > 1 else None
    if header:
        h1 = header.find_element(By.XPATH, ".//td[1]").text
        h2 = header.find_element(By.XPATH, ".//td[2]").text
        h3 = header.find_element(By.XPATH, ".//td[3]").text
        for idx, tr in enumerate(tr_elements[2:]):
            try:
                residues = tr.find_element(By.XPATH, ".//td[1]").text
                vegetable_product = tr.find_element(By.XPATH, ".//td[2]").text
                # Use a separate name for the cell value so it does not
                # shadow the `mrl` dict being built.
                mrl_value = tr.find_element(By.XPATH, ".//td[3]").text
                mrl[str(idx)] = {
                    h1: residues,
                    h2: vegetable_product,
                    h3: mrl_value
                }
            except NoSuchElementException:
                print("Some rows from the MRL section have been skipped.")
                continue
    product_json[MRL] = mrl
    # print(mrl)

    return product_json
# %%
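
# Usage sketch: one way get_product_json might be called. The Chrome driver
# and the placeholder URL below are assumptions for illustration; swap in a
# real product-page URL and whichever Selenium-supported browser is
# configured locally.
if __name__ == "__main__":
    import json

    from selenium import webdriver

    driver = webdriver.Chrome()  # assumes a matching chromedriver is installed
    try:
        # Hypothetical placeholder URL; replace with an actual product page.
        data = get_product_json(driver, "https://example.com/product-page")
        print(json.dumps(data, ensure_ascii=False, indent=2))
    finally:
        driver.quit()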