#!/usr/bin/python3
#==============================================================================#
# #
# [prog] : WebScrapeSite (wss) #
# [ver] : 0.0.1 #
# [desc] : It scrapes websites and gives insight into the data within secs #
# [usage] : python3 wss.py -u <url> #
# [dev] : @s41r4j #
# [license]: MIT License #
# [github] : https://github.com/s41r4j/wss #
# #
#==============================================================================#
#==============================================================================#
# #
# [!] Legal/Ethical disclaimer: #
# #
#   > `wss` is a tool designed to gather information about a target           #
# which is publicly available. #
# > It is the end user's responsibility to obey all applicable #
# local, state and federal laws. #
# > Developers assume no liability and are not responsible for any misuse #
# or damage caused by this program. #
# #
#==============================================================================#
# ----------------------IMPORTS--------------------------
import requests
import argparse
import re
import random
import time
import sys
import os
import whois
import rich
import socket
from rich.table import Table
from bs4 import BeautifulSoup
# ----------------------FUNCTIONS------------------------
def parse_args():
parser = argparse.ArgumentParser(description="\33[7;49;97m[WebScrapeSite]\033[0m: FAST, EASY & INSIGHTFUL Website Analysis Tool")
parser.add_argument("-u", "--url", help="url to scrape", required=True)
parser.add_argument("-o", "--output", help="output filename, default: `<website_name>.csv`")
parser.add_argument("-f", "--filetype", help="output filetype (`csv`, `json`), default: csv")
parser.add_argument("-d", "--download", help="download & save html file only", action="store_true")
parser.add_argument("-a", "--download-all", help="download & save all files (html, css, js, images)", action="store_true")
parser.add_argument("-b", "--debug", help="debug mode", action="store_true")
args = parser.parse_args()
return args
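# Example invocations (based on the flags defined above):
#   python3 wss.py -u https://example.com                  # scan and write example_com.csv
#   python3 wss.py -u example.com -f json -o report        # write report.json instead
#   python3 wss.py -u https://example.com -a -b            # also download html/css/js/images, with debug output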
def get_html(url):
# combinations: windows, mac, linux, android, ios - chrome, firefox, safari, edge, opera
user_agent = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Android 11; Pixel 5 Build/RQ3A.210205.001"]
try:
r = requests.get(url, headers={"User-Agent": random.choice(user_agent)})
r.raise_for_status()
return r.text
    except requests.exceptions.RequestException as err:
        # covers HTTPError as well as connection and timeout failures
        raise SystemExit(err)
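# A minimal sketch of a more defensive fetch: same idea as get_html(), but with a request
# timeout and a couple of retries. The retry count, timeout and fixed User-Agent below are
# illustrative assumptions, not part of the original tool; the helper is defined for
# reference only and is not called anywhere.
def get_html_with_retries(url, retries=3, timeout=10):
    last_err = None
    for _ in range(retries):
        try:
            r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
            r.raise_for_status()
            return r.text
        except requests.exceptions.RequestException as err:
            last_err = err
            time.sleep(1)  # brief pause before retrying
    raise SystemExit(last_err)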
def same_domain(url1, url2):
# extract domain from url using regex (with removal of subdomains)
domain1 = re.findall(r"(?:https?:\/\/)?(?:[^@\n]+@)?([^:\/\n?]+)", url1)[0]
domain2 = re.findall(r"(?:https?:\/\/)?(?:[^@\n]+@)?([^:\/\n?]+)", url2)[0]
# if there are more than 1 "." in domain, remove subdomains
if domain1.count(".") > 1:
domain1 = domain1[domain1.find(".")+1:]
if domain2.count(".") > 1:
domain2 = domain2[domain2.find(".")+1:]
# check if domains are the same
if domain1 == domain2:
return True
else:
return False
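# A standard-library alternative to the regex above, sketched for reference. urlparse gives
# the host directly; the "strip one label when there is more than one dot" rule mirrors
# same_domain() and is a rough heuristic, not a proper public-suffix check. This hypothetical
# helper is not called anywhere in the script.
def same_domain_urlparse(url1, url2):
    from urllib.parse import urlparse
    def base_domain(url):
        netloc = urlparse(url if "://" in url else "https://" + url).netloc
        netloc = netloc.split("@")[-1].split(":")[0]  # drop credentials and port, keep host
        return netloc[netloc.find(".") + 1:] if netloc.count(".") > 1 else netloc
    return base_domain(url1) == base_domain(url2)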
def space(n):
    # pad one/two-digit counts so the right border of the printed summary table stays aligned
    if n == 1:
        return '  '
    elif n == 2:
        return ' '
    else:
        return ''
def save_file(data, opfile, url, ftype):
# file type
ext = ".csv" # default
    if ftype is not None and ftype.lower() == "json":
        ext = ".json"
    # output data: honour the given filename, otherwise derive one from the domain
    if opfile:
        file = opfile if opfile.endswith(ext) else opfile + ext
    else:
        file = url.split("//")[1].split("/")[0].replace(".", "_") + ext
# open file (create if not exists / overwrite if exists)
with open(file, "w") as f:
# csv
if ext == ".csv":
# write data headers
f.write(f"[WebScrapeSite], Data_Insights\n[GITHUB], https://github.com/s41r4j/wss\n[ERROR/ISSUE], https://github.com/s41r4j/wss/issues\n\n\nDOMAIN_NAME, {data['Whois'][0]['domain_name'][1] if type(data['Whois'][0]['domain_name']) == list else data['Whois'][0]['domain_name']}\nORGANIZATION, {data['Whois'][0]['org']}\nREGISTRAR, {data['Whois'][0]['registrar']}\nIP_ADDRESS, {data['IP_Address']}\nCREATION_DATE, {data['Whois'][0]['creation_date'][1] if type(data['Whois'][0]['creation_date']) == list else data['Whois'][0]['creation_date']}\nUPDATED_DATE, {data['Whois'][0]['updated_date'][1] if type(data['Whois'][0]['updated_date']) == list else data['Whois'][0]['updated_date']}\nEXPIRY_DATE, {data['Whois'][0]['expiration_date'][1] if type(data['Whois'][0]['expiration_date']) == list else data['Whois'][0]['expiration_date']}\nHTML_SIZE, {data['HTML_Size']} bytes\n\n\n")
# write emails
f.write("EMAILS:\n")
for email in data["Emails"]:
f.write(f",{email}\n")
# write numbers
f.write("\nNUMBERS:\n")
for number in data["Numbers"]:
f.write(f",{number}\n")
# write images
f.write("\nIMAGES:\n")
for image in data["Images"]:
f.write(f",{image}\n")
# write files
f.write("\nSCRIPTS (.js):\n")
for afile in data["Files"][0]:
f.write(f",{afile}\n")
f.write("\nSTYLES (.css):\n")
for afile in data["Files"][1]:
f.write(f",{afile}\n")
# write links
f.write("\nINTERNAL LINKS:\n")
for link in data["Links"][0]:
f.write(f",{link}\n")
f.write("\nEXTERNAL LINKS:\n")
for link in data["Links"][1]:
f.write(f",{link}\n")
# json
elif ext == ".json":
# write data headers
            f.write('[\n {\n "[WebScrapeSite]": "Data_Insights",\n "[GITHUB]": "https://github.com/s41r4j/wss",\n "[ERROR/ISSUE]": "https://github.com/s41r4j/wss/issues"\n },\n {\n "domain_name": "' + (data['Whois'][0]['domain_name'][1] if type(data['Whois'][0]['domain_name']) == list else data['Whois'][0]['domain_name']))
f.write('",\n "organization": "' + data['Whois'][0]['org'] + '",\n "registrar": "' + data['Whois'][0]['registrar'] + '",\n "ip_address": "' + data['IP_Address'] + '"\n },\n {\n "creation_date": "' + str(data['Whois'][0]['creation_date'][1] if type(data['Whois'][0]['creation_date']) == list else data['Whois'][0]['creation_date']) + '",\n "updated_date": "' + str(data['Whois'][0]['updated_date'][1] if type(data['Whois'][0]['updated_date']) == list else data['Whois'][0]['updated_date']) + '",\n "expiration_date": "' + str(data['Whois'][0]['expiration_date'][1] if type(data['Whois'][0]['expiration_date']) == list else data['Whois'][0]['expiration_date']) + '"\n },\n {\n "emails": [\n')
# write emails
for i in range(len(data["Emails"])-1):
f.write(f' "{data["Emails"][i]}",\n')
# write numbers
f.write(f' "{data["Emails"][-1]}"\n ],\n "numbers": [\n')
for i in range(len(data["Numbers"])-1):
f.write(f' "{data["Numbers"][i]}",\n')
# write images
f.write(f' "{data["Numbers"][-1]}"\n ],\n "images": [\n')
for i in range(len(data["Images"])-1):
f.write(f' "{data["Images"][i]}",\n')
# write files
f.write(f' "{data["Images"][-1]}"\n ],\n "scripts": [\n')
for i in range(len(data["Files"][0])-1):
f.write(f' "{data["Files"][0][i]}",\n')
f.write(f' "{data["Files"][0][-1]}"\n ],\n "styles": [\n')
for i in range(len(data["Files"][1])-1):
f.write(f' "{data["Files"][1][i]}",\n')
# write links
f.write(f' "{data["Files"][1][-1]}"\n ],\n "internal_links": [\n')
for i in range(len(data["Links"][0])-1):
f.write(f' "{data["Links"][0][i]}",\n')
f.write(f' "{data["Links"][0][-1]}"\n ],\n "external_links": [\n')
for i in range(len(data["Links"][1])-1):
f.write(f' "{data["Links"][1][i]}",\n')
f.write(f' "{data["Links"][1][-1]}"\n')
f.write(' ]\n }\n]\n')
return file
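# A sketch of a simpler JSON writer using the json module instead of the hand-built strings
# in save_file(). It also sidesteps the IndexError the manual version hits when one of the
# lists (e.g. data["Emails"]) is empty. The exact output layout differs from save_file(),
# so this hypothetical helper is provided as a reference alternative only and is not called.
def save_json_sketch(data, file):
    import json
    payload = {
        "[WebScrapeSite]": "Data_Insights",
        "[GITHUB]": "https://github.com/s41r4j/wss",
        "ip_address": data["IP_Address"],
        "html_size_bytes": data["HTML_Size"],
        "emails": data["Emails"],
        "numbers": data["Numbers"],
        "images": data["Images"],
        "scripts": data["Files"][0],
        "styles": data["Files"][1],
        "internal_links": data["Links"][0],
        "external_links": data["Links"][1],
    }
    with open(file, "w") as f:
        json.dump(payload, f, indent=2, default=str)  # default=str covers dates and other non-JSON types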
def data_parser(html, url, args):
# variables
files = [[], []] # script/style > src/href (0: script, 1: style)
links = [[],[]] # a > href (0: internal/same domain, 1: external)
emails = [] # regex
images = [] # img > src/href
# parsing
soup = BeautifulSoup(html, "html.parser")
# links
for link in soup.find_all("a"):
try:
if link.get("href").startswith("http"):
if same_domain(url, link.get("href")):
links[0].append(link.get("href"))
else:
links[1].append(link.get("href"))
else:
if same_domain(url, url + link.get("href")):
links[0].append(url + link.get("href"))
else:
links[1].append(url + link.get("href"))
except: pass
# files
# script
# get all scripts that have src attribute only
for script in soup.find_all("script", src=True):
try:
if script.get("src").endswith(".js"):
if script.get("src").startswith("http"):
files[0].append(script.get("src"))
else:
files[0].append(url + script.get("src"))
except: pass
# style
# get all styles that have href attribute only
for style in soup.find_all("link", href=True):
try:
if style.get("href").endswith(".css"):
if style.get("href").startswith("http"):
files[1].append(style.get("href"))
else:
files[1].append(url + style.get("href"))
except: pass
# images
for image in soup.find_all("img"):
try:
if image.get("src").startswith("http"):
images.append(image.get("src"))
else:
images.append(url + image.get("src"))
except AttributeError: pass
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
# remove duplicates, blank spaces and None
    images = [img for img in dict.fromkeys(images) if img and img.strip()]
# emails
# emails = re.findall(r'^[a-zA-Z0-9\_\-]+@[a-zA-Z0-9\_\-]+\.[a-zA-Z0-9]*', html)
emails = re.findall(r'[\w\.-]+@[\w\.-]+', html)
emails = list(dict.fromkeys(emails)) # remove duplicates
# numbers
# numbers = re.findall(r'^(\+?\d{1,2}\s?)?([\s\-]?\d{1,3}|\(\d{1,4}\))[\s\-]?\d{1,4}[\s\-]?\d{2,4}$', html)
numbers = re.findall(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', html)
numbers = list(dict.fromkeys(numbers)) # remove duplicates
# html file size (bytes)
html_size = sys.getsizeof(html)
# whois info
whois_data = whois.whois(url)
whois_keys = ['clear', 'copy', 'dayfirst', 'domain', 'fromkeys', 'get', 'items', 'load', 'parse', 'pop', 'popitem', 'setdefault', 'text', 'update', 'values', 'yearfirst']
    whois_details = [whois_data, whois_keys]
    # get ip address (socket.gethostbyname raises socket.gaierror if the lookup fails)
    try:
        ip = socket.gethostbyname(whois_data["domain_name"][1] if type(whois_data["domain_name"]) == list else whois_data["domain_name"])
    except socket.gaierror:
        ip = "N/A"
    return {"Links": links, "Files": files, "Images": images, "Emails": emails, "Numbers": numbers, "HTML_Size": html_size, "Whois": whois_details, "IP_Address": ip}
# need some fixing
def downloader(data, args, html):
# download all files
if args.download_all:
# create folder & file name
plain_name = args.url.split("//")[1].split("/")[0].replace(".", "_")
try: os.mkdir(plain_name)
except FileExistsError: pass
# download html
with open(plain_name + "/" + plain_name + ".html", "w") as f:
try: f.write(html)
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
try:
# download files
for file in data["Files"][0]:
try:
r = requests.get(file)
with open(plain_name + "/" + file.split("/")[-1], "wb") as f:
f.write(r.content)
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
for file in data["Files"][1]:
try:
r = requests.get(file)
with open(plain_name + "/" + file.split("/")[-1], "wb") as f:
f.write(r.content)
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
# download images
for image in data["Images"]:
try:
r = requests.get(image)
with open(plain_name + "/" + image.split("/")[-1], "wb") as f:
f.write(r.content)
                except requests.exceptions.MissingSchema:
                    pass
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
elif args.download:
# download html
with open(args.url.split("//")[1].split("/")[0].replace(".", "_") + ".html", "w") as f:
try: f.write(html)
except Exception as e:
if args.debug: print('\33[1;49;91m [WSS_ERR]: ', e, '\033[0m')
# ----------------------MAIN-----------------------------
def main():
args = parse_args()
# start time
start_time = time.time()
# check if url starts with http
if not args.url.startswith("http"):
args.url = "https://" + args.url
# get html
html = get_html(args.url)
# parsing data
    data = data_parser(html, args.url, args)
# download files (if specified)
downloader(data, args, html)
# save csv
op_loc = save_file(data, args.output, args.url, args.filetype) # output location
# print data insights
print(f'''
\33[1;49;97m/=====================\\
|\33[7;49;97m[wss] DATA_INSIGHTS \033[0m\33[1;49;97m| \33[7;49;31mhttps://github.com/s41r4j/wss\033[0m
\33[1;49;97m|=====================|
\33[1;49;97m|\33[1;49;92m[+] \33[1;49;96mlinks \33[3;49;97m| \33[1;49;96mdomain_name : \33[4;49;92m{data["Whois"][0]["domain_name"][1] if type(data["Whois"][0]["domain_name"]) == list else data["Whois"][0]["domain_name"]}\033[0m
\33[1;49;97m| \33[1;49;93m- \33[1;49;96minternal: \33[3;49;91m{str(len(data["Links"][0]))+space(len(str(len(data["Links"][0]))))} \33[1;49;97m| \33[3;49;96morganization :\33[1;49;92m {data["Whois"][0]["org"]}\033[0m
\33[1;49;97m| \33[1;49;93m- \33[1;49;96mexternal: \33[3;49;91m{str(len(data["Links"][1]))+space(len(str(len(data["Links"][1]))))} \33[1;49;97m| \33[3;49;96mregistrar :\33[1;49;92m {data["Whois"][0]["registrar"]}\033[0m
\33[1;49;97m|---------------------| \33[3;49;96mip_address : \33[4;49;92m{data["IP_Address"]}\033[0m
\33[1;49;97m|\33[1;49;92m[+] \33[1;49;96mfiles \33[1;49;97m|
\33[1;49;97m| \33[1;49;93m- \33[1;49;96mscripts: \33[3;49;91m{str(len(data["Files"][0]))+space(len(str(len(data["Files"][0]))))} \33[1;49;97m| \33[3;49;96mcreation_date:\33[1;49;92m {data["Whois"][0]["creation_date"][1] if type(data["Whois"][0]["creation_date"]) == list else data["Whois"][0]["creation_date"]}\033[0m
\33[1;49;97m| \33[1;49;93m- \33[1;49;96mstyles: \33[3;49;91m{str(len(data["Files"][1]))+space(len(str(len(data["Files"][1]))))} \33[1;49;97m| \33[3;49;96mupdated_date :\33[1;49;92m {data["Whois"][0]["updated_date"][1] if type(data["Whois"][0]["updated_date"]) == list else data["Whois"][0]["updated_date"]}\033[0m
\33[1;49;97m|---------------------| \33[3;49;96mexpiry_date :\33[1;49;92m {data["Whois"][0]["expiration_date"][1] if type(data["Whois"][0]["expiration_date"]) == list else data["Whois"][0]["expiration_date"]}\033[0m
\33[1;49;97m|\33[1;49;92m[+] \33[1;49;96mimages: \33[3;49;91m{str(len(data["Images"]))+space(len(str(len(data["Images"]))))} \33[1;49;97m|
\33[1;49;97m|---------------------| \33[3;49;96mhtml_size :\33[1;49;92m {data["HTML_Size"]} bytes\033[0m
\33[1;49;97m|\33[1;49;92m[+] \33[1;49;96memails: \33[3;49;91m{str(len(data["Emails"]))+space(len(str(len(data["Emails"]))))} \33[1;49;97m|
\33[1;49;97m|---------------------| \33[3;49;96moutput_file : \33[4;49;93m{op_loc}\033[0m
\33[1;49;97m|\33[1;49;92m[+] \33[1;49;96mnumbers: \33[3;49;91m{str(len(data["Numbers"]))+space(len(str(len(data["Numbers"]))))} \33[1;49;97m| \33[3;49;96mscan_time :\33[1;49;93m {round(time.time() - start_time, 2)}s\033[0m
\33[1;49;97m|=====================|
|\33[7;49;97m [WebScrapeSite] \033[0m\33[1;49;97m| \33[7;49;31mall data is saved in `output_file`\033[0m
\33[1;49;97m\\=====================/
''')
if __name__ == "__main__":
main()