-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuser_agent_count.py
34 lines (26 loc) · 1.35 KB
/
user_agent_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import ujson as json
from sparkcc import CCSparkJob
class UserAgentCount(CCSparkJob):
    """Count the ``User-agent`` values named in plain-text responses.

    Every response record whose ``Content-Type`` header mentions
    ``text/plain`` is scanned line by line, and one ``(user_agent, 1)``
    pair is emitted for each ``User-agent`` directive found.

    NOTE(review): any ``text/plain`` response is scanned, not only
    robots.txt files -- presumably the job is run on robots.txt captures;
    confirm against the input configuration.
    """

    name = "CountUserAgent"

    # NOTE(review): not referenced by any code in this class; kept so that
    # external readers of the attribute keep working.
    fallback_user_agent_name = '(no UserAgent specified in robots.txt file)'

    @staticmethod
    def _extract_user_agents(content):
        """Yield the value of every ``User-agent`` directive in *content*.

        The directive name is matched case-insensitively, and whitespace
        around the name, the colon and the value is ignored. A trailing
        ``# comment`` is removed so that it does not become part of the
        counted key.
        """
        for line in content.splitlines():
            # Everything from the first '#' to the end of line is a comment.
            directive = line.split('#', 1)[0]
            # Split on the first colon only: the value itself may contain ':'.
            key, colon, value = directive.partition(':')
            if colon and key.strip().lower() == 'user-agent':
                yield value.strip()

    def process_record(self, record):
        """Yield ``(user_agent, 1)`` for each User-agent line in *record*.

        Records that are not responses, carry no HTTP headers, or are not
        served as ``text/plain`` produce nothing.
        """
        if not self.is_response_record(record):
            # warcinfo, request, non-WAT metadata records
            return
        if record.rec_type != 'response' or not record.http_headers:
            return

        # Only plain-text payloads are inspected; header values may differ
        # in letter case, so compare in lower case.
        content_type = record.http_headers.get_header('Content-Type')
        if not content_type or 'text/plain' not in content_type.lower():
            return

        # latin-1 maps every byte to a character, so decoding cannot fail.
        content = record.content_stream().read().decode('latin-1', errors='replace')
        for user_agent in self._extract_user_agents(content):
            yield user_agent, 1
if __name__ == "__main__":
    # Build the job and run it only when this file is executed as a script.
    UserAgentCount().run()