Skip to content

Commit

Permalink
Add some notes, queries.
Browse files Browse the repository at this point in the history
  • Loading branch information
GilHoggarth committed Nov 21, 2023
1 parent 0622565 commit b6d9024
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 3 deletions.
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,52 @@ Take crawl events and turn them into Apache Parquet so they can be queried using

Requires Python >= 3.8 to avoid fastparquet bug https://github.com/dask/fastparquet/issues/825

## Example of use

14.5G 100 14.5G 0 0 48.1M 0 0:05:08 0:05:08 --:--:-- 52.2M
[ec2-user@fc crawl-db]$ wc crawl.log.cp00004
37357947 635085091 15578444671 crawl.log.cp00004


(.venv) [ec2-user@fc crawl-db]$ head -1 /mnt/data/fc/crawl-db/crawl.log.cp00004
2023-11-15T17:13:05.828Z -5003 - https://cdn.mos.cms.futurecdn.net/EaNfygyiNnxXmUAXnpcojG-768-80.jpeg LE https://www.wallpaper.com/fashion-beauty/sacai-mercedes-benz-amg-collaboration unknown #070 - - tid:94192:https://www.wallpaper.com/ Q:serverMaxSuccessKb {"scopeDecision":"ACCEPT by rule #2 MatchesRegexDecideRule"}
(.venv) [ec2-user@fc crawl-db]$ tail -1 /mnt/data/fc/crawl-db/crawl.log.cp00004
2023-11-16T12:36:14.536Z 200 357908 https://www.thegazette.co.uk/sitemap-201709-1.xml.gz II https://www.thegazette.co.uk/sitemap.xml application/xml #137 20231116115454719+406 sha1:YUPZK5EJ7WRFWRRLD3CMGHH7ZQW3NDX7 tid:37575:https://www.thegazette.co.uk/ isSitemap,launchTimestamp:20231115090324,err=java.lang.NullPointerException,dol:50002,ip:18.165.201.84 {"contentSize":358576,"warcFilename":"BL-NPLD-20231116115015552-01164-44~npld-heritrix3-worker-1~8443.warc.gz","warcFileOffset":220898170,"scopeDecision":"ACCEPT by rule #1 WatchedFileSurtPrefixedDecideRule","warcFileRecordLength":460354}

1098 2023-11-16:13:05:01 python -m crawldb.parquet.cli import /mnt/data/fc/heritrix/output/frequent-npld/20231115123452/logs/crawl.log.cp00004-20231116123457 /mnt/data/fc/crawl-db/npld-cp00004.parquet

-rw-r--r--. 1 ec2-user ec2-user 2.8G Nov 16 15:33 npld-cp00004.parquet

┌──────────────┐
│ count_star() │
│ int64 │
├──────────────┤
│ 37357947 │
└──────────────┘


┌──────────────────────┬──────────────────────┬──────────────────────┬───────────────┬─────────┬───┬───────────────┬─────────────┬─────────────┬────────────┐
│ id │ url │ host │ domain │ ip │ … │ warc_filename │ warc_offset │ warc_length │ wire_bytes │
│ varchar │ varchar │ varchar │ varchar │ varchar │ │ varchar │ int64 │ int64 │ int64 │
├──────────────────────┼──────────────────────┼──────────────────────┼───────────────┼─────────┼───┼───────────────┼─────────────┼─────────────┼────────────┤
│ 2023-11-15T17:13:0… │ https://cdn.mos.cm… │ cdn.mos.cms.future… │ futurecdn.net │ NULL │ … │ NULL │ NULL │ NULL │ NULL │
├──────────────────────┴──────────────────────┴──────────────────────┴───────────────┴─────────┴───┴───────────────┴─────────────┴─────────────┴────────────┤
│ 1 rows 22 columns (9 shown) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────┬───┬──────────────────────┬─────────────┬─────────────┬────────────┐
│ id │ url │ host │ domain │ … │ warc_filename │ warc_offset │ warc_length │ wire_bytes │
│ varchar │ varchar │ varchar │ varchar │ │ varchar │ int64 │ int64 │ int64 │
├──────────────────────┼──────────────────────┼──────────────────────┼──────────────────┼───┼──────────────────────┼─────────────┼─────────────┼────────────┤
│ 2023-11-16T12:36:1… │ https://www.thegaz… │ www.thegazette.co.uk │ thegazette.co.uk │ … │ BL-NPLD-2023111611… │ 220898170 │ 460354 │ 358576 │
├──────────────────────┴──────────────────────┴──────────────────────┴──────────────────┴───┴──────────────────────┴─────────────┴─────────────┴────────────┤
│ 1 rows 22 columns (8 shown) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘





## To Do

- Make some DC2023 data accessible via Apache Superset to explore if we need more fields.
Expand Down
27 changes: 24 additions & 3 deletions duckdb-query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,33 @@
# Earlier input path; overridden by the assignment directly below.
in_file = 'outfile.parquet'
# Parquet file that every query in this script reads (this assignment wins).
in_file = '/mnt/data/fc/crawl-db/npld-cp00004.parquet'


def run_print_and_save(query, csv_file):
    """Run a SQL query through DuckDB, print the result and save it as CSV.

    Args:
        query: SQL text handed to ``duckdb.query``.
        csv_file: Destination path passed to the result's ``to_csv`` method.
    """
    result = duckdb.query(query)
    # Echo the result to stdout before persisting it.
    print(result)
    result.to_csv(csv_file)

# Show the column names and types DuckDB reports for the Parquet file.
print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'"))
#print(duckdb.query(f"SELECT id,timestamp FROM '{in_file}' WHERE domain == 'bbc.co.uk' LIMIT 10"))
# Total number of rows in the file.
print(duckdb.query(f"SELECT COUNT(*) FROM '{in_file}'"))
# The rows with the smallest and the largest id, respectively.
print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id ASC LIMIT 1"))
print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id DESC LIMIT 1"))
#run_print_and_save(f"SELECT * FROM '{in_file}' WHERE domain == 'bbc.co.uk' LIMIT 10", "some_rows.csv")
# For rows with status_code -5002: pull launchTimestamp out of the annotations
# (an empty regex match is turned into NULL and then replaced by the sentinel
# 2030-01-01 00:00:00), compute delay = start_time - launch_time, extract the
# WebRenderStatus code, and save the 100 largest delays to some_rows.csv.
run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
#print(duckdb.query(f"SELECT domain, status_code, COUNT(*) from '{in_file}' GROUP BY domain, status_code ORDER BY COUNT(*) DESC"))
#print(duckdb.query(f"SELECT domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, content_type ORDER BY COUNT(*) DESC"))
# Per-hour row counts for status codes 200, -5003 and -5002, with the hour
# derived by parsing the timestamp column and truncating it to the hour.
print(duckdb.query(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, status_code, COUNT(*) \
FROM '{in_file}' WHERE status_code == 200 OR status_code == -5003 OR status_code == -5002 GROUP BY start_hour, status_code ORDER BY start_hour ASC, COUNT(*) DESC"))

#print(duckdb.query(f"SELECT domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, annotations ORDER BY COUNT(*) DESC"))

#run_print_and_save(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, COUNT(*) \
#FROM '{in_file}' GROUP BY ALL ORDER BY start_hour ASC", "totals_by_hour.csv")

#run_print_and_save(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, status_code, COUNT(*) \
#FROM '{in_file}' WHERE status_code == 200 OR status_code == -5003 OR status_code == -5002 \
#GROUP BY start_hour, status_code ORDER BY start_hour ASC, COUNT(*) DESC", "critical_status_codes_by_hour.csv")

#print(duckdb.query(f"COPY (SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) AS start_hour, domain, status_code, SUM(content_length) AS total_bytes, COUNT(*) \
#FROM '{in_file}' GROUP BY start_hour, domain, status_code ORDER BY start_hour ASC, status_code DESC) TO 'totals_by_hour_domain_status_code.csv'"))

#print(duckdb.query(f"SELECT ip, COUNT(*) from '{in_file}' WHERE ip IS NOT NULL GROUP BY ip ORDER BY COUNT(*) DESC"))
#print(duckdb.query(f"SELECT host, COUNT(*) from '{in_file}' WHERE status_code > 0 GROUP BY host ORDER BY COUNT(*) DESC"))
#print(duckdb.query(f"SELECT domain, COUNT(DISTINCT host) from '{in_file}' GROUP BY domain ORDER BY COUNT(DISTINCT host) DESC"))
Expand Down

0 comments on commit b6d9024

Please sign in to comment.