From 6a0b93c4a4d17ae4ed9ef95ce10ac8b06fa0f6e1 Mon Sep 17 00:00:00 2001
From: gilh
Date: Wed, 22 Nov 2023 16:14:15 +0000
Subject: [PATCH] Add a slightly more complex example.

---
 README.md       | 41 +++++++++++++++++++++++++++++++++++++++--
 duckdb-query.py | 13 +++++++------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index e176988..fca7490 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,11 @@ python -m crawldb.parquet.cli import crawl.log.cp00004-20231116123457 crawl-log
 
 This took a while, about an hour, but created a queryable file only 2.8GB in size.
 
-Using the example queries (see `duckdb-query.py` for details), we could query this file very quickly, with even fairly intensive aggregation queries only requiring a second or so to run.
+Using the example queries (see `duckdb-query.py` for details), we could query this file very quickly, with even fairly intensive aggregation queries only requiring a second or so to run. Some simple examples are:
+
+### Total records
 
 ```
 SELECT COUNT(*) FROM 'crawl-log-cp00004.parquet';
@@ -33,7 +35,42 @@ SELECT COUNT(*) FROM 'crawl-log-cp00004.parquet';
 
 ```
 
-
+### Breakdown by status codes
+
+```
+SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from 'crawl-log-cp00004.parquet' GROUP BY status_code ORDER BY COUNT(*) DESC;
+
+┌─────────────┬──────────────┬───────────────┐
+│ status_code │ total_bytes  │ total_records │
+│    int64    │    int128    │     int64     │
+├─────────────┼──────────────┼───────────────┤
+│       -5003 │         NULL │      30199915 │
+│         200 │ 561472969271 │       4868164 │
+│       -9998 │         NULL │        613207 │
+│         301 │    336555824 │        574581 │
+│         302 │    459076182 │        344545 │
+│         403 │    977360013 │        228733 │
+│         404 │   8837109684 │        193986 │
+│          -6 │         NULL │         87998 │
+│         307 │       668014 │         66263 │
+│         503 │    188275757 │         44856 │
+│      ·      │      ·       │       ·       │
+│      ·      │      ·       │       ·       │
+│      ·      │      ·       │       ·       │
+│         415 │          336 │             3 │
+│         421 │          513 │             3 │
+│         402 │        12790 │             2 │
+│         417 │            0 │             2 │
+│         299 │       314915 │             2 │
+│         203 │        14252 │             1 │
+│         524 │         7222 │             1 │
+│         999 │         1530 │             1 │
+│         408 │        12448 │             1 │
+│          -5 │         NULL │             1 │
+├─────────────┴──────────────┴───────────────┤
+│ 63 rows (20 shown)               3 columns │
+└────────────────────────────────────────────┘
+```
 
 ## Previous Designs
 
diff --git a/duckdb-query.py b/duckdb-query.py
index 463b7c7..6ff2206 100644
--- a/duckdb-query.py
+++ b/duckdb-query.py
@@ -9,26 +9,27 @@ def run_print_and_save(query, csv_file):
     q.to_csv(csv_file)
 
-print(duckdb.query(f"SELECT COUNT(*) FROM '{in_file}'"))
+print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'").df())
+
+print(duckdb.query(f"SELECT COUNT(*) as total_records FROM '{in_file}'"))
 
 #print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id ASC LIMIT 1"))
 #print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id DESC LIMIT 1"))
 
 # Scan for activity from a particular url_domain
 #run_print_and_save(f"SELECT * FROM '{in_file}' WHERE url_domain == 'bbc.co.uk' LIMIT 10", "some_rows.csv")
-run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == -5002 LIMIT 10", "some_rows.csv")
+run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == 299 LIMIT 10", "some_rows.csv")
 
 #run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", 
 "some_rows.csv")
 
 #print(duckdb.query(f"SELECT domain, status_code, COUNT(*) from '{in_file}' GROUP BY domain, status_code ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, content_type ORDER BY COUNT(*) DESC"))
 
-print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'").df())
-
 #run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
-run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "some_rows.csv")
+#run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "some_rows.csv")
 
 #print(duckdb.query(f"SELECT url_domain, status_code, COUNT(*) from '{in_file}' GROUP BY url_domain, status_code ORDER BY COUNT(*) DESC"))
-print(duckdb.query(f"SELECT url_domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' GROUP BY url_domain, status_code, content_type ORDER BY COUNT(*) DESC"))
+#print(duckdb.query(f"SELECT url_domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' GROUP BY url_domain, status_code, content_type ORDER BY COUNT(*) DESC"))
+print(duckdb.query(f"SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' GROUP BY status_code ORDER BY COUNT(*) DESC"))
 
 #print(duckdb.query(f"SELECT url_domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE url_domain == 'bbc.co.uk' GROUP BY url_domain, status_code, annotations ORDER BY COUNT(*) DESC"))