From 6a0b93c4a4d17ae4ed9ef95ce10ac8b06fa0f6e1 Mon Sep 17 00:00:00 2001
From: gilh
Date: Wed, 22 Nov 2023 16:14:15 +0000
Subject: [PATCH] Add a slightly more complex example.

---
 README.md       | 41 +++++++++++++++++++++++++++++++++++++++--
 duckdb-query.py | 13 +++++++------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index e176988..fca7490 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,11 @@ python -m crawldb.parquet.cli import crawl.log.cp00004-20231116123457 crawl-log
 
 This took a while, about an hour, but created a queryable file only 2.8GB in size.
 
-Using the example queries (see `duckdb-query.py` for details), we could query this file very quickly, with even fairly intensive aggregation queries only requiring a second or so to run.
+Using the example queries (see `duckdb-query.py` for details), we could query this file very quickly, with even fairly intensive aggregation queries only requiring a second or so to run. Some simple examples are:
+
+### Total records
 
 ```
 SELECT COUNT(*) FROM 'crawl-log-cp00004.parquet';
@@ -33,7 +35,42 @@ SELECT COUNT(*) FROM 'crawl-log-cp00004.parquet';
 
 ```
 
-
+### Breakdown by status codes
+
+```
+SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from 'crawl-log-cp00004.parquet' GROUP BY status_code ORDER BY COUNT(*) DESC;
+
+┌─────────────┬──────────────┬───────────────┐
+│ status_code │ total_bytes  │ total_records │
+│    int64    │    int128    │     int64     │
+├─────────────┼──────────────┼───────────────┤
+│       -5003 │         NULL │      30199915 │
+│         200 │ 561472969271 │       4868164 │
+│       -9998 │         NULL │        613207 │
+│         301 │    336555824 │        574581 │
+│         302 │    459076182 │        344545 │
+│         403 │    977360013 │        228733 │
+│         404 │   8837109684 │        193986 │
+│          -6 │         NULL │         87998 │
+│         307 │       668014 │         66263 │
+│         503 │    188275757 │         44856 │
+│      ·      │      ·       │       ·       │
+│      ·      │      ·       │       ·       │
+│      ·      │      ·       │       ·       │
+│         415 │          336 │             3 │
+│         421 │          513 │             3 │
+│         402 │        12790 │             2 │
+│         417 │            0 │             2 │
+│         299 │       314915 │             2 │
+│         203 │        14252 │             1 │
+│         524 │         7222 │             1 │
+│         999 │         1530 │             1 │
+│         408 │        12448 │             1 │
+│          -5 │         NULL │             1 │
+├─────────────┴──────────────┴───────────────┤
+│ 63 rows (20 shown)               3 columns │
+└────────────────────────────────────────────┘
+```
 
 ## Previous Designs
 
diff --git a/duckdb-query.py b/duckdb-query.py
index 463b7c7..6ff2206 100644
--- a/duckdb-query.py
+++ b/duckdb-query.py
@@ -9,26 +9,27 @@ def run_print_and_save(query, csv_file):
     q.to_csv(csv_file)
 
-print(duckdb.query(f"SELECT COUNT(*) FROM '{in_file}'"))
+print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'").df())
+
+print(duckdb.query(f"SELECT COUNT(*) as total_records FROM '{in_file}'"))
 
 #print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id ASC LIMIT 1"))
 #print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id DESC LIMIT 1"))
 
 # Scan for activity from a particular url_domain
 #run_print_and_save(f"SELECT * FROM '{in_file}' WHERE url_domain == 'bbc.co.uk' LIMIT 10", "some_rows.csv")
-run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == -5002 LIMIT 10", "some_rows.csv")
+run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == 299 LIMIT 10", "some_rows.csv")
 
 #run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", 
 "some_rows.csv")
 
 #print(duckdb.query(f"SELECT domain, status_code, COUNT(*) from '{in_file}' GROUP BY domain, status_code ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, content_type ORDER BY COUNT(*) DESC"))
 
-print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'").df())
-
 #run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
-run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "some_rows.csv")
+#run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "some_rows.csv")
 
 #print(duckdb.query(f"SELECT url_domain, status_code, COUNT(*) from '{in_file}' GROUP BY url_domain, status_code ORDER BY COUNT(*) DESC"))
-print(duckdb.query(f"SELECT url_domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' GROUP BY url_domain, status_code, content_type ORDER BY COUNT(*) DESC"))
+#print(duckdb.query(f"SELECT url_domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' GROUP BY url_domain, status_code, content_type ORDER BY COUNT(*) DESC"))
+print(duckdb.query(f"SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' GROUP BY status_code ORDER BY COUNT(*) DESC"))
 
 #print(duckdb.query(f"SELECT url_domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE url_domain == 'bbc.co.uk' GROUP BY url_domain, status_code, annotations ORDER BY COUNT(*) DESC"))