Add some notes, queries.

ukwa · Nov 21, 2023 · b6d9024 · b6d9024
1 parent 0622565
commit b6d9024
Show file tree

Hide file tree

Showing 2 changed files with 70 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -8,6 +8,52 @@ Take crawl events and turn them into Apache Parquet so they can be queried using
 
 Requires Python >= 3.8 to avoid fastparquet bug https://github.com/dask/fastparquet/issues/825
 
+## Example of use
+
+14.5G  100 14.5G    0     0  48.1M      0  0:05:08  0:05:08 --:--:-- 52.2M
+[ec2-user@fc crawl-db]$ wc crawl.log.cp00004 
+   37357947   635085091 15578444671 crawl.log.cp00004
+
+
+(.venv) [ec2-user@fc crawl-db]$  head -1 /mnt/data/fc/crawl-db/crawl.log.cp00004 
+2023-11-15T17:13:05.828Z -5003          - https://cdn.mos.cms.futurecdn.net/EaNfygyiNnxXmUAXnpcojG-768-80.jpeg LE https://www.wallpaper.com/fashion-beauty/sacai-mercedes-benz-amg-collaboration unknown #070 - - tid:94192:https://www.wallpaper.com/ Q:serverMaxSuccessKb {"scopeDecision":"ACCEPT by rule #2 MatchesRegexDecideRule"}
+(.venv) [ec2-user@fc crawl-db]$  tail -1 /mnt/data/fc/crawl-db/crawl.log.cp00004 
+2023-11-16T12:36:14.536Z   200     357908 https://www.thegazette.co.uk/sitemap-201709-1.xml.gz II https://www.thegazette.co.uk/sitemap.xml application/xml #137 20231116115454719+406 sha1:YUPZK5EJ7WRFWRRLD3CMGHH7ZQW3NDX7 tid:37575:https://www.thegazette.co.uk/ isSitemap,launchTimestamp:20231115090324,err=java.lang.NullPointerException,dol:50002,ip:18.165.201.84 {"contentSize":358576,"warcFilename":"BL-NPLD-20231116115015552-01164-44~npld-heritrix3-worker-1~8443.warc.gz","warcFileOffset":220898170,"scopeDecision":"ACCEPT by rule #1 WatchedFileSurtPrefixedDecideRule","warcFileRecordLength":460354}   
+
+ 1098  2023-11-16:13:05:01  python -m crawldb.parquet.cli import  /mnt/data/fc/heritrix/output/frequent-npld/20231115123452/logs/crawl.log.cp00004-20231116123457 /mnt/data/fc/crawl-db/npld-cp00004.parquet
+
+ -rw-r--r--. 1 ec2-user ec2-user 2.8G Nov 16 15:33 npld-cp00004.parquet
+
+┌──────────────┐
+│ count_star() │
+│    int64     │
+├──────────────┤
+│     37357947 │
+└──────────────┘
+
+
+┌──────────────────────┬──────────────────────┬──────────────────────┬───────────────┬─────────┬───┬───────────────┬─────────────┬─────────────┬────────────┐
+│          id          │         url          │         host         │    domain     │   ip    │ … │ warc_filename │ warc_offset │ warc_length │ wire_bytes │
+│       varchar        │       varchar        │       varchar        │    varchar    │ varchar │   │    varchar    │    int64    │    int64    │   int64    │
+├──────────────────────┼──────────────────────┼──────────────────────┼───────────────┼─────────┼───┼───────────────┼─────────────┼─────────────┼────────────┤
+│ 2023-11-15T17:13:0…  │ https://cdn.mos.cm…  │ cdn.mos.cms.future…  │ futurecdn.net │ NULL    │ … │ NULL          │        NULL │        NULL │       NULL │
+├──────────────────────┴──────────────────────┴──────────────────────┴───────────────┴─────────┴───┴───────────────┴─────────────┴─────────────┴────────────┤
+│ 1 rows                                                                                                                               22 columns (9 shown) │
+└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+
+┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────┬───┬──────────────────────┬─────────────┬─────────────┬────────────┐
+│          id          │         url          │         host         │      domain      │ … │    warc_filename     │ warc_offset │ warc_length │ wire_bytes │
+│       varchar        │       varchar        │       varchar        │     varchar      │   │       varchar        │    int64    │    int64    │   int64    │
+├──────────────────────┼──────────────────────┼──────────────────────┼──────────────────┼───┼──────────────────────┼─────────────┼─────────────┼────────────┤
+│ 2023-11-16T12:36:1…  │ https://www.thegaz…  │ www.thegazette.co.uk │ thegazette.co.uk │ … │ BL-NPLD-2023111611…  │   220898170 │      460354 │     358576 │
+├──────────────────────┴──────────────────────┴──────────────────────┴──────────────────┴───┴──────────────────────┴─────────────┴─────────────┴────────────┤
+│ 1 rows                                                                                                                               22 columns (8 shown) │
+└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+
+
+
+
+
 ## To Do
 
 - Make some DC2023 data accessible via Apache Superset to explore if we need more fields.

diff --git a/duckdb-query.py b/duckdb-query.py
@@ -3,12 +3,33 @@
 in_file = 'outfile.parquet'
 in_file = '/mnt/data/fc/crawl-db/npld-cp00004.parquet'
 
+
+def run_print_and_save(query, csv_file):
+    q = duckdb.query(query)
+    print(q)
+    q.to_csv(csv_file)
+
 print(duckdb.query(f"DESCRIBE SELECT * FROM '{in_file}'"))
-#print(duckdb.query(f"SELECT id,timestamp FROM '{in_file}' WHERE domain == 'bbc.co.uk' LIMIT 10"))
+print(duckdb.query(f"SELECT COUNT(*) FROM '{in_file}'"))
+print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id ASC LIMIT 1"))
+print(duckdb.query(f"SELECT * FROM '{in_file}' ORDER BY id DESC LIMIT 1"))
+#run_print_and_save(f"SELECT * FROM '{in_file}' WHERE domain == 'bbc.co.uk' LIMIT 10", "some_rows.csv")
+run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
 #print(duckdb.query(f"SELECT domain, status_code, COUNT(*) from '{in_file}' GROUP BY domain, status_code ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, content_type ORDER BY COUNT(*) DESC"))
-print(duckdb.query(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, status_code, COUNT(*) \
-FROM '{in_file}' WHERE status_code == 200 OR status_code == -5003 OR status_code == -5002 GROUP BY start_hour, status_code ORDER BY start_hour ASC, COUNT(*) DESC"))
+
+#print(duckdb.query(f"SELECT domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, annotations  ORDER BY COUNT(*) DESC"))
+
+#run_print_and_save(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, COUNT(*) \
+#FROM '{in_file}'  GROUP BY ALL ORDER BY start_hour ASC", "totals_by_hour.csv")
+
+#run_print_and_save(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, status_code, COUNT(*) \
+#FROM '{in_file}' WHERE status_code == 200 OR status_code == -5003 OR status_code == -5002 \
+#GROUP BY start_hour, status_code ORDER BY start_hour ASC, COUNT(*) DESC", "critical_status_codes_by_hour.csv")
+
+#print(duckdb.query(f"COPY (SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) AS start_hour, domain, status_code, SUM(content_length) AS total_bytes, COUNT(*) \
+#FROM '{in_file}' GROUP BY start_hour, domain, status_code ORDER BY start_hour ASC, status_code DESC) TO 'totals_by_hour_domain_status_code.csv'"))
+
 #print(duckdb.query(f"SELECT ip, COUNT(*) from '{in_file}' WHERE ip IS NOT NULL GROUP BY ip ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT host, COUNT(*) from '{in_file}' WHERE status_code > 0 GROUP BY host ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT domain, COUNT(DISTINCT host) from '{in_file}' GROUP BY domain ORDER BY COUNT(DISTINCT host) DESC"))