From ccaf0dde6b546ce46cd57d6220fdb55c48c49525 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 21 Mar 2023 15:37:42 +0100 Subject: [PATCH] Add/improve examples queries - WARC storage metrics by MIME type: extract common/frequent file suffixes from URL - site discovery by language: implicit cast to floating point number (address #23) - select robots.txt records for a given list of domains --- ...verage-warc-record-length-by-mime-type.sql | 9 +++- .../cc-index/get-records-robotstxt.sql | 49 +++++++++++++++++++ .../cc-index/site-discovery-by-language.sql | 2 +- 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 src/sql/examples/cc-index/get-records-robotstxt.sql diff --git a/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql b/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql index 162ea38..df832cd 100644 --- a/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql +++ b/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql @@ -22,9 +22,14 @@ SELECT COUNT(*) as n_pages, SUM(warc_record_length) * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage, SUM(case when content_truncated is null then 0 else 1 end) * 100.0 / COUNT(*) as perc_truncated, content_mime_detected, - histogram(content_truncated) as reason_truncated + histogram(content_truncated) as reason_truncated, + slice( + array_sort( + map_entries(map_filter(histogram(regexp_extract(url_path, '\.[a-zA-Z0-9_-]{1,7}$')), (k, v) -> v > 4)), + (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))), + 1, 25) as common_url_path_suffixes FROM "ccindex"."ccindex" -WHERE crawl = 'CC-MAIN-2022-05' +WHERE crawl = 'CC-MAIN-2023-06' AND subset = 'warc' GROUP BY content_mime_detected ORDER BY n_pages DESC; diff --git a/src/sql/examples/cc-index/get-records-robotstxt.sql b/src/sql/examples/cc-index/get-records-robotstxt.sql new file mode 100644 index 0000000..7b7bd4e --- /dev/null +++ 
b/src/sql/examples/cc-index/get-records-robotstxt.sql @@ -0,0 +1,49 @@ +-- +-- Select robots.txt records for a given list of domains +-- from the robots.txt subset, cf. +-- https://commoncrawl.org/2016/09/robotstxt-and-404-redirect-data-sets/ +-- +-- * join with domain list table +-- (here Alexa top 1 million ranks are used, +-- see count-domains-alexa-top-1m.sql for how to create +-- the table `alexa_top_1m`) +-- * select only the most recent record per same robots.txt URL +-- (the crawler might fetch the robots.txt repeatedly) +-- * extract WARC record locations for later processing +-- of robots.txt files +-- * MIME types (HTTP Content-Type header and identified +-- by content) +-- * fetch time and status +-- * and redirect locations (since CC-MAIN-2019-47) +-- to be able to "follow" redirects +-- +WITH allrobots AS ( + SELECT alexa.site, + alexa.rank, + cc.url_host_tld, + cc.url_host_registered_domain, + cc.url_host_name, + cc.url, + cc.fetch_time, + cc.fetch_status, + cc.warc_filename, + cc.warc_record_offset, + cc.warc_record_length, + cc.fetch_redirect, + cc.content_mime_type, + cc.content_mime_detected, + -- enumerate records of same URL, most recent first + ROW_NUMBER() OVER(PARTITION BY cc.url ORDER BY cc.fetch_time DESC) AS n + FROM "ccindex"."ccindex" AS cc + RIGHT OUTER JOIN "ccindex"."alexa_top_1m" AS alexa + ON alexa.domain = cc.url_host_registered_domain + WHERE cc.crawl = 'CC-MAIN-2022-33' + AND cc.subset = 'robotstxt' + -- skip host names which differ from the domain name except for an optional "www." 
prefix + AND (length(cc.url_host_name) = length(cc.url_host_registered_domain) + OR (length(cc.url_host_name) = (length(cc.url_host_registered_domain)+4) + AND substr(cc.url_host_name, 1, 4) = 'www.'))) +SELECT * + FROM allrobots +-- select only the first (most recent) record of the same URL +WHERE allrobots.n = 1; diff --git a/src/sql/examples/cc-index/site-discovery-by-language.sql b/src/sql/examples/cc-index/site-discovery-by-language.sql index 73bdc40..05eaa2d 100644 --- a/src/sql/examples/cc-index/site-discovery-by-language.sql +++ b/src/sql/examples/cc-index/site-discovery-by-language.sql @@ -82,5 +82,5 @@ SELECT num_pages, FROM tmp WHERE num_pages >= 5 AND regexp_like('(cat|eus|glg|oci)', primary_content_language) - AND (num_pages/total_pages_host) >= .05 + AND (1.0*num_pages/total_pages_host) >= .05 ORDER BY primary_content_language, tld, num_pages DESC;