From ccaf0dde6b546ce46cd57d6220fdb55c48c49525 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 21 Mar 2023 15:37:42 +0100 Subject: [PATCH] Add/improve examples queries - WARC storage metrics by MIME type: extract common/frequent file suffixes from URL - site discovery by language: implicit cast to floating point number (address #23) - select robots.txt records for a given list of domains --- ...verage-warc-record-length-by-mime-type.sql | 9 +++- .../cc-index/get-records-robotstxt.sql | 49 +++++++++++++++++++ .../cc-index/site-discovery-by-language.sql | 2 +- 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 src/sql/examples/cc-index/get-records-robotstxt.sql diff --git a/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql b/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql index 162ea38..df832cd 100644 --- a/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql +++ b/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql @@ -22,9 +22,14 @@ SELECT COUNT(*) as n_pages, SUM(warc_record_length) * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage, SUM(case when content_truncated is null then 0 else 1 end) * 100.0 / COUNT(*) as perc_truncated, content_mime_detected, - histogram(content_truncated) as reason_truncated + histogram(content_truncated) as reason_truncated, + slice( + array_sort( + map_entries(map_filter(histogram(regexp_extract(url_path, '\.[a-zA-Z0-9_-]{1,7}$')), (k, v) -> v > 4)), + (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))), + 1, 25) as common_url_path_suffixes FROM "ccindex"."ccindex" -WHERE crawl = 'CC-MAIN-2022-05' +WHERE crawl = 'CC-MAIN-2023-06' AND subset = 'warc' GROUP BY content_mime_detected ORDER BY n_pages DESC; diff --git a/src/sql/examples/cc-index/get-records-robotstxt.sql b/src/sql/examples/cc-index/get-records-robotstxt.sql new file mode 100644 index 0000000..7b7bd4e --- /dev/null +++ 
b/src/sql/examples/cc-index/get-records-robotstxt.sql @@ -0,0 +1,49 @@ +-- +-- Select robots.txt records for a given list of domains +-- from the robots.txt subset, cf. +-- https://commoncrawl.org/2016/09/robotstxt-and-404-redirect-data-sets/ +-- +-- * join with domain list table +-- (here Alexa top 1 million ranks are used, +-- see count-domains-alexa-top-1m.sql for how to create +-- the table `alexa_top_1m`) +-- * select only the most recent record per same robots.txt URL +-- (the crawler might fetch the robots.txt repeatedly) +-- * extract WARC record locations for later processing +-- of robots.txt files +-- * MIME types (HTTP Content-Type header and identified +-- by content) +-- * fetch time and status +-- * and redirect locations (since CC-MAIN-2019-47) +-- to be able to "follow" redirects +-- +WITH allrobots AS ( + SELECT alexa.site, + alexa.rank, + cc.url_host_tld, + cc.url_host_registered_domain, + cc.url_host_name, + cc.url, + cc.fetch_time, + cc.fetch_status, + cc.warc_filename, + cc.warc_record_offset, + cc.warc_record_length, + cc.fetch_redirect, + cc.content_mime_type, + cc.content_mime_detected, + -- enumerate records of same URL, most recent first + ROW_NUMBER() OVER(PARTITION BY cc.url ORDER BY cc.fetch_time DESC) AS n + FROM "ccindex"."ccindex" AS cc + RIGHT OUTER JOIN "ccindex"."alexa_top_1m" AS alexa + ON alexa.domain = cc.url_host_registered_domain + WHERE cc.crawl = 'CC-MAIN-2022-33' + AND cc.subset = 'robotstxt' + -- skip host names which differ from the domain name except for an optional "www." 
prefix + AND (length(cc.url_host_name) = length(cc.url_host_registered_domain) + OR (length(cc.url_host_name) = (length(cc.url_host_registered_domain)+4) + AND substr(cc.url_host_name, 1, 4) = 'www.'))) +SELECT * + FROM allrobots +-- select only the first (most recent) record of the same URL +WHERE allrobots.n = 1; diff --git a/src/sql/examples/cc-index/site-discovery-by-language.sql b/src/sql/examples/cc-index/site-discovery-by-language.sql index 73bdc40..05eaa2d 100644 --- a/src/sql/examples/cc-index/site-discovery-by-language.sql +++ b/src/sql/examples/cc-index/site-discovery-by-language.sql @@ -82,5 +82,5 @@ SELECT num_pages, FROM tmp WHERE num_pages >= 5 AND regexp_like('(cat|eus|glg|oci)', primary_content_language) - AND (num_pages/total_pages_host) >= .05 + AND (1.0*num_pages/total_pages_host) >= .05 ORDER BY primary_content_language, tld, num_pages DESC;