Add/improve examples queries
- WARC storage metrics by MIME type:
  extract common/frequent file suffixes from URL
- site discovery by language: implicit cast to floating point
  number (address #23)
- select robots.txt records for a given list of domains
sebastian-nagel committed Mar 21, 2023
1 parent 8a062ea commit ccaf0dd
Showing 3 changed files with 57 additions and 3 deletions.
@@ -22,9 +22,14 @@ SELECT COUNT(*) as n_pages,
        SUM(warc_record_length) * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage,
        SUM(case when content_truncated is null then 0 else 1 end) * 100.0 / COUNT(*) as perc_truncated,
        content_mime_detected,
-       histogram(content_truncated) as reason_truncated
+       histogram(content_truncated) as reason_truncated,
+       slice(
+         array_sort(
+           map_entries(map_filter(histogram(regexp_extract(url_path, '\.[a-zA-Z0-9_-]{1,7}$')), (k, v) -> v > 4)),
+           (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))),
+         1, 25) as common_url_path_suffixes
 FROM "ccindex"."ccindex"
-WHERE crawl = 'CC-MAIN-2022-05'
+WHERE crawl = 'CC-MAIN-2023-06'
   AND subset = 'warc'
 GROUP BY content_mime_detected
 ORDER BY n_pages DESC;
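
For illustration, a minimal sketch of what the added suffix-ranking expression computes, run against hypothetical inline data (the sample URL paths are made up) and with the frequency threshold lowered from v > 4 to v > 1 so that the tiny sample yields a result; the syntax is the same Athena/Presto dialect as the example queries:

-- count URL path suffixes, keep only the frequent ones, sort the
-- (suffix, count) entries by descending count, and take at most 25
WITH sample (url_path) AS (
  VALUES ('/index.html'), ('/doc.pdf'), ('/page.html'), ('/about.html')
)
SELECT slice(
         array_sort(
           map_entries(map_filter(histogram(regexp_extract(url_path, '\.[a-zA-Z0-9_-]{1,7}$')), (k, v) -> v > 1)),
           (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))),
         1, 25) AS common_url_path_suffixes
FROM sample;
-- expected: only '.html' (3 occurrences) passes the filter, so the
-- result is a single-element array of one (suffix, count) entry

In the modified query above, the same expression is evaluated per content_mime_detected group, giving the most common URL path suffixes per MIME type.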
49 changes: 49 additions & 0 deletions src/sql/examples/cc-index/get-records-robotstxt.sql
@@ -0,0 +1,49 @@
--
-- Select robots.txt records for a given list of domains
-- from the robots.txt subset, cf.
-- https://commoncrawl.org/2016/09/robotstxt-and-404-redirect-data-sets/
--
-- * join with a domain list table
--   (here the Alexa top 1 million ranks are used; see
--   count-domains-alexa-top-1m.sql for how to create the
--   table `alexa_top_1m`)
-- * select only the most recent record per same robots.txt URL
-- (the crawler might fetch the robots.txt repeatedly)
-- * extract WARC record locations for later processing
-- of robots.txt files
-- * MIME types (HTTP Content-Type header and identified
-- by content)
-- * fetch time and status
-- * and redirect locations (since CC-MAIN-2019-47)
-- to be able to "follow" redirects
--
WITH allrobots AS (
  SELECT alexa.site,
         alexa.rank,
         cc.url_host_tld,
         cc.url_host_registered_domain,
         cc.url_host_name,
         cc.url,
         cc.fetch_time,
         cc.fetch_status,
         cc.warc_filename,
         cc.warc_record_offset,
         cc.warc_record_length,
         cc.fetch_redirect,
         cc.content_mime_type,
         cc.content_mime_detected,
         -- enumerate records of same URL, most recent first
         ROW_NUMBER() OVER(PARTITION BY cc.url ORDER BY cc.fetch_time DESC) AS n
  FROM "ccindex"."ccindex" AS cc
    RIGHT OUTER JOIN "ccindex"."alexa_top_1m" AS alexa
    ON alexa.domain = cc.url_host_registered_domain
  WHERE cc.crawl = 'CC-MAIN-2022-33'
    AND cc.subset = 'robotstxt'
    -- skip host names which differ from the domain name except for an optional "www." prefix
    AND (length(cc.url_host_name) = length(cc.url_host_registered_domain)
         OR (length(cc.url_host_name) = (length(cc.url_host_registered_domain)+4)
             AND substr(cc.url_host_name, 1, 4) = 'www.')))
SELECT *
FROM allrobots
-- select only the first (most recent) record of the same URL
WHERE allrobots.n = 1;
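
The warc_filename, warc_record_offset and warc_record_length columns are the WARC record locations mentioned in the header comment. A small usage sketch, assuming the query above has been materialized (e.g. as a table or view named allrobots, since the CTE is only visible within its own statement) and that only successfully fetched records are of interest (the fetch_status = 200 filter is an assumption, not part of the committed query):

-- keep only successfully fetched robots.txt records and the columns
-- needed to read the raw WARC records later
SELECT url,
       warc_filename,
       warc_record_offset,
       warc_record_length
FROM allrobots
WHERE n = 1
  AND fetch_status = 200;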
2 changes: 1 addition & 1 deletion src/sql/examples/cc-index/site-discovery-by-language.sql
@@ -82,5 +82,5 @@ SELECT num_pages,
 FROM tmp
 WHERE num_pages >= 5
   AND regexp_like('(cat|eus|glg|oci)', primary_content_language)
-  AND (num_pages/total_pages_host) >= .05
+  AND (1.0*num_pages/total_pages_host) >= .05
 ORDER BY primary_content_language, tld, num_pages DESC;
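
A minimal illustration of the fix (the constants 5 and 120 are arbitrary example values): without the leading 1.0, num_pages/total_pages_host is evaluated as integer division and truncates to 0 whenever num_pages is smaller than total_pages_host, so the >= .05 filter can never match; multiplying by 1.0 implicitly casts the expression to a floating point number, as the commit message notes.

SELECT 5 / 120       AS integer_division,  -- 0
       1.0 * 5 / 120 AS float_division;    -- ~0.0417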
