diff --git a/sql/2024/privacy/ads_accounts_distribution.sql b/sql/2024/privacy/ads_accounts_distribution.sql new file mode 100644 index 00000000000..6f119b86094 --- /dev/null +++ b/sql/2024/privacy/ads_accounts_distribution.sql @@ -0,0 +1,64 @@ +WITH publishers AS ( + SELECT + page, + JSON_QUERY(custom_metrics, '$.ads.ads.account_types') AS ads_account_types, + JSON_QUERY(custom_metrics, '$.ads.app_ads.account_types') AS app_ads_account_types + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' AND + is_root_page = TRUE AND + (CAST(JSON_VALUE(custom_metrics, '$.ads.ads.account_count') AS INT64) > 0 OR + CAST(JSON_VALUE(custom_metrics, '$.ads.app_ads.account_count') AS INT64) > 0) +), ads_accounts AS ( + SELECT + page, + CEIL(CAST(JSON_VALUE(ads_account_types, '$.direct.account_count') AS INT64) / 100) * 100 AS direct_account_count_bucket, + CEIL(CAST(JSON_VALUE(ads_account_types, '$.reseller.account_count') AS INT64) / 100) * 100 AS reseller_account_count_bucket, + COUNT(DISTINCT page) OVER () AS total_pages + FROM publishers +), app_ads_accounts AS ( + SELECT + page, + CEIL(CAST(JSON_VALUE(app_ads_account_types, '$.direct.account_count') AS INT64) / 100) * 100 AS direct_account_count_bucket, + CEIL(CAST(JSON_VALUE(app_ads_account_types, '$.reseller.account_count') AS INT64) / 100) * 100 AS reseller_account_count_bucket, + COUNT(DISTINCT page) OVER () AS total_pages + FROM publishers +) + +SELECT + 'ads' AS source, + 'direct' AS account_type, + direct_account_count_bucket AS account_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM ads_accounts +GROUP BY source, direct_account_count_bucket +UNION ALL +SELECT + 'ads' AS source, + 'reseller' AS account_type, + reseller_account_count_bucket AS account_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM ads_accounts +GROUP BY source, reseller_account_count_bucket +UNION 
ALL +SELECT + 'app_ads' AS source, + 'direct' AS account_type, + direct_account_count_bucket AS account_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM app_ads_accounts +GROUP BY source, direct_account_count_bucket +UNION ALL +SELECT + 'app_ads' AS source, + 'reseller' AS account_type, + reseller_account_count_bucket AS account_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM app_ads_accounts +GROUP BY source, reseller_account_count_bucket + +ORDER BY account_count_bucket ASC +LIMIT 1000 diff --git a/sql/2024/privacy/ads_and_sellers_graph.sql b/sql/2024/privacy/ads_and_sellers_graph.sql new file mode 100644 index 00000000000..9b999ecfefa --- /dev/null +++ b/sql/2024/privacy/ads_and_sellers_graph.sql @@ -0,0 +1,114 @@ +WITH RECURSIVE pages AS ( + SELECT + CASE page -- Publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain. CASE needs to be replaced with a more robust solution from HTTPArchive/custom-metrics#136. 
+ WHEN 'https://www.chunkbase.com/' THEN 'cafemedia.com' + ELSE NET.REG_DOMAIN(page) + END AS page_domain, + JSON_QUERY(ANY_VALUE(custom_metrics), '$.ads') AS ads_metrics + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY page_domain +), ads AS ( + SELECT + page_domain, + JSON_QUERY(ads_metrics, '$.ads.account_types') AS ad_accounts + FROM pages + WHERE + CAST(JSON_VALUE(ads_metrics, '$.ads.account_count') AS INT64) > 0 +), sellers AS ( + SELECT + page_domain, + JSON_QUERY(ads_metrics, '$.sellers.seller_types') AS ad_sellers + FROM pages + WHERE + CAST(JSON_VALUE(ads_metrics, '$.sellers.seller_count') AS INT64) > 0 +), relationships_web AS ( + SELECT + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS demand, + 'Web' AS supply, + 'direct' AS relationship, + page_domain AS publisher + FROM ads, UNNEST(JSON_VALUE_ARRAY(ad_accounts, '$.direct.domains')) AS domain + UNION ALL + SELECT + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS demand, + 'Web' AS supply, + 'indirect' AS relationship, + page_domain AS publisher + FROM ads, UNNEST(JSON_VALUE_ARRAY(ad_accounts, '$.reseller.domains')) AS domain + UNION ALL + SELECT + page_domain AS demand, + 'Web' AS supply, + 'direct' AS relationship, + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS publisher + FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.publisher.domains')) AS domain + UNION ALL + SELECT + page_domain AS demand, + 'Web' AS supply, + 'direct' AS relationship, + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS publisher + FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.both.domains')) AS domain +), relationships_adtech AS ( + SELECT + page_domain AS demand, + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), 
r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS supply, + 'indirect' AS relationship + FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.intermediary.domains')) AS domain + UNION ALL + SELECT + page_domain AS demand, + NET.REG_DOMAIN(REGEXP_EXTRACT(NORMALIZE_AND_CASEFOLD(domain), r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')) AS supply, + 'indirect' AS relationship + FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.both.domains')) AS domain +), nodes AS ( + ( + SELECT + demand, + supply, + CONCAT(demand, '-', supply) AS path, + relationship, + HLL_COUNT.INIT(publisher) AS supply_sketch + FROM relationships_web + GROUP BY demand, supply, relationship + ) + UNION ALL + ( + SELECT + relationships_grouped.demand AS demand, + relationships_grouped.supply AS supply, + CONCAT(relationships_grouped.demand, '-', nodes.path) AS path, + relationships_grouped.relationship AS relationship, + nodes.supply_sketch AS supply_sketch + FROM ( + SELECT + demand, + supply, + relationship + FROM relationships_adtech + GROUP BY + demand, + supply, + relationship + ) AS relationships_grouped + INNER JOIN nodes + ON relationships_grouped.supply = nodes.demand AND + nodes.supply_sketch IS NOT NULL AND + nodes.relationship = 'indirect' AND + relationships_grouped.demand IS NOT NULL AND + STRPOS(nodes.path, relationships_grouped.demand) = 0 + ) +) + +SELECT + supply, + demand, + HLL_COUNT.MERGE(supply_sketch) AS publishers_count, + relationship, + path +FROM nodes +GROUP BY demand, supply, relationship, path +ORDER BY publishers_count DESC +LIMIT 5000 diff --git a/sql/2024/privacy/ads_lines_distribution.sql b/sql/2024/privacy/ads_lines_distribution.sql new file mode 100644 index 00000000000..b18be9b15f8 --- /dev/null +++ b/sql/2024/privacy/ads_lines_distribution.sql @@ -0,0 +1,45 @@ +WITH RECURSIVE pages AS ( + SELECT + CASE page -- publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain + WHEN 'https://www.chunkbase.com/' THEN 'cafemedia.com' + ELSE 
NET.REG_DOMAIN(page) + END AS page, + CAST(JSON_VALUE(custom_metrics, '$.ads.ads.line_count') AS INT64) AS ads_line_count, + CAST(JSON_VALUE(custom_metrics, '$.ads.app_ads.line_count') AS INT64) AS app_ads_line_count + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' AND + is_root_page = TRUE +), ads AS ( + SELECT + page, + CEIL(ads_line_count / 100) * 100 AS line_count_bucket, + COUNT(DISTINCT page) OVER () AS total_pages + FROM pages + WHERE ads_line_count > 0 +), app_ads AS ( + SELECT + page, + CEIL(app_ads_line_count / 100) * 100 AS line_count_bucket, + COUNT(DISTINCT page) OVER () AS total_pages + FROM pages + WHERE app_ads_line_count > 0 +) + +SELECT + 'ads.txt' AS type, + line_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM ads +GROUP BY line_count_bucket +HAVING line_count_bucket <= 10000 +UNION ALL +SELECT + 'app-ads.txt' AS type, + line_count_bucket, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM app_ads +GROUP BY line_count_bucket +HAVING line_count_bucket <= 10000 +ORDER BY type, line_count_bucket ASC diff --git a/sql/2024/privacy/ccpa_most_common_phrases.sql b/sql/2024/privacy/ccpa_most_common_phrases.sql new file mode 100644 index 00000000000..f7ef9daf7ae --- /dev/null +++ b/sql/2024/privacy/ccpa_most_common_phrases.sql @@ -0,0 +1,31 @@ +WITH pages_with_phrase AS ( + SELECT + client, + rank_grouping, + page, + COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group, + JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases + FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT) + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping + WHERE date = '2024-06-01' AND + is_root_page = true AND + rank <= rank_grouping AND + array_length(JSON_QUERY_ARRAY(custom_metrics, 
'$.privacy.ccpa_link.CCPALinkPhrases')) > 0 +) + +SELECT + client, + rank_grouping, + link_phrase, + COUNT(DISTINCT page) AS num_pages, + COUNT(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages +FROM pages_with_phrase, + UNNEST(ccpa_link_phrases) AS link_phrase +GROUP BY + link_phrase, + rank_grouping, + client +ORDER BY + rank_grouping, + client, + num_pages DESC diff --git a/sql/2024/privacy/ccpa_prevalence.sql b/sql/2024/privacy/ccpa_prevalence.sql new file mode 100644 index 00000000000..e3ac8946e9d --- /dev/null +++ b/sql/2024/privacy/ccpa_prevalence.sql @@ -0,0 +1,27 @@ +WITH pages AS ( + SELECT + client, + rank_grouping, + page, + JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link + FROM `httparchive.all.pages`, -- TABLESAMPLE SYSTEM (0.0025 PERCENT) + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping + WHERE date = '2024-06-01' AND + is_root_page = true AND + rank <= rank_grouping +) + +SELECT + client, + rank_grouping, + has_ccpa_link, + COUNT(DISTINCT page) AS num_pages +FROM pages +GROUP BY + has_ccpa_link, + rank_grouping, + client +ORDER BY + rank_grouping, + client, + has_ccpa_link diff --git a/sql/2024/privacy/common_ads_variables.sql b/sql/2024/privacy/common_ads_variables.sql new file mode 100644 index 00000000000..8bc07e5f822 --- /dev/null +++ b/sql/2024/privacy/common_ads_variables.sql @@ -0,0 +1,29 @@ +WITH RECURSIVE pages AS ( + SELECT + page, + JSON_QUERY(custom_metrics, '$.ads.ads') AS ads_metrics + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + CAST(JSON_VALUE(custom_metrics, '$.ads.ads.account_count') AS INT64) > 0 +), ads AS ( + SELECT + page, + variable, + COUNT(DISTINCT page) OVER() AS total_publishers + FROM pages, + UNNEST(JSON_VALUE_ARRAY(ads_metrics, '$.variables')) AS variable + WHERE + CAST(JSON_VALUE(ads_metrics, '$.account_types.reseller.account_count') AS INT64) > 0 OR + CAST(JSON_VALUE(ads_metrics, 
'$.account_types.direct.account_count') AS INT64) > 0 +) + +SELECT + variable, + COUNT(DISTINCT page) / ANY_VALUE(total_publishers) AS pct_publishers, + COUNT(DISTINCT page) AS number_of_publishers +FROM ads +GROUP BY variable +ORDER BY pct_publishers DESC +LIMIT 100 diff --git a/sql/2024/privacy/cookies_top_first_party_names.sql b/sql/2024/privacy/cookies_top_first_party_names.sql new file mode 100644 index 00000000000..1073d24c33a --- /dev/null +++ b/sql/2024/privacy/cookies_top_first_party_names.sql @@ -0,0 +1,35 @@ +-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites. + +WITH pages AS ( + SELECT + client, + root_page, + custom_metrics, + COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' +), cookies AS ( + SELECT + client, + cookie, + NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_domains + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie +) + +SELECT + client, + COUNT(DISTINCT firstparty_host) AS domain_count, + COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, + JSON_VALUE(cookie, '$.name') AS cookie_name +FROM cookies +WHERE firstparty_host LIKE '%' || cookie_host +GROUP BY + client, + cookie_name +ORDER BY + domain_count DESC, + client DESC +LIMIT 500 diff --git a/sql/2024/privacy/cookies_top_third_party_domains.sql b/sql/2024/privacy/cookies_top_third_party_domains.sql new file mode 100644 index 00000000000..ff5236f3bff --- /dev/null +++ b/sql/2024/privacy/cookies_top_third_party_domains.sql @@ -0,0 +1,35 @@ +WITH pages AS ( + SELECT + page, + client, + root_page, + custom_metrics, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' +), cookies AS ( + SELECT + client, + page, + cookie, + 
NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_pages + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie +) + +SELECT + client, + cookie_host, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM cookies +WHERE firstparty_host NOT LIKE '%' || cookie_host +GROUP BY + client, + cookie_host +ORDER BY + page_count DESC, + client +LIMIT 500 diff --git a/sql/2024/privacy/cookies_top_third_party_names.sql b/sql/2024/privacy/cookies_top_third_party_names.sql new file mode 100644 index 00000000000..4f37892974f --- /dev/null +++ b/sql/2024/privacy/cookies_top_third_party_names.sql @@ -0,0 +1,35 @@ +-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that set cookies using many domains. + +WITH pages AS ( + SELECT + client, + root_page, + custom_metrics, + COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' +), +cookies AS ( + SELECT + client, + cookie, + NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_domains + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie +) +SELECT + client, + COUNT(DISTINCT firstparty_host) AS domain_count, + COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, + JSON_VALUE(cookie, '$.name') AS cookie_name +FROM cookies +WHERE firstparty_host NOT LIKE '%' || cookie_host +GROUP BY + client, + cookie_name +ORDER BY + domain_count DESC, + client DESC +LIMIT 500 diff --git a/sql/2024/privacy/easylist-tracker-detection.sql b/sql/2024/privacy/easylist-tracker-detection.sql new file mode 100644 index 00000000000..c71dbbae764 --- /dev/null +++ b/sql/2024/privacy/easylist-tracker-detection.sql @@ -0,0 +1,38 @@ +CREATE TEMP FUNCTION +CheckDomainInURL(url STRING, domain STRING) 
+RETURNS INT64 +LANGUAGE js AS """ + return url.includes(domain) ? 1 : 0; +"""; + +-- We need to use the `easylist_adservers.csv` to populate the table to get the list of domains to block +-- https://github.com/easylist/easylist/blob/master/easylist/easylist_adservers.txt +WITH easylist_data AS ( + SELECT string_field_0 + FROM `httparchive.almanac.easylist_adservers` +), +requests_data AS ( + SELECT url + FROM `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +), +block_status AS ( + SELECT + r.url, + MAX( + CASE + WHEN CheckDomainInURL(r.url, e.string_field_0) = 1 THEN 1 + ELSE 0 + END + ) AS should_block + FROM requests_data r + LEFT JOIN easylist_data e + ON CheckDomainInURL(r.url, e.string_field_0) = 1 + GROUP BY r.url +) +SELECT + COUNT(0) AS blocked_url_count +FROM block_status +WHERE should_block = 1; diff --git a/sql/2024/privacy/fingerprinting_most_common_apis.sql b/sql/2024/privacy/fingerprinting_most_common_apis.sql new file mode 100644 index 00000000000..f4ba43c5cb7 --- /dev/null +++ b/sql/2024/privacy/fingerprinting_most_common_apis.sql @@ -0,0 +1,35 @@ +CREATE TEMP FUNCTION getFingerprintingTypes(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ +if (input) { + try { + return Object.keys(JSON.parse(input)) + } catch (e) { + return [] + } +} else { + return [] +} +"""; + +WITH pages AS ( + SELECT + client, + page, + fingerprinting_type, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages`, + UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type + WHERE date = '2024-06-01' +) +SELECT + client, + fingerprinting_type, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages +GROUP BY + client, + fingerprinting_type +ORDER BY + page_count DESC diff --git a/sql/2024/privacy/fingerprinting_most_common_scripts.sql 
b/sql/2024/privacy/fingerprinting_most_common_scripts.sql new file mode 100644 index 00000000000..84d3ea02e4d --- /dev/null +++ b/sql/2024/privacy/fingerprinting_most_common_scripts.sql @@ -0,0 +1,23 @@ +WITH pages AS ( + SELECT + page, + client, + custom_metrics, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' +) + +SELECT + client, + script, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script +GROUP BY + client, + script +ORDER BY + page_count DESC +LIMIT 100; diff --git a/sql/2024/privacy/fingerprinting_script_count.sql b/sql/2024/privacy/fingerprinting_script_count.sql new file mode 100644 index 00000000000..07a6038e03a --- /dev/null +++ b/sql/2024/privacy/fingerprinting_script_count.sql @@ -0,0 +1,21 @@ +WITH pages AS ( + SELECT + page, + client, + ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' +) + +SELECT + script_count, + client, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages +GROUP BY + script_count, + client +ORDER BY + script_count ASC; diff --git a/sql/2024/privacy/most_common_client_hints.sql b/sql/2024/privacy/most_common_client_hints.sql new file mode 100644 index 00000000000..87330675a92 --- /dev/null +++ b/sql/2024/privacy/most_common_client_hints.sql @@ -0,0 +1,50 @@ +# Pages that use Client Hints +WITH response_headers AS ( + SELECT + client, + page, + LOWER(response_header.name) AS header_name, + LOWER(response_header.value) AS header_value, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites + FROM `httparchive.all.requests`, + 
UNNEST(response_headers) response_header + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + is_main_document = TRUE +), meta_tags AS ( + SELECT + client, + page, + LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name, + LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value + FROM ( + SELECT + client, + page, + JSON_QUERY(custom_metrics, '$.almanac') AS metrics + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE + ), + UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node + WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL +) + +SELECT + client, + IF(header_name = 'accept-ch', header_value, tag_value) AS value, + COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM response_headers +FULL OUTER JOIN meta_tags +USING (client, page) +WHERE + header_name = 'accept-ch' OR + tag_name = 'accept-ch' +GROUP BY + client, + value +ORDER BY pct_pages DESC +LIMIT 200 diff --git a/sql/2024/privacy/most_common_cmps_for_iab_tcf_v2.sql b/sql/2024/privacy/most_common_cmps_for_iab_tcf_v2.sql new file mode 100644 index 00000000000..f2424880804 --- /dev/null +++ b/sql/2024/privacy/most_common_cmps_for_iab_tcf_v2.sql @@ -0,0 +1,27 @@ +# Counts of CMPs using IAB Transparency & Consent Framework +# cf. 
https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata +# CMP vendor list: https://iabeurope.eu/cmp-list/ + +WITH cmps AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.privacy.iab_tcf_v2.data.cmpId') AS cmpId, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +) + +SELECT + client, + cmpId, + COUNT(0) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(0) AS number_of_pages +FROM cmps +GROUP BY + client, + cmpId +ORDER BY + pct_pages DESC diff --git a/sql/2024/privacy/most_common_cname_domains.sql b/sql/2024/privacy/most_common_cname_domains.sql new file mode 100644 index 00000000000..56847ebec2d --- /dev/null +++ b/sql/2024/privacy/most_common_cname_domains.sql @@ -0,0 +1,63 @@ +# Most common CNAME domains + +CREATE TEMP FUNCTION convert_cname_json(json_str STRING) +RETURNS ARRAY<STRUCT<origin STRING, cname STRING>> +LANGUAGE js AS """ + const obj = JSON.parse(json_str); + const result = []; + for (const key in obj) { + result.push({ + origin: key, + cname: obj[key] + }); + } + return result; +"""; + +WITH whotracksme AS ( + SELECT DISTINCT + domain + FROM `httparchive.almanac.whotracksme` + WHERE date = '2024-06-01' AND + category IN ('advertising', 'site_analytics') +), cnames AS ( + SELECT + client, + NET.REG_DOMAIN(cnames.cname) AS cname_domain, + COUNT(DISTINCT NET.REG_DOMAIN(cnames.origin)) AS number_of_request_domains, + COUNT(DISTINCT page) AS number_of_pages + --ARRAY_AGG(DISTINCT cnames.origin LIMIT 2) AS request_domain_examples, + --ARRAY_AGG(DISTINCT page LIMIT 2) AS page_examples, + FROM `httparchive.all.pages`, + UNNEST(convert_cname_json(JSON_QUERY(custom_metrics, '$.privacy.request_hostnames_with_cname'))) AS cnames + WHERE date = '2024-06-01' AND + is_root_page = TRUE AND + NET.REG_DOMAIN(cnames.origin) = NET.REG_DOMAIN(page) AND + NET.REG_DOMAIN(cnames.cname) != 
NET.REG_DOMAIN(page) + GROUP BY + client, + cname_domain +), pages_total AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY client +) + +SELECT + client, + cnames.cname_domain AS cname, + number_of_request_domains, + number_of_pages, + (number_of_pages / total_pages) AS pct_pages +--request_domain_examples, +--page_examples +FROM cnames +LEFT JOIN pages_total +USING (client) +INNER JOIN whotracksme +ON cnames.cname_domain = whotracksme.domain +ORDER BY number_of_pages DESC diff --git a/sql/2024/privacy/most_common_countries_for_iab_tcf_v2.sql b/sql/2024/privacy/most_common_countries_for_iab_tcf_v2.sql new file mode 100644 index 00000000000..1c493bfd161 --- /dev/null +++ b/sql/2024/privacy/most_common_countries_for_iab_tcf_v2.sql @@ -0,0 +1,41 @@ +# Counts of countries for publishers using IAB Transparency & Consent Framework +# cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata +# "Country code of the country that determines the legislation of +# reference. Normally corresponds to the country code of the country +# in which the publisher's business entity is established." 
+ +WITH totals AS ( + SELECT + client, + COUNT(0) AS total_websites + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY client +), cmps AS ( + SELECT + client, + JSON_VALUE(custom_metrics, '$.privacy.iab_tcf_v2.data.publisherCC') AS publisherCC, + COUNT(0) AS number_of_pages + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + JSON_VALUE(custom_metrics, '$.privacy.iab_tcf_v2.data.publisherCC') IS NOT NULL + GROUP BY + client, + publisherCC +) + +SELECT + client, + publisherCC, + number_of_pages / total_websites AS pct_pages, + number_of_pages +FROM cmps +JOIN totals +USING (client) +ORDER BY + client, + pct_pages DESC diff --git a/sql/2024/privacy/most_common_referrer_policy.sql b/sql/2024/privacy/most_common_referrer_policy.sql new file mode 100644 index 00000000000..0e50f6b37fe --- /dev/null +++ b/sql/2024/privacy/most_common_referrer_policy.sql @@ -0,0 +1,63 @@ +# Most common values for Referrer-Policy (at site level) + +WITH totals AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY client +), + +referrer_policy_custom_metrics AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy') AS policy_meta + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +), + +response_headers AS ( + SELECT + client, + page, + LOWER(response_header.name) AS name, + LOWER(response_header.value) AS value + FROM `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + date = '2024-06-01' AND + is_main_document = TRUE +), + +referrer_policy_headers AS ( + SELECT + client, + page, + value AS policy_header + FROM response_headers + WHERE + name = 'referrer-policy' +) + +SELECT + client, + COALESCE(policy_header, policy_meta) AS policy, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) 
AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM referrer_policy_custom_metrics +FULL OUTER JOIN referrer_policy_headers +USING (client, page) +JOIN totals +USING (client) +GROUP BY + client, + policy +ORDER BY + pct_pages DESC +LIMIT 100 diff --git a/sql/2024/privacy/most_common_strings_for_iab_usp.sql b/sql/2024/privacy/most_common_strings_for_iab_usp.sql new file mode 100644 index 00000000000..7510cf6cb54 --- /dev/null +++ b/sql/2024/privacy/most_common_strings_for_iab_usp.sql @@ -0,0 +1,27 @@ +# Counts of US Privacy String values for websites using IAB US Privacy Framework +# cf. https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md + +WITH usp_data AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.privacy.iab_usp.privacy_string.uspString') AS uspString, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS pages_total + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +) + +SELECT + client, + uspString, + COUNT(DISTINCT page) / ANY_VALUE(pages_total) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM usp_data +GROUP BY + client, + uspString +ORDER BY + pct_pages DESC +LIMIT 100 diff --git a/sql/2024/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql b/sql/2024/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql new file mode 100644 index 00000000000..639f7e582b9 --- /dev/null +++ b/sql/2024/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql @@ -0,0 +1,93 @@ +#standardSQL +# Number of Attribution Reporting API Destinations (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) + +-- Extracting third-parties observed using ARA API on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; 
+ +-- Extracting ARA API source registration details being passed by a given third-party (passed AS "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + const result = []; + + values.forEach(value => { + if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) { + const parts = value.replace('attribution-reporting-register-source|', '').split('|'); + parts.forEach(part => { + if (part.startsWith('destination=')) { + const destinations = part.replace('destination=', '').split(','); + destinations.forEach(destination => { + result.push('destination=' + destination.trim()); + }); + } else { + result.push(part.trim()); + } + }); + } + }); + + return result; +"""; + +WITH ara_features AS ( + SELECT + client, + CASE + WHEN rank <= 1000 THEN '1000' + WHEN rank <= 10000 THEN '10000' + WHEN rank <= 100000 THEN '100000' + WHEN rank <= 1000000 THEN '1000000' + WHEN rank <= 10000000 THEN '10000000' + ELSE 'Other' + END AS rank_group, + NET.REG_DOMAIN(page) AS publisher, + CASE + WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', '')) + ELSE NULL + END AS destination, + third_party_domain + FROM `httparchive.all.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + ara LIKE 'destination%' +) +SELECT + client, + rank_group, + COUNT(destination) AS total_destinations, + COUNT(DISTINCT destination) AS distinct_destinations, + ROUND(COUNT(DISTINCT destination) * 100 / COUNT(destination), 2) AS destination_pct, + COUNT(third_party_domain) AS total_third_party_domains, + COUNT(DISTINCT third_party_domain) AS 
distinct_third_party_domains, + ROUND(COUNT(DISTINCT third_party_domain) * 100 / COUNT(third_party_domain), 2) AS third_party_domain_pct, + COUNT(publisher) AS total_publishers, + COUNT(DISTINCT publisher) AS distinct_publishers, + ROUND(COUNT(DISTINCT publisher) * 100 / COUNT(publisher), 2) AS publisher_pct +FROM ara_features +WHERE destination IS NOT NULL AND third_party_domain IS NOT NULL +GROUP BY client, rank_group +ORDER BY + client, + CASE rank_group + WHEN '1000' THEN 1 + WHEN '10000' THEN 2 + WHEN '100000' THEN 3 + WHEN '1000000' THEN 4 + WHEN '10000000' THEN 5 + ELSE 6 + END; diff --git a/sql/2024/privacy/number_of_privacy_sandbox_attested_domains.sql b/sql/2024/privacy/number_of_privacy_sandbox_attested_domains.sql new file mode 100644 index 00000000000..97c13be2213 --- /dev/null +++ b/sql/2024/privacy/number_of_privacy_sandbox_attested_domains.sql @@ -0,0 +1,44 @@ +#standardSQL +# Privacy Sandbox Attestation and Related Websites JSON status (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) + +WITH wellknown AS ( + SELECT + client, + NET.HOST(page) AS host, + CASE + WHEN rank <= 1000 THEN '1000' + WHEN rank <= 10000 THEN '10000' + WHEN rank <= 100000 THEN '100000' + WHEN rank <= 1000000 THEN '1000000' + WHEN rank <= 10000000 THEN '10000000' + ELSE 'Other' + END AS rank_group, + CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/related-website-set.json".found') AS BOOL) AS rws, + CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/privacy-sandbox-attestations.json".found') AS BOOL) AS attestation + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +) + +SELECT + client, + rank_group, + SUM(CASE WHEN rws THEN 1 ELSE 0 END) AS related_websites_set, + SUM(CASE WHEN attestation THEN 1 ELSE 0 END) AS privacy_sandbox_attestation +FROM + wellknown +WHERE + rws OR attestation +GROUP BY client, rank_group +ORDER BY + client, + CASE rank_group + WHEN '1000' 
THEN 1 + WHEN '10000' THEN 2 + WHEN '100000' THEN 3 + WHEN '1000000' THEN 4 + WHEN '10000000' THEN 5 + ELSE 6 + END; diff --git a/sql/2024/privacy/number_of_websites_per_technology.sql b/sql/2024/privacy/number_of_websites_per_technology.sql new file mode 100644 index 00000000000..38433122df4 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_per_technology.sql @@ -0,0 +1,34 @@ +WITH technologies AS ( + SELECT + client, + page, + category, + technology, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites + FROM `httparchive.all.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS category + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +) + +SELECT + client, + technology, + COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages, + ARRAY_AGG(DISTINCT category) AS categories +FROM technologies +WHERE + category IN ( + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation', + 'Cookie compliance' + ) +GROUP BY + client, + technology +ORDER BY + pct_pages DESC diff --git a/sql/2024/privacy/number_of_websites_per_technology_category.sql b/sql/2024/privacy/number_of_websites_per_technology_category.sql new file mode 100644 index 00000000000..648d1cda6a0 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_per_technology_category.sql @@ -0,0 +1,22 @@ +SELECT + client, + category, + COUNT(DISTINCT IF(category = tech_category, page, NULL)) / COUNT(DISTINCT page) AS pct_pages, + COUNT(DISTINCT IF(category = tech_category, page, NULL)) AS number_of_pages +FROM `httparchive.all.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS tech_category, + UNNEST([ + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation', + 'Cookie compliance' + ]) AS category +WHERE + date = '2024-06-01' AND + 
is_root_page = TRUE +GROUP BY + client, + category +ORDER BY + pct_pages DESC diff --git a/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql b/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql new file mode 100644 index 00000000000..eeaebeeda1e --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql @@ -0,0 +1,90 @@ +-- Detection logic explained: +-- https://github.com/privacycg/proposals/issues/6 +-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md +WITH redirect_requests AS ( + SELECT + client, + url, + index, + response_headers, + page + FROM `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + type NOT IN ('css', 'image', 'font', 'video', 'audio') AND + LEFT(JSON_VALUE(summary, '$.status'), 1) = '3' AND + index <= 2 +), navigation_redirect AS ( + -- Find the first navigation redirect + SELECT + client, + url, + page, + headers.value AS navigation_redirect_location + FROM redirect_requests, + UNNEST(response_headers) AS headers + WHERE + index = 1 AND + LOWER(headers.name) = 'location' AND + NET.REG_DOMAIN(page) != NET.REG_DOMAIN(headers.value) +), bounce_redirect AS ( + -- Find the second navigation redirect + SELECT + client, + url, + page, + headers.value AS bounce_redirect_location, + response_headers + FROM redirect_requests, + UNNEST(response_headers) AS headers + WHERE + index = 2 AND + LOWER(headers.name) = 'location' AND + NET.REG_DOMAIN(headers.value) = NET.REG_DOMAIN(page) +), bounce_redirect_with_cookies AS ( + -- Find the cookies set during the second navigation redirect + SELECT + client, + url, + page, + bounce_redirect_location + --response_headers.value AS bounce_tracking_cookies + FROM bounce_redirect, + UNNEST(response_headers) AS response_headers + WHERE + LOWER(response_headers.name) = 'set-cookie' +), bounce_sequences AS ( + -- Combine the first and second navigation redirects + SELECT + nav.client, + nav.page, + 
nav.url AS navigation_url, + nav.navigation_redirect_location, + bounce.bounce_redirect_location + --ARRAY_AGG(bounce.bounce_tracking_cookies) AS bounce_tracking_cookies + FROM navigation_redirect AS nav + LEFT JOIN bounce_redirect_with_cookies AS bounce + ON + nav.client = bounce.client AND + nav.page = bounce.page AND + nav.navigation_redirect_location = bounce.url + WHERE bounce_redirect_location IS NOT NULL + GROUP BY + nav.client, + page, + navigation_url, + navigation_redirect_location, + bounce_redirect_location +) + +-- Count the number of websites with bounce tracking per bounce hostname +SELECT + client, + NET.HOST(navigation_redirect_location) AS bounce_hostname, + COUNT(DISTINCT page) AS number_of_pages +--ARRAY_AGG(page LIMIT 2) AS page_examples +FROM bounce_sequences +GROUP BY client, bounce_hostname +ORDER BY number_of_pages DESC +LIMIT 100 diff --git a/sql/2024/privacy/number_of_websites_with_client_hints.sql b/sql/2024/privacy/number_of_websites_with_client_hints.sql new file mode 100644 index 00000000000..4a70eff6aa2 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_client_hints.sql @@ -0,0 +1,42 @@ +WITH response_headers AS ( + SELECT + client, + page, + LOWER(response_header.name) AS header_name, + LOWER(response_header.value) AS header_value, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites + FROM `httparchive.all.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2024-06-01' AND + is_main_document = TRUE +), meta_tags AS ( + SELECT + client, + page, + LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name, + LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value + FROM ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.almanac') AS metrics + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' + ), + UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node + WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL +) + +SELECT + client, + COUNT(DISTINCT page) / 
ANY_VALUE(total_websites) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM response_headers +FULL OUTER JOIN meta_tags +USING (client, page) +WHERE + header_name = 'accept-ch' OR + tag_name = 'accept-ch' +GROUP BY client +ORDER BY pct_pages DESC diff --git a/sql/2024/privacy/number_of_websites_with_dnt.sql b/sql/2024/privacy/number_of_websites_with_dnt.sql new file mode 100644 index 00000000000..2d430867e6d --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_dnt.sql @@ -0,0 +1,34 @@ +# Pages that request DNT status + +WITH blink AS ( + SELECT DISTINCT + client, + num_urls, + pct_urls + FROM `httparchive.blink_features.usage` + WHERE + yyyymmdd = '20240601' AND + feature IN ('NavigatorDoNotTrack') +), + +pages AS ( + SELECT + client, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) AS num_urls, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_urls + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY client +) + +SELECT + COALESCE(blink.client, pages.client) AS client, + blink.num_urls AS number_of_pages_usage_per_blink, + blink.pct_urls AS pct_of_websites_usage_per_blink, + pages.num_urls AS number_of_pages_usage_per_custom_metric, + pages.pct_urls AS pct_of_websites_usage_per_custom_metric +FROM blink +FULL OUTER JOIN pages +ON blink.client = pages.client diff --git a/sql/2024/privacy/number_of_websites_with_gpc.sql b/sql/2024/privacy/number_of_websites_with_gpc.sql new file mode 100644 index 00000000000..aca3a65c4d0 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_gpc.sql @@ -0,0 +1,34 @@ +# Pages that provide `/.well-known/gpc.json` for Global Privacy Control + +WITH pages AS ( + SELECT + client, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) / COUNT(DISTINCT page) AS 
pct_pages_well_known, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) AS number_of_pages_well_known, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_js_api, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) AS number_of_pages_js_api + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY client +), + +headers AS ( + SELECT + client, + COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_headers, + COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) AS number_of_pages_headers + FROM `httparchive.all.requests`, + UNNEST(response_headers) headers + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + is_main_document = TRUE + GROUP BY client +) + +SELECT * +FROM pages +FULL OUTER JOIN headers +USING (client) diff --git a/sql/2024/privacy/number_of_websites_with_iab.sql b/sql/2024/privacy/number_of_websites_with_iab.sql new file mode 100644 index 00000000000..ae42d9687e6 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_iab.sql @@ -0,0 +1,60 @@ +# Counts of pages with IAB Frameworks + +WITH privacy_custom_metrics_data AS ( + SELECT + client, + JSON_QUERY(custom_metrics, '$.privacy') AS metrics + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +) + +SELECT + client, + number_of_pages_with_tcfv1 / number_of_pages AS pct_pages_with_tcfv1, + number_of_pages_with_tcfv1, + number_of_pages_with_tcfv2 / number_of_pages AS pct_pages_with_tcfv2, + number_of_pages_with_tcfv2, + number_of_pages_with_usp / number_of_pages AS pct_pages_with_usp, + number_of_pages_with_usp, + number_of_pages_with_tcf / number_of_pages AS pct_pages_with_tcf, + number_of_pages_with_tcf, 
+ number_of_pages_with_any / number_of_pages AS pct_pages_with_any, + number_of_pages_with_any, + number_of_pages_with_tcfv1_compliant / number_of_pages AS pct_pages_with_tcfv1_compliant, + number_of_pages_with_tcfv1_compliant, + number_of_pages_with_tcfv2_compliant / number_of_pages AS pct_pages_with_tcfv2_compliant, + number_of_pages_with_tcfv2_compliant, + number_of_pages_with_gpp / number_of_pages AS pct_pages_with_gpp, + number_of_pages_with_gpp, + number_of_pages_with_gpp_data / number_of_pages AS pct_pages_with_gpp_data, + number_of_pages_with_gpp_data +FROM ( + SELECT + client, + COUNT(0) AS number_of_pages, + COUNTIF(tcfv1) AS number_of_pages_with_tcfv1, + COUNTIF(tcfv2) AS number_of_pages_with_tcfv2, + COUNTIF(usp) AS number_of_pages_with_usp, + COUNTIF(tcfv1 OR tcfv2) AS number_of_pages_with_tcf, + COUNTIF(tcfv1 OR tcfv2 OR usp OR gpp) AS number_of_pages_with_any, + COUNTIF(tcfv1 AND tcfv1_compliant) AS number_of_pages_with_tcfv1_compliant, + COUNTIF(tcfv2 AND tcfv2_compliant) AS number_of_pages_with_tcfv2_compliant, + COUNTIF(gpp) AS number_of_pages_with_gpp, + COUNTIF(gpp_data) AS number_of_pages_with_gpp_data + FROM ( + SELECT + client, + JSON_VALUE(metrics, '$.iab_tcf_v1.present') = 'true' AS tcfv1, + JSON_VALUE(metrics, '$.iab_tcf_v2.present') = 'true' AS tcfv2, + JSON_VALUE(metrics, '$.iab_gpp.present') = 'true' AS gpp, + JSON_VALUE(metrics, '$.iab_usp.present') = 'true' AS usp, + JSON_VALUE(metrics, '$.iab_tcf_v1.compliant_setup') = 'true' AS tcfv1_compliant, + JSON_VALUE(metrics, '$.iab_tcf_v2.compliant_setup') = 'true' AS tcfv2_compliant, + JSON_VALUE(metrics, '$.iab_gpp.data') IS NOT NULL AS gpp_data + FROM + privacy_custom_metrics_data + ) + GROUP BY client +) diff --git a/sql/2024/privacy/number_of_websites_with_referrerpolicy.sql b/sql/2024/privacy/number_of_websites_with_referrerpolicy.sql new file mode 100644 index 00000000000..6d45aa3e92e --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_referrerpolicy.sql @@ -0,0 +1,79 @@ 
+WITH referrer_policy_custom_metrics AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy') AS meta_policy, + ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.referrerPolicy.individual_requests')) > 0 AS individual_requests, + CAST(JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.link_relations.A') AS INT64) > 0 AS link_relations + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +), +referrer_policy_headers AS ( + SELECT + client, + page, + LOWER(response_header.value) AS header_policy + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + is_main_document = TRUE AND + response_header.name = 'referrer-policy' +) + +SELECT + client, + number_of_pages_with_entire_document_policy_meta / number_of_pages AS pct_pages_with_entire_document_policy_meta, + number_of_pages_with_entire_document_policy_meta, + number_of_pages_with_entire_document_policy_header / number_of_pages AS pct_pages_with_entire_document_policy_header, + number_of_pages_with_entire_document_policy_header, + number_of_pages_with_entire_document_policy / number_of_pages AS pct_pages_with_entire_document_policy, + number_of_pages_with_entire_document_policy, + number_of_pages_with_any_individual_requests / number_of_pages AS pct_pages_with_any_individual_requests, + number_of_pages_with_any_individual_requests, + number_of_pages_with_any_link_relations / number_of_pages AS pct_pages_with_any_link_relations, + number_of_pages_with_any_link_relations, + number_of_pages_with_any_referrer_policy / number_of_pages AS pct_pages_with_any_referrer_policy, + number_of_pages_with_any_referrer_policy +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS number_of_pages, + COUNT(DISTINCT IF( + meta_policy IS NOT NULL, + page, NULL)) AS number_of_pages_with_entire_document_policy_meta, + COUNT(DISTINCT IF( + header_policy IS 
NOT NULL, + page, NULL)) AS number_of_pages_with_entire_document_policy_header, + COUNT(DISTINCT IF( + meta_policy IS NOT NULL OR + header_policy IS NOT NULL, + page, NULL) + ) AS number_of_pages_with_entire_document_policy, + COUNT(DISTINCT IF( + individual_requests, + page, NULL)) AS number_of_pages_with_any_individual_requests, + COUNT(DISTINCT IF( + link_relations, + page, NULL)) AS number_of_pages_with_any_link_relations, + COUNT(DISTINCT IF( + meta_policy IS NOT NULL OR + header_policy IS NOT NULL OR + individual_requests OR + link_relations, + page, NULL) + ) AS number_of_pages_with_any_referrer_policy + FROM + referrer_policy_custom_metrics + FULL OUTER JOIN + referrer_policy_headers + USING (client, page) + GROUP BY client +) +ORDER BY + client diff --git a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql new file mode 100644 index 00000000000..10b620620c7 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql @@ -0,0 +1,102 @@ +# Pages that participate in the privacy-relayed origin trials +CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( + SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) +); + +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) +RETURNS STRUCT< + token STRING, + origin STRING, + feature STRING, + expiry TIMESTAMP, + is_subdomain BOOL, + is_third_party BOOL +> AS ( + STRUCT( + DECODE_ORIGIN_TRIAL(token) AS token, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party + ) +); + +WITH pages AS ( + SELECT + client, + page, + 
JSON_QUERY(custom_metrics, '$.origin-trials') AS ot_metrics, + JSON_QUERY(custom_metrics, '$.almanac') AS almanac_metrics + FROM `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +), +response_headers AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(response_header.value) AS ot -- may not lowercase this value as it is a base64 string + FROM `httparchive.all.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + is_main_document = TRUE AND + LOWER(response_header.name) = 'origin-trial' +), +meta_tags AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(JSON_VALUE(meta_node, '$.content')) AS ot -- may not lowercase this value as it is a base64 string + FROM pages, + UNNEST(JSON_QUERY_ARRAY(almanac_metrics, '$.meta-nodes.nodes')) meta_node + WHERE + LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = 'origin-trial' +), +ot_from_custom_metric AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(JSON_VALUE(metric, '$.token')) AS ot + FROM pages, + UNNEST(JSON_QUERY_ARRAY(ot_metrics)) metric +) + +SELECT + client, + feature, + number_of_pages / total_pages AS pct_pages, + number_of_pages, + is_active +FROM ( + SELECT + client, + ot.feature AS feature, + ot.expiry >= CURRENT_TIMESTAMP() AS is_active, + COUNT(DISTINCT page) AS number_of_pages + FROM ( + SELECT * FROM response_headers + UNION ALL + SELECT * FROM meta_tags + UNION ALL + SELECT * FROM ot_from_custom_metric + ) + GROUP BY + client, + feature, + is_active +) +LEFT JOIN ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages + FROM pages + GROUP BY + client +) +USING (client) +ORDER BY + number_of_pages DESC diff --git a/sql/2024/privacy/number_of_websites_with_whotracksme_trackers.sql b/sql/2024/privacy/number_of_websites_with_whotracksme_trackers.sql new file mode 100644 index 00000000000..f8beea9ff04 --- /dev/null +++ b/sql/2024/privacy/number_of_websites_with_whotracksme_trackers.sql @@ -0,0 +1,42 @@ +WITH 
whotracksme AS ( + SELECT + domain, + category, + tracker + FROM `max-ostapenko.Public.whotracksme` + WHERE date = '2024-06-01' +), +pre_aggregated AS ( + SELECT + client, + category, + page, + tracker, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.requests` + JOIN whotracksme + ON NET.REG_DOMAIN(url) = domain + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) -- third party + GROUP BY + client, + category, + tracker, + page +) + +SELECT + client, + category, + tracker, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM pre_aggregated +GROUP BY + client, + category, + tracker +ORDER BY + pct_pages DESC diff --git a/sql/2024/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql b/sql/2024/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql new file mode 100644 index 00000000000..c9d76574569 --- /dev/null +++ b/sql/2024/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql @@ -0,0 +1,146 @@ +#standardSQL +# Adoption of different Privacy Sandbox (PS) features by different third-parties and by different publishers + +-- Extracting third-parties observed using PS APIs on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting PS APIs being called by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + + function splitByDelimiters(value) { + const delimiterRegex = new RegExp(',|, |\\n|\\u0000', 'g'); + return value.split(delimiterRegex).map(v => v.trim()).filter(v => v); + } + + const result = []; + const replacements = { + 'Ch': 'CH', 'Ua': 
'UA', 'Wow64': 'WoW64', 'Dpr': 'DPR', 'Rtt': 'RTT', 'Ect': 'ECT', 'Etc': 'ETC', '-Architecture': '-Arch', '-Arc': '-Arch', '-Archh': '-Arch', + '-Factors': '-Factor', '-ETC': '-ECT', '-Modal': '-Model', '-UA-UA': '-UA', '-UAm': '-UA', 'UAmodel': 'UA-Model', 'UAplatform': 'UA-Platform', 'Secch-UA': 'Sec-CH-UA', + 'CH-Width': 'CH-Viewport-Width', '-UAodel': '-UA-Model', '-Platformua-Platform': '-Platform', '-Platformuser-Agent': '-Platform', '-Version"': '-Version' + }; + values.forEach(value => { + if (value.startsWith('accept-ch|')) { + const parts = splitByDelimiters(value.replace('accept-ch|', '')); + parts.forEach(part => { + if (["UA", "Arch", "Bitness", "Full-Version-List", "Mobile", "Model", "Platform", "Platform-Version", "WoW64"].includes(part)) { + result.push("Sec-CH-UA-" + part); + } else { + let formattedPart = part.split('-').map(segment => + segment.charAt(0).toUpperCase() + segment.slice(1).toLowerCase() + ).join('-'); + for (const [key, value] of Object.entries(replacements)) { + formattedPart = formattedPart.replace(new RegExp(key, 'g'), value); + } + result.push(formattedPart); + } + }); + } else { + result.push(value); + } + }); + + return result; +"""; + +WITH privacy_sandbox_features AS ( + SELECT + client, + CASE + WHEN rank <= 1000 THEN '1000' + WHEN rank <= 10000 THEN '10000' + WHEN rank <= 100000 THEN '100000' + WHEN rank <= 1000000 THEN '1000000' + WHEN rank <= 10000000 THEN '10000000' + ELSE 'Other' + END AS rank_group, + NET.REG_DOMAIN(page) AS publisher, + third_party_domain, + CASE + WHEN api LIKE '%opics%|%' THEN + REPLACE(SUBSTR(api, 0, STRPOS(api, '|') - 1) || '-' || SPLIT(api, '|')[SAFE_OFFSET(1)], '|', '-') + WHEN api LIKE 'attribution-reporting-register-source%' THEN + SPLIT(api, '|')[OFFSET(0)] + ELSE + api + END AS feature + FROM `httparchive.all.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, 
'$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS api + WHERE + date = '2024-06-01' AND + is_root_page = TRUE +), + +grouped_features AS ( + SELECT + rank_group, + feature, + COUNT(DISTINCT publisher) AS publisher_count, + COUNT(DISTINCT third_party_domain) AS third_party_count + FROM privacy_sandbox_features + GROUP BY rank_group, feature +), +aggregated_features AS ( + SELECT + feature, + SUM(CASE WHEN rank_group = '1000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_1000, + SUM(CASE WHEN rank_group = '10000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_10000, + SUM(CASE WHEN rank_group = '100000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_100000, + SUM(CASE WHEN rank_group = '1000000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_1000000, + SUM(CASE WHEN rank_group = '1000000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_1000000, + SUM(CASE WHEN rank_group = '1000000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_1000000, + SUM(CASE 
WHEN rank_group = '1000000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_1000000, + SUM(CASE WHEN rank_group = '10000000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_10000000 + FROM grouped_features + GROUP BY feature +) +SELECT + feature AS privacy_sandbox_features, + total_publisher_leq_1000, + distinct_publisher_leq_1000, + total_third_parties_leq_1000, + distinct_third_parties_leq_1000, + total_publisher_leq_10000, + distinct_publisher_leq_10000, + total_third_parties_leq_10000, + distinct_third_parties_leq_10000, + total_publisher_leq_100000, + distinct_publisher_leq_100000, + total_third_parties_leq_100000, + distinct_third_parties_leq_100000, + total_publisher_leq_1000000, + distinct_publisher_leq_1000000, + total_third_parties_leq_1000000, + distinct_third_parties_leq_1000000, + total_publisher_leq_10000000, + distinct_publisher_leq_10000000, + total_third_parties_leq_10000000, + distinct_third_parties_leq_10000000 +FROM aggregated_features +ORDER BY feature; diff --git a/sql/2024/privacy/top_ara_destinations_registered_by_most_publishers.sql b/sql/2024/privacy/top_ara_destinations_registered_by_most_publishers.sql new file mode 100644 index 00000000000..53959da294c --- /dev/null +++ b/sql/2024/privacy/top_ara_destinations_registered_by_most_publishers.sql @@ -0,0 +1,78 @@ +#standardSQL +# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct publishers (at site level) + +-- Extracting third-parties observed using ARA API on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + 
return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting ARA API source registration details being passed by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + const result = []; + + values.forEach(value => { + if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) { + const parts = value.replace('attribution-reporting-register-source|', '').split('|'); + parts.forEach(part => { + if (part.startsWith('destination=')) { + const destinations = part.replace('destination=', '').split(','); + destinations.forEach(destination => { + result.push('destination=' + destination.trim()); + }); + } else { + result.push(part.trim()); + } + }); + } + }); + + return result; +"""; + +WITH ara_features AS ( + SELECT + client, + CASE + WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', '')) + ELSE NULL + END AS destination, + COUNT(NET.REG_DOMAIN(page)) AS total_publishers, + COUNT(DISTINCT NET.REG_DOMAIN(page)) AS distinct_publishers, + COUNT(third_party_domain) AS total_third_party_domains, + COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains + FROM `httparchive.all.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + ara LIKE 'destination%' + GROUP BY client, destination + HAVING destination IS NOT NULL +), +ranked_features AS ( + SELECT + client, + destination, + total_publishers, + distinct_publishers, + total_third_party_domains, + distinct_third_party_domains, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY distinct_publishers 
DESC) AS publisher_rank + FROM ara_features +) +SELECT * FROM ranked_features +WHERE publisher_rank <= 25 +ORDER BY client, distinct_publishers DESC; diff --git a/sql/2024/privacy/top_ara_destinations_registered_by_most_third_parties.sql b/sql/2024/privacy/top_ara_destinations_registered_by_most_third_parties.sql new file mode 100644 index 00000000000..065d46df0c0 --- /dev/null +++ b/sql/2024/privacy/top_ara_destinations_registered_by_most_third_parties.sql @@ -0,0 +1,78 @@ +#standardSQL +# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct third-parties (at site level) + +-- Extracting third-parties observed using ARA API on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting ARA API source registration details being passed by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + const result = []; + + values.forEach(value => { + if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) { + const parts = value.replace('attribution-reporting-register-source|', '').split('|'); + parts.forEach(part => { + if (part.startsWith('destination=')) { + const destinations = part.replace('destination=', '').split(','); + destinations.forEach(destination => { + result.push('destination=' + destination.trim()); + }); + } else { + result.push(part.trim()); + } + }); + } + }); + + return result; +"""; + +WITH ara_features AS ( + SELECT + client, + CASE + WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', '')) + ELSE NULL + END AS destination, + COUNT(NET.REG_DOMAIN(page)) AS total_publishers, + COUNT(DISTINCT NET.REG_DOMAIN(page)) AS 
distinct_publishers, + COUNT(third_party_domain) AS total_third_party_domains, + COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains + FROM `httparchive.all.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + WHERE + date = '2024-06-01' AND + is_root_page = TRUE AND + ara LIKE 'destination%' + GROUP BY client, destination + HAVING destination IS NOT NULL +), +ranked_features AS ( + SELECT + client, + destination, + total_publishers, + distinct_publishers, + total_third_party_domains, + distinct_third_party_domains, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY distinct_third_party_domains DESC) AS third_party_domain_rank + FROM ara_features +) +SELECT * FROM ranked_features +WHERE third_party_domain_rank <= 25 +ORDER BY client, distinct_third_party_domains DESC; diff --git a/sql/2024/privacy/top_direct_ads_demand.sql b/sql/2024/privacy/top_direct_ads_demand.sql new file mode 100644 index 00000000000..ee96dbe9e38 --- /dev/null +++ b/sql/2024/privacy/top_direct_ads_demand.sql @@ -0,0 +1,63 @@ +WITH RECURSIVE pages AS ( + SELECT + CASE page -- Publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain. CASE needs to be replaced with a more robust solution from HTTPArchive/custom-metrics#136. 
+ WHEN 'https://www.chunkbase.com/' THEN 'cafemedia.com' + ELSE NET.REG_DOMAIN(page) + END AS page_domain, + JSON_QUERY(ANY_VALUE(custom_metrics), '$.ads') AS ads_metrics + FROM `httparchive.all.pages` + WHERE date = '2024-06-01' AND + is_root_page = TRUE + GROUP BY page_domain +), ads AS ( + SELECT + page_domain, + JSON_QUERY(ads_metrics, '$.ads.account_types') AS ad_accounts + FROM pages + WHERE + JSON_VALUE(ads_metrics, '$.ads.account_count') != '0' +), sellers AS ( + SELECT + page_domain, + JSON_QUERY(ads_metrics, '$.sellers.seller_types') AS ad_sellers + FROM pages + WHERE + JSON_VALUE(ads_metrics, '$.sellers.seller_count') != '0' +), relationships AS ( + SELECT + demand, + publisher, + COUNT(DISTINCT publisher) OVER () AS total_publishers + FROM ( + SELECT + NET.REG_DOMAIN(domain) AS demand, + page_domain AS publisher + FROM ads, + UNNEST(JSON_VALUE_ARRAY(ad_accounts, '$.direct.domains')) AS domain + UNION ALL + SELECT + page_domain AS demand, + NET.REG_DOMAIN(domain) AS publisher + FROM sellers, + UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.publisher.domains')) AS domain + UNION ALL + SELECT + page_domain AS demand, + NET.REG_DOMAIN(domain) AS publisher + FROM sellers, + UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.both.domains')) AS domain + ) + GROUP BY + demand, + publisher +) + +SELECT + demand, + COUNT(DISTINCT publisher) / ANY_VALUE(total_publishers) AS pct_publishers, + COUNT(DISTINCT publisher) AS number_of_publishers +FROM relationships +GROUP BY + demand +ORDER BY pct_publishers DESC +LIMIT 100 diff --git a/sql/util/ads_txt_parser.py b/sql/util/ads_txt_parser.py new file mode 100644 index 00000000000..ec134a4888c --- /dev/null +++ b/sql/util/ads_txt_parser.py @@ -0,0 +1,108 @@ +""" +Description: This script is used to parse the sellers.json file from a given URL +and extract the required information. +The extracted information is then written to a file named sellers.json in the current directory. +e.g. 
Google sellers.json url = https://storage.googleapis.com/adx-rtb-dictionaries/sellers.json +""" + +import json +import requests # pylint: disable=import-error + +SELLER_TYPES = ["publisher", "intermediary", "both"] + + +def is_present(response, paths): + """ + Check if any of the given paths are present in the response URL. + + Args: + response (object): The response object. + paths (list): A list of paths to check. + + Returns: + bool: True if any of the paths are present in the response URL, False otherwise. + """ + return any(path in response.url for path in paths) + + +def parse_sellers_json( + url="https://storage.googleapis.com/adx-rtb-dictionaries/sellers.json", +): + """ + Parse the sellers.json file from a given URL. + + Args: + url (str): The URL of the sellers.json file. + + Returns: + dict: A dictionary containing the parsed information. + """ + result = {} + + try: + response = requests.get(url, timeout=10) + response.raise_for_status() + content = response.json() + except (requests.exceptions.RequestException, json.JSONDecodeError): + content = None + + result = { + "present": is_present(response, ["/sellers.json"]), + "redirected": response.history != [], + "status": response.status_code, + } + + if result["present"] and content: + result.update( + { + "seller_count": 0, + "seller_types": { + "publisher": { + "domains": set(), + "seller_count": 0, + }, + "intermediary": { + "domains": set(), + "seller_count": 0, + }, + "both": { + "domains": set(), + "seller_count": 0, + }, + }, + "passthrough_count": 0, + "confidential_count": 0, + } + ) + + result["seller_count"] = len(content.get("sellers", [])) + + for seller in content.get("sellers", []): + stype = seller.get("seller_type", "").strip().lower() + if stype not in SELLER_TYPES or not seller.get("seller_id"): + continue + + if seller.get("is_passthrough"): + result["passthrough_count"] += 1 + + if seller.get("is_confidential"): + result["confidential_count"] += 1 + + if seller.get("domain"): + 
domain = seller["domain"].strip().lower() + result["seller_types"][stype]["domains"].add(domain) + result["seller_types"][stype]["seller_count"] += 1 + + for stype in result["seller_types"].values(): + stype["domain_count"] = len(stype["domains"]) + stype["domains"] = list(stype["domains"]) + + return result + + +if __name__ == "__main__": + url = input("Enter the sellers.json URL: ").strip() + parsed_data = parse_sellers_json(url) + + with open("sellers.json", "w", encoding="utf-8") as f: + f.write(json.dumps(parsed_data, indent=4)) diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index 72896a9080a..e32047eecb3 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -2,117 +2,72 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, + "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { "cell_type": "code", - "source": [ - "# Almanac\n", - "CHAPTER = \"privacy\"\n", - "YEAR = \"2024\"\n", - "\n", - "# BigQuery\n", - "GCP_PROJECT = \"httparchive\"\n", - "\n", - "# Git\n", - "BRANCH_NAME = \"{chapter}-sql-{year}\".format(\n", - " chapter=CHAPTER,\n", - " year=YEAR\n", - ")\n", - "\n", - "# SQL folder\n", - "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", - " year=YEAR,\n", - " chapter=CHAPTER\n", - ")\n", - "\n", - "# Google Sheets\n", - "spreadsheet_name = \"{chapter} (Web Almanac {year})\".format(\n", - " chapter=CHAPTER.capitalize(),\n", - " year=YEAR\n", - ")\n", - "\n", - "# Set to `None` to create new one or an existing spreadsheet URL.\n", - "existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'" - ], + "execution_count": null, "metadata": { - "id": "U37785Bxt5tE" + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OVkCxlRQH6Yt", + "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" }, - "execution_count": 1, - "outputs": [] + 
"outputs": [], + "source": [ + "# @title Download repo\n", + "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OVkCxlRQH6Yt", - "outputId": "9fb31f97-8541-461a-991f-e7932da56101" + "cellView": "form", + "id": "U37785Bxt5tE" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'almanac.httparchive.org'...\n", - "remote: Enumerating objects: 43942, done.\u001b[K\n", - "remote: Counting objects: 100% (5935/5935), done.\u001b[K\n", - "remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n", - "remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n", - "Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n", - "Resolving deltas: 100% (29622/29622), done.\n", - "Updating files: 100% (5472/5472), done.\n" - ] - } - ], + "outputs": [], "source": [ - "# Download repo\n", - "!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git" + "# @title Configure the chapter to process\n", + "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", + "almanac_year = 2024 #@param {type: \"integer\"}\n", + "chapter_name = 'privacy' #@param {type: \"string\"}\n", + "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { + "cellView": "form", "colab": { "base_uri": "https://localhost:8080/" }, "id": "UzhgG5xvbQ1E", - "outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea" + "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Already on 'privacy-sql-2024'\n", - "Your branch is up to date 
with 'origin/privacy-sql-2024'.\n", - "Already up to date.\n" - ] - } - ], + "outputs": [], "source": [ - "# Update local branch\n", - "!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull" + "# @title Update chapter branch\n", + "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", + "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { + "cellView": "form", "id": "45dBifFPJAtO" }, "outputs": [], "source": [ - "# Authenticate\n", + "# @title Authenticate\n", "import google.auth\n", "import os\n", "from google.colab import auth\n", @@ -121,97 +76,125 @@ "import gspread\n", "from gspread_dataframe import set_with_dataframe\n", "\n", + "\n", "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", "auth.authenticate_user()\n", "credentials, project = google.auth.default()\n", "client = bigquery.Client()\n", - "gc = gspread.authorize(credentials)" + "gc = gspread.authorize(credentials)\n", + "\n", + "try:\n", + " ss = gc.open_by_url(spreadsheet_url)\n", + "except:\n", + " print('Spreadsheet not found')" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { + "cellView": "form", "colab": { "base_uri": "https://localhost:8080/" }, + "collapsed": true, "id": "nblNil985Tjt", - "outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8" + "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n" - ] - } - ], + "outputs": [], "source": [ + "# @title Upload query results\n", + "\n", "import glob\n", "import re\n", + "from tabulate import tabulate\n", + "from IPython.display import clear_output\n", "\n", - "# Build Sheets\n", - "try:\n", - " ss = gc.open_by_url(existing_spreadsheet_url)\n", - " print('Using existing 
spreadsheet:', ss.url)\n", - "except:\n", - " ss = gc.create(spreadsheet_name)\n", - " print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n", - "existing_sheets = [s.title for s in ss.worksheets()]\n", - "\n", - "file_match_include = r\"number_of_websites_with_features_based_on_string_search.sql\"+\"|\"+ \\\n", - " \"number_of_websites_with_origin_trial_from_token.sql\"\n", "\n", - "file_match_exclude = r\"^$\"\n", + "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "dry_run = True # @param {type: \"boolean\"}\n", + "overwrite_sheets = True # @param {type: \"boolean\"}\n", + "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", "\n", - "overwrite = False\n", - "dry_run = True\n", - "tb_processed_limit = 0.1\n", + "filename_include_regexp = r'{}'.format(filename_match)\n", + "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", + "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", + " year=almanac_year,\n", + " chapter=chapter_name.lower()\n", + ")\n", + "existing_sheets = [s.title for s in ss.worksheets()]\n", "\n", - "# Find matching .sql queries in folder and save to google sheet.\n", - "for filepath in glob.iglob(folder):\n", + "# Print formatted logs\n", + "queries_processed_log = []\n", + "def print_logs_table(log=None, append=True):\n", + " if log:\n", + " queries_processed_log.append(log)\n", + " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", + " if not append:\n", + " del queries_processed_log[-1]\n", + " clear_output(wait=True)\n", + " print(table)\n", + "\n", + "# Find matching SQL queries 
and save results to Google Sheets.\n", + "for filepath in sorted(glob.iglob(folder)):\n", " filename = filepath.split('/')[-1]\n", - " sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n", "\n", - " if re.search(file_match_include, filename) and not re.search(file_match_exclude, filename):\n", + " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", + "\n", + " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", "\n", - " print('Processing:', sheet_title)\n", " with open(filepath) as f:\n", " query = f.read()\n", "\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(dry_run = True)\n", - " )\n", - "\n", - " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", - " print(f\"Total Tb billed:{tb_processed:9.3f}\")\n", - "\n", - " if dry_run:\n", + " try:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(dry_run = True)\n", + " )\n", + " except Exception as e:\n", + " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", " continue\n", "\n", - " if tb_processed > tb_processed_limit:\n", - " print('Data volume hit the limit. Skipping:', sheet_title)\n", - " continue\n", + " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", + " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", "\n", " if sheet_title in existing_sheets:\n", - " if not overwrite:\n", - " print('Overwrite is False. 
Skipping:', sheet_title)\n", + " if overwrite_sheets:\n", + " st = ss.worksheet(sheet_title)\n", + " else:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", " continue\n", "\n", + " if dry_run:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", + " continue\n", + "\n", + " try:\n", + " if maximum_tb_billed:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(\n", + " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", + " )\n", + " )\n", " else:\n", - " st = ss.worksheet(sheet_title)\n", - " ss.del_worksheet(st)\n", + " response = client.query(query)\n", + "\n", + " df = response.to_dataframe()\n", + " if ('st' not in locals() or st.title != sheet_title):\n", + " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", + " set_with_dataframe(st, df, resize=False)\n", "\n", - " df = client.query(query).to_dataframe()\n", - " rows, cols = df.shape\n", + " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", + " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", "\n", - " st = ss.add_worksheet(title = sheet_title, rows = rows, cols = cols)\n", - " set_with_dataframe(st, df)\n", + " except Exception as e:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", + " continue\n", "\n", " else:\n", - " print('Not Matched. Skipping:', sheet_title)" + " print_logs_table([filename, None, None, 'Filename mismatch'])" ] } ], @@ -224,7 +207,8 @@ "name": "python3" }, "language_info": { - "name": "python" + "name": "python", + "version": "3.12.4" } }, "nbformat": 4, diff --git a/sql/util/bq_writer.py b/sql/util/bq_writer.py new file mode 100644 index 00000000000..3355dcfcb94 --- /dev/null +++ b/sql/util/bq_writer.py @@ -0,0 +1,28 @@ +""" +This script writes a pandas DataFrame to BigQuery. 
+""" + +from google.cloud import bigquery # pylint: disable=import-error + + +def write_to_bq(df, table_id, schema, write_disposition="WRITE_APPEND"): + """ + Writes a DataFrame to BigQuery. + + Args: + df (pandas.DataFrame): The data to load into BigQuery. + table_id (str): The destination table in BigQuery (e.g., 'project.dataset.table'). + schema (list): The schema for the BigQuery table. + write_disposition (str): Write mode, default is WRITE_APPEND. + """ + client = bigquery.Client() + + job_config = bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.CSV, + write_disposition=write_disposition, + schema=schema, + ) + + # Load data into BigQuery + job = client.load_table_from_dataframe(df, table_id, job_config=job_config) + job.result() # Waits for the job to complete diff --git a/sql/util/breaches.py b/sql/util/breaches.py deleted file mode 100644 index 3d93a240532..00000000000 --- a/sql/util/breaches.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -1. Download breaches.json -2. Create a new table almanac.breaches_2022 via upload, with autodetected schema -3. 
Append the output of this query to almanac.breaches: -SELECT - DATE('2022-06-01') AS date, - Name, - Title, - Domain, - BreachDate, - AddedDate, - ModifiedDate, - PwnCount, - Description, - LogoPath, - IsVerified, - IsFabricated, - IsSensitive, - IsRetired, - IsSpamList, - TO_JSON_STRING(DataClasses) AS DataClasses -FROM - `httparchive.almanac.breaches_2022` -""" - -import json -import requests # pylint: disable=import-error - -year = 2022 - -with open(f"../{year}/privacy/breaches.jsonl", "w") as outfile: - breaches = json.loads( - requests.get("https://haveibeenpwned.com/api/v2/breaches").content - ) - for entry in breaches: - outfile.write(json.dumps(entry) + "\n") diff --git a/sql/util/functions.sql b/sql/util/functions.sql index b9f861b3683..becc0ee67f8 100644 --- a/sql/util/functions.sql +++ b/sql/util/functions.sql @@ -8,3 +8,27 @@ try { return null; } """; + +# Origin Trials +CREATE OR REPLACE FUNCTION `httparchive.fn.DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( + SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) +); + +CREATE OR REPLACE FUNCTION `httparchive.fn.PARSE_ORIGIN_TRIAL`(token STRING) +RETURNS STRUCT< + token STRING, + origin STRING, + feature STRING, + expiry TIMESTAMP, + is_subdomain BOOL, + is_third_party BOOL +> AS ( + STRUCT( + DECODE_ORIGIN_TRIAL(token) AS token, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party + ) +); diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py new file mode 100644 index 00000000000..1ea1bb7f75f --- /dev/null +++ b/sql/util/haveibeenpwned.py @@ -0,0 +1,59 @@ +""" +This module retrieves data from the "haveibeenpwned" 
API and loads it into a BigQuery table. +""" + +import json +from datetime import datetime as DateTime + +import pandas +import requests # pylint: disable=import-error +from bq_writer import write_to_bq, bigquery + + +# Retrieve data from the "haveibeenpwned" API +breaches = json.loads( + requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10).content +) +df = pandas.DataFrame(breaches) + +year = DateTime.now().year +df["date"] = DateTime(year, 6, 1).date() +df["Name"] = df["Name"].astype(str) +df["Title"] = df["Title"].astype(str) +df["Domain"] = df["Domain"].astype(str) +df["BreachDate"] = pandas.to_datetime( + df["BreachDate"], format="%Y-%m-%d", errors="coerce" +).dt.date +df["AddedDate"] = pandas.to_datetime( + df["AddedDate"], format="%Y-%m-%d", errors="coerce" +).dt.date +df["ModifiedDate"] = pandas.to_datetime( + df["ModifiedDate"], format="%Y-%m-%d", errors="coerce" +).dt.date +df["Description"] = df["Description"].astype(str) +df["LogoPath"] = df["LogoPath"].astype(str) +df["DataClasses"] = df["DataClasses"].apply(json.dumps) + +# Append to httparchive.almanac.breaches + + +schema = [ + bigquery.SchemaField("date", "DATE"), + bigquery.SchemaField("Name", "STRING"), + bigquery.SchemaField("Title", "STRING"), + bigquery.SchemaField("Domain", "STRING"), + bigquery.SchemaField("BreachDate", "DATE"), + bigquery.SchemaField("AddedDate", "DATE"), + bigquery.SchemaField("ModifiedDate", "DATE"), + bigquery.SchemaField("PwnCount", "INTEGER"), + bigquery.SchemaField("Description", "STRING"), + bigquery.SchemaField("LogoPath", "STRING"), + bigquery.SchemaField("IsVerified", "BOOLEAN"), + bigquery.SchemaField("IsFabricated", "BOOLEAN"), + bigquery.SchemaField("IsSensitive", "BOOLEAN"), + bigquery.SchemaField("IsRetired", "BOOLEAN"), + bigquery.SchemaField("IsSpamList", "BOOLEAN"), + bigquery.SchemaField("DataClasses", "STRING"), +] + +write_to_bq(df, "httparchive.almanac.breaches", schema) diff --git a/sql/util/populate_easylist_adserver.py 
b/sql/util/populate_easylist_adserver.py new file mode 100644 index 00000000000..e048d16208a --- /dev/null +++ b/sql/util/populate_easylist_adserver.py @@ -0,0 +1,42 @@ +# pylint: disable=import-error +import requests +import pandas as pd +from bq_writer import bigquery, write_to_bq + + +def extract_domains(content): + domains_set = set() + for line in content.splitlines(): + + # Skip comments and regexes + if line.startswith("!") or line.startswith("/"): + continue + + # Remove the '||' prefix and '^.*' suffix + domain = line.strip().lstrip("||").split('^')[0] + + # Ensure the domain is not empty + if domain: + domains_set.add(domain) + + return domains_set + + +# URL to the text file containing the regex patterns +URL = "https://raw.githubusercontent.com/easylist/easylist/master/easylist/easylist_adservers.txt" + +# Download the file +response = requests.get(URL) + +# Extract domains +domains = extract_domains(response.text) + +# Create a DataFrame from the list of domains +df = pd.DataFrame(domains, columns=["Domain"]).sort_values("Domain").reset_index(drop=True) + +write_to_bq( + df, + "httparchive.almanac.easylist_adservers", + [bigquery.SchemaField("Domain", "STRING")], + "WRITE_TRUNCATE", +) diff --git a/sql/util/trackers_whotracksme.py b/sql/util/trackers_whotracksme.py deleted file mode 100644 index 2d882071213..00000000000 --- a/sql/util/trackers_whotracksme.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Retrieve and extract trackers as identified by WhoTracks.me. -https://github.com/ghostery/whotracks.me/blob/master/blog/generating_adblocker_filters.md#loading-the-data - -1. Download trackers.csv -2. Upload to almanac.trackers_2022 temp table -3. 
Append to almanac.whotracksme with this query: -SELECT - * -FROM - `httparchive.almanac.trackers_2022` -""" - -import requests # pylint: disable=import-error -import sqlite3 -import pandas - -year = "2022" - -trackerdb_sql = requests.get( - "https://raw.githubusercontent.com/whotracksme/whotracks.me/master/whotracksme/data/assets/trackerdb.sql" -).text - -connection = sqlite3.connect(":memory:") -connection.executescript(trackerdb_sql) - -sql_query = """ - SELECT - '{year}-06-01' AS date, - categories.name as category, - tracker, - domain - FROM - tracker_domains - INNER JOIN - trackers - ON trackers.id = tracker_domains.tracker - INNER JOIN - categories - ON categories.id = trackers.category_id; -""".format( - year=year -) - -pandas.read_sql(sql_query, connection).to_csv( - f"../{year}/privacy/trackers.csv", index=False -) diff --git a/sql/util/whotracksme_trackers.py b/sql/util/whotracksme_trackers.py new file mode 100644 index 00000000000..ec68f922e17 --- /dev/null +++ b/sql/util/whotracksme_trackers.py @@ -0,0 +1,47 @@ +""" +This module retrieves and extracts trackers as identified by WhoTracks.me +and appends them to the httparchive.almanac.whotracksme BigQuery table. +""" +# pylint: disable=import-error +import sqlite3 + +import pandas +import requests +from bq_writer import write_to_bq, bigquery + +# Retrieve and extract trackers as identified by WhoTracks.me. 
+# https://github.com/ghostery/whotracks.me/blob/master/blog/generating_adblocker_filters.md#loading-the-data +tracker_db = requests.get( + "https://raw.githubusercontent.com/whotracksme/whotracks.me/master/whotracksme/data/assets/trackerdb.sql", + timeout=10, +).text + +TRACKERS_QUERY = """ + SELECT + '2024-06-01' AS date, + categories.name as category, + tracker, + domain + FROM + tracker_domains + INNER JOIN + trackers + ON trackers.id = tracker_domains.tracker + INNER JOIN + categories + ON categories.id = trackers.category_id; +""" +connection = sqlite3.connect(":memory:") +connection.executescript(tracker_db) +trackers_df = pandas.read_sql(TRACKERS_QUERY, connection) +connection.close() + +# Append to almanac.whotracksme BQ table +schema = [ + bigquery.SchemaField("date", "DATE"), + bigquery.SchemaField("category", "STRING"), + bigquery.SchemaField("tracker", "STRING"), + bigquery.SchemaField("domain", "STRING"), +] + +write_to_bq(trackers_df, "httparchive.almanac.whotracksme", schema) diff --git a/src/requirements.txt b/src/requirements.txt index ffb157499a8..978ebb91caf 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -5,3 +5,8 @@ pytest==8.2.2 pytest-watch==4.2.0 pytest-cov==5.0.0 sqlfluff==1.4.5 + +# sql/util/* dependencies +pandas==2.2.2 +google-cloud-bigquery==3.25.0 +requests==2.32.3