From cf3354421af27b9630bbabf2b8695af749c8a492 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 12 Jan 2025 15:40:29 +0100 Subject: [PATCH 01/24] versions --- .../output/core_web_vitals/technologies.js | 145 ++++++++++++------ 1 file changed, 102 insertions(+), 43 deletions(-) diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index d307929a..6130d22d 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -31,19 +31,72 @@ CREATE TEMP FUNCTION IS_NON_ZERO( good + needs_improvement + poor > 0 ); `).query(ctx => ` -WITH geo_summary AS ( +WITH pages AS ( SELECT - CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\\d{4})(\\d{2})', r'\\1-\\2-01') AS DATE) AS date, - * EXCEPT (country_code), - \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo + client, + page, + root_page AS origin, + technologies, + summary, + lighthouse + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), geo_summary AS ( + SELECT + \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')} WHERE yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND device IN ('desktop', 'phone') -UNION ALL + + UNION ALL + SELECT - * EXCEPT (yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin), - 'ALL' AS geo + 'ALL' AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')} WHERE date = '${pastMonth}' AND @@ -61,7 +114,7 @@ crux AS ( WHEN 10000 THEN 'Top 10k' WHEN 1000 THEN 'Top 1k' END AS rank, - CONCAT(origin, '/') AS root_page, + CONCAT(origin, '/') AS origin, IF(device = 'desktop', 'desktop', 'mobile') AS client, # CWV @@ -94,56 +147,61 @@ crux AS ( technologies AS ( SELECT - technology.technology, + tech.technology, + REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version, client, page - FROM ${ctx.ref('crawl', 'pages')}, - UNNEST(technologies) AS technology - WHERE - date = '${pastMonth}' - ${constants.devRankFilter} AND - technology.technology IS NOT NULL AND - technology.technology != '' -UNION ALL + FROM pages, + UNNEST(technologies) AS tech, + UNNEST(tech.info) AS version + WHERE REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL + + UNION ALL + + SELECT + tech.technology, + 'ALL' AS version, + client, + page + FROM pages, + UNNEST(technologies) AS tech + + UNION ALL + SELECT 'ALL' AS technology, + 'ALL' AS version, client, page - FROM ${ctx.ref('crawl', 'pages')} - WHERE - date = '${pastMonth}' - ${constants.devRankFilter} + FROM pages ), categories AS ( SELECT technology.technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category - FROM ${ctx.ref('crawl', 'pages')}, + FROM pages, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category - WHERE - date = '${pastMonth}' - ${constants.devRankFilter} GROUP BY technology -UNION ALL + + UNION ALL + SELECT 'ALL' AS technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category - FROM ${ctx.ref('crawl', 'pages')}, + FROM pages, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${pastMonth}' AND client = 'mobile' - ${constants.devRankFilter} ), -summary_stats AS ( +lab_metrics AS ( SELECT client, page, - root_page AS root_page, + origin, SAFE.INT64(summary.bytesTotal) AS bytesTotal, SAFE.INT64(summary.bytesJS) AS bytesJS, SAFE.INT64(summary.bytesImg) AS bytesImg, @@ -152,17 +210,15 @@ summary_stats AS ( SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance, SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa, SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo - FROM ${ctx.ref('crawl', 'pages')} - WHERE - date = '${pastMonth}' - ${constants.devRankFilter} + FROM pages ), lab_data AS ( SELECT client, - root_page, + origin, technology, + version, ANY_VALUE(category) AS category, AVG(bytesTotal) AS bytesTotal, AVG(bytesJS) AS bytesJS, @@ -172,15 +228,16 @@ lab_data AS ( AVG(performance) AS performance, AVG(pwa) AS pwa, AVG(seo) AS seo - FROM summary_stats - JOIN technologies + FROM lab_metrics + INNER JOIN technologies USING (client, page) - JOIN categories + INNER JOIN categories USING (technology) GROUP BY client, - root_page, - technology + origin, + technology, + version ) SELECT @@ -189,8 +246,9 @@ SELECT rank, ANY_VALUE(category) AS category, technology AS app, + version, client, - COUNT(0) AS origins, + COUNT(DISTINCT origin) AS origins, # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, @@ -227,9 +285,10 @@ SELECT FROM lab_data INNER JOIN crux -USING (client, root_page) +USING (client, origin) GROUP BY app, + version, geo, rank, client From 547f63ebd238c3b8516ea4272795cc2315c376be Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:00:23 +0100 Subject: [PATCH 02/24] tech filter --- definitions/output/core_web_vitals/technologies.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 6130d22d..c8f1e35d 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -154,7 +154,10 @@ technologies AS ( FROM pages, UNNEST(technologies) AS tech, UNNEST(tech.info) AS version - WHERE REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL + WHERE + technology.technology IS NOT NULL AND + technology.technology != '' AND + REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL UNION ALL @@ -165,6 +168,9 @@ technologies AS ( page FROM pages, UNNEST(technologies) AS tech + WHERE + technology.technology IS NOT NULL AND + technology.technology != '' UNION ALL From e8580d0604751a262af89eb6172d21b9d722e86a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 19:36:54 +0100 Subject: [PATCH 03/24] new table with versions --- .../output/core_web_vitals/technologies.js | 145 +++------ .../output/reports/cwv_tech_adoption.js | 2 +- .../output/reports/cwv_tech_categories.js | 2 +- .../reports/cwv_tech_core_web_vitals.js | 2 +- .../output/reports/cwv_tech_lighthouse.js | 2 +- .../output/reports/cwv_tech_page_weight.js | 2 +- .../output/reports/cwv_tech_technologies.js | 2 +- definitions/output/reports/tech_crux.js | 301 ++++++++++++++++++ infra/dataform-trigger/index.js | 2 +- 9 files changed, 348 insertions(+), 112 deletions(-) create mode 100644 definitions/output/reports/tech_crux.js diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 421f38c1..98254191 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -31,72 +31,19 @@ CREATE TEMP FUNCTION IS_NON_ZERO( good + needs_improvement + poor > 0 ); `).query(ctx => ` -WITH pages AS ( +WITH geo_summary AS ( SELECT - client, - page, - root_page AS origin, - technologies, - summary, - lighthouse - FROM ${ctx.ref('crawl', 'pages')} - WHERE - date = '${pastMonth}' - ${constants.devRankFilter} -), geo_summary AS ( - SELECT - \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo, - rank, - device, - origin, - avg_fcp, - avg_fid, - avg_inp, - avg_lcp, - avg_ttfb, - fast_fcp, - fast_fid, - fast_inp, - fast_lcp, - fast_ttfb, - slow_fcp, - slow_fid, - slow_inp, - slow_lcp, - slow_ttfb, - small_cls, - medium_cls, - large_cls + CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\\d{4})(\\d{2})', r'\\1-\\2-01') AS DATE) AS date, + * EXCEPT (country_code), + \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')} WHERE yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND device IN ('desktop', 'phone') - - UNION ALL - +UNION ALL SELECT - 'ALL' AS geo, - rank, - device, - origin, - avg_fcp, - avg_fid, - avg_inp, - avg_lcp, - avg_ttfb, - fast_fcp, - fast_fid, - fast_inp, - fast_lcp, - fast_ttfb, - slow_fcp, - slow_fid, - slow_inp, - slow_lcp, - slow_ttfb, - small_cls, - medium_cls, - large_cls + * EXCEPT (yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin), + 'ALL' AS geo FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')} WHERE date = '${pastMonth}' AND @@ -114,7 +61,7 @@ crux AS ( WHEN 10000 THEN 'Top 10k' WHEN 1000 THEN 'Top 1k' END AS rank, - CONCAT(origin, '/') AS origin, + CONCAT(origin, '/') AS root_page, IF(device = 'desktop', 'desktop', 'mobile') AS client, # CWV @@ -147,67 +94,56 @@ crux AS ( technologies AS ( SELECT - tech.technology, - REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version, - client, - page - FROM pages, - UNNEST(technologies) AS tech, - UNNEST(tech.info) AS version - WHERE - technology.technology IS NOT NULL AND - technology.technology != '' AND - REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL - - UNION ALL - - SELECT - tech.technology, - 'ALL' AS version, + technology.technology, client, page - FROM pages, - UNNEST(technologies) AS tech + FROM ${ctx.ref('crawl', 'pages')}, + UNNEST(technologies) AS technology WHERE + date = '${pastMonth}' + ${constants.devRankFilter} AND technology.technology IS NOT NULL AND technology.technology != '' - - UNION ALL - +UNION ALL SELECT 'ALL' AS technology, - 'ALL' AS version, client, page - FROM pages + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} ), categories AS ( SELECT technology.technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category - FROM pages, + FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} GROUP BY technology - - UNION ALL - +UNION ALL SELECT 'ALL' AS technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category - FROM pages, + FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE + date = '${pastMonth}' AND client = 'mobile' + ${constants.devRankFilter} ), -lab_metrics AS ( +summary_stats AS ( SELECT client, page, - origin, + root_page AS root_page, SAFE.INT64(summary.bytesTotal) AS bytesTotal, SAFE.INT64(summary.bytesJS) AS bytesJS, SAFE.INT64(summary.bytesImg) AS bytesImg, @@ -216,15 +152,17 @@ lab_metrics AS ( SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance, SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa, SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo - FROM pages + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} ), lab_data AS ( SELECT client, - origin, + root_page, technology, - version, ANY_VALUE(category) AS category, AVG(bytesTotal) AS bytesTotal, AVG(bytesJS) AS bytesJS, @@ -234,16 +172,15 @@ lab_data AS ( AVG(performance) AS performance, AVG(pwa) AS pwa, AVG(seo) AS seo - FROM lab_metrics - INNER JOIN technologies + FROM summary_stats + JOIN technologies USING (client, page) - INNER JOIN categories + JOIN categories USING (technology) GROUP BY client, - origin, - technology, - version + root_page, + technology ) SELECT @@ -252,9 +189,8 @@ SELECT rank, ANY_VALUE(category) AS category, technology AS app, - version, client, - COUNT(DISTINCT origin) AS origins, + COUNT(0) AS origins, # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, @@ -291,10 +227,9 @@ SELECT FROM lab_data INNER JOIN crux -USING (client, origin) +USING (client, root_page) GROUP BY app, - version, geo, rank, client diff --git a/definitions/output/reports/cwv_tech_adoption.js b/definitions/output/reports/cwv_tech_adoption.js index 65251dd5..a1b77420 100644 --- a/definitions/output/reports/cwv_tech_adoption.js +++ b/definitions/output/reports/cwv_tech_adoption.js @@ -8,7 +8,7 @@ publish('cwv_tech_adoption', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['crux_ready', 'tech_report'] + tags: ['tech_report'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${pastMonth}'; diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 2522f35b..61a9e229 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth) publish('cwv_tech_categories', { schema: 'reports', type: 'table', - tags: ['crux_ready', 'tech_report'] + tags: ['c'] }).query(ctx => ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */ WITH pages AS ( diff --git a/definitions/output/reports/cwv_tech_core_web_vitals.js b/definitions/output/reports/cwv_tech_core_web_vitals.js index 772f432a..06cae4be 100644 --- a/definitions/output/reports/cwv_tech_core_web_vitals.js +++ b/definitions/output/reports/cwv_tech_core_web_vitals.js @@ -8,7 +8,7 @@ publish('cwv_tech_core_web_vitals', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['crux_ready', 'tech_report'] + tags: ['tech_report'] }).preOps(ctx => ` CREATE TEMPORARY FUNCTION GET_VITALS( records ARRAY ` CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE( records ARRAY ` CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT( records ARRAY ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */ WITH pages AS ( diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js new file mode 100644 index 00000000..ce4f6918 --- /dev/null +++ b/definitions/output/reports/tech_crux.js @@ -0,0 +1,301 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_crux', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['geo', 'app', 'rank', 'client'], + requirePartitionFilter: true + }, + tags: ['tech_report'], + dependOnDependencyAssertions: true +}).preOps(ctx => ` +--DELETE FROM ${ctx.self()} +--WHERE date = '${pastMonth}'; + +CREATE TEMP FUNCTION IS_GOOD( + good FLOAT64, + needs_improvement FLOAT64, + poor FLOAT64 +) RETURNS BOOL AS ( + SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75 +); + +CREATE TEMP FUNCTION IS_NON_ZERO( + good FLOAT64, + needs_improvement FLOAT64, + poor FLOAT64 +) RETURNS BOOL AS ( + good + needs_improvement + poor > 0 +); +`).query(ctx => ` +WITH pages AS ( + SELECT + client, + page, + root_page AS origin, + technologies, + summary, + lighthouse + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), geo_summary AS ( + SELECT + \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls + FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')} + WHERE + yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND + device IN ('desktop', 'phone') + + UNION ALL + + SELECT + 'ALL' AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls + FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')} + WHERE + date = '${pastMonth}' AND + device IN ('desktop', 'phone') +), + +crux AS ( + SELECT + geo, + CASE _rank + WHEN 100000000 THEN 'ALL' + WHEN 10000000 THEN 'Top 10M' + WHEN 1000000 THEN 'Top 1M' + WHEN 100000 THEN 'Top 100k' + WHEN 10000 THEN 'Top 10k' + WHEN 1000 THEN 'Top 1k' + END AS rank, + CONCAT(origin, '/') AS origin, + IF(device = 'desktop', 'desktop', 'mobile') AS client, + + # CWV + IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid, + IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid, + IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls, + IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, + + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024, + + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023, + + # WV + IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, + IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp, + IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb, + IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb, + IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp, + IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp + FROM geo_summary, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank + WHERE rank <= _rank +), + +technologies AS ( + SELECT + tech.technology, + REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version, + client, + page + FROM pages, + UNNEST(technologies) AS tech, + UNNEST(tech.info) AS version + WHERE + technology.technology IS NOT NULL AND + technology.technology != '' AND + REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL + + UNION ALL + + SELECT + tech.technology, + 'ALL' AS version, + client, + page + FROM pages, + UNNEST(technologies) AS tech + WHERE + technology.technology IS NOT NULL AND + technology.technology != '' + + UNION ALL + + SELECT + 'ALL' AS technology, + 'ALL' AS version, + client, + page + FROM pages +), + +categories AS ( + SELECT + technology.technology, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category + FROM pages, + UNNEST(technologies) AS technology, + UNNEST(technology.categories) AS category + GROUP BY technology + + UNION ALL + + SELECT + 'ALL' AS technology, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category + FROM pages, + UNNEST(technologies) AS technology, + UNNEST(technology.categories) AS category + WHERE + client = 'mobile' +), + +lab_metrics AS ( + SELECT + client, + page, + origin, + SAFE.INT64(summary.bytesTotal) AS bytesTotal, + SAFE.INT64(summary.bytesJS) AS bytesJS, + SAFE.INT64(summary.bytesImg) AS bytesImg, + SAFE.FLOAT64(lighthouse.categories.accessibility.score) AS accessibility, + SAFE.FLOAT64(lighthouse.categories['best-practices'].score) AS best_practices, + SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance, + SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa, + SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo + FROM pages +), + +lab_data AS ( + SELECT + client, + origin, + technology, + version, + ANY_VALUE(category) AS category, + AVG(bytesTotal) AS bytesTotal, + AVG(bytesJS) AS bytesJS, + AVG(bytesImg) AS bytesImg, + AVG(accessibility) AS accessibility, + AVG(best_practices) AS best_practices, + AVG(performance) AS performance, + AVG(pwa) AS pwa, + AVG(seo) AS seo + FROM lab_metrics + INNER JOIN technologies + USING (client, page) + INNER JOIN categories + USING (technology) + GROUP BY + client, + origin, + technology, + version +) + +SELECT + DATE('${pastMonth}') AS date, + geo, + rank, + ANY_VALUE(category) AS category, + technology AS app, + version, + client, + COUNT(DISTINCT origin) AS origins, + + # CrUX data + COUNTIF(good_fid) AS origins_with_good_fid, + COUNTIF(good_cls) AS origins_with_good_cls, + COUNTIF(good_lcp) AS origins_with_good_lcp, + COUNTIF(good_fcp) AS origins_with_good_fcp, + COUNTIF(good_ttfb) AS origins_with_good_ttfb, + COUNTIF(good_inp) AS origins_with_good_inp, + COUNTIF(any_fid) AS origins_with_any_fid, + COUNTIF(any_cls) AS origins_with_any_cls, + COUNTIF(any_lcp) AS origins_with_any_lcp, + COUNTIF(any_fcp) AS origins_with_any_fcp, + COUNTIF(any_ttfb) AS origins_with_any_ttfb, + COUNTIF(any_inp) AS origins_with_any_inp, + COUNTIF(good_cwv_2024) AS origins_with_good_cwv, + COUNTIF(good_cwv_2024) AS origins_with_good_cwv_2024, + COUNTIF(good_cwv_2023) AS origins_with_good_cwv_2023, + COUNTIF(any_lcp AND any_cls) AS origins_eligible_for_cwv, + SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, + SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024, + SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023, + + # Lighthouse data + SAFE_CAST(APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_accessibility, + SAFE_CAST(APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_best_practices, + SAFE_CAST(APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_performance, + SAFE_CAST(APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_pwa, + SAFE_CAST(APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_seo, + + # Page weight stats + SAFE_CAST(APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS INT64) AS median_bytes_total, + SAFE_CAST(APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS INT64) AS median_bytes_js, + SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image + +FROM lab_data +INNER JOIN crux +USING (client, origin) +GROUP BY + app, + version, + geo, + rank, + client +`) diff --git a/infra/dataform-trigger/index.js b/infra/dataform-trigger/index.js index 345c623b..2cf05645 100644 --- a/infra/dataform-trigger/index.js +++ b/infra/dataform-trigger/index.js @@ -31,7 +31,7 @@ FROM crux, report; action: 'runDataformRepo', actionArgs: { repoName: 'crawl-data', - tags: ['crux_ready'] + tags: ['tech_report'] } }, crawl_complete: { From bfac4f6c845c26fdb7bfa96e4675a20d32e6438d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 19:37:55 +0100 Subject: [PATCH 04/24] typo --- definitions/output/reports/cwv_tech_categories.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 61a9e229..56534672 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth) publish('cwv_tech_categories', { schema: 'reports', type: 'table', - tags: ['c'] + tags: ['tech_report'] }).query(ctx => ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */ WITH pages AS ( From ac2f597a758ddaa92ee1544cdede2f80239ae540 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 20:25:12 +0100 Subject: [PATCH 05/24] versions table --- definitions/output/reports/tech_crux.js | 21 +++---- .../output/reports/tech_report_versions.js | 62 +++++++++++++++++++ 2 files changed, 71 insertions(+), 12 deletions(-) create mode 100644 definitions/output/reports/tech_report_versions.js diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index ce4f6918..2eb70b39 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -12,8 +12,8 @@ publish('tech_crux', { tags: ['tech_report'], dependOnDependencyAssertions: true }).preOps(ctx => ` ---DELETE FROM ${ctx.self()} ---WHERE date = '${pastMonth}'; +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; CREATE TEMP FUNCTION IS_GOOD( good FLOAT64, @@ -155,8 +155,8 @@ technologies AS ( UNNEST(technologies) AS tech, UNNEST(tech.info) AS version WHERE - technology.technology IS NOT NULL AND - technology.technology != '' AND + tech.technology IS NOT NULL AND + tech.technology != '' AND REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL UNION ALL @@ -168,9 +168,6 @@ technologies AS ( page FROM pages, UNNEST(technologies) AS tech - WHERE - technology.technology IS NOT NULL AND - technology.technology != '' UNION ALL @@ -184,11 +181,11 @@ technologies AS ( categories AS ( SELECT - technology.technology, + tech.technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM pages, - UNNEST(technologies) AS technology, - UNNEST(technology.categories) AS category + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category GROUP BY technology UNION ALL @@ -197,8 +194,8 @@ categories AS ( 'ALL' AS technology, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM pages, - UNNEST(technologies) AS technology, - UNNEST(technology.categories) AS category + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category WHERE client = 'mobile' ), diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js new file mode 100644 index 00000000..f129305b --- /dev/null +++ b/definitions/output/reports/tech_report_versions.js @@ -0,0 +1,62 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_versions', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "report_cwv_tech_complete", "name": "versions", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + tech.technology + REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version + FROM ${ctx.ref('crawl', 'pages')}, + UNNEST(technologies) AS tech + LEFT JOIN tech.info AS version + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), + +version_origins AS ( + SELECT + client, + technology, + version, + COUNT(DISTINCT root_page) AS origins + FROM pages + WHERE version IS NOT NULL + GROUP BY + client, + technology +), + +total_origins AS ( + SELECT + client, + technology, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY + client, + technology +) + +SELECT + client, + technology, + version, + origins +FROM version_origins + +UNION ALL + +SELECT + client, + technology, + 'ALL' AS version, + origins +FROM total_origins +`) From bdd46b8a6fb9b51f5ca824f8ea01234c8f84cfc4 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 20:59:45 +0100 Subject: [PATCH 06/24] fix --- definitions/output/reports/tech_report_versions.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js index f129305b..da5baf4b 100644 --- a/definitions/output/reports/tech_report_versions.js +++ b/definitions/output/reports/tech_report_versions.js @@ -10,14 +10,15 @@ WITH pages AS ( SELECT DISTINCT client, root_page, - tech.technology + tech.technology, REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version - FROM ${ctx.ref('crawl', 'pages')}, - UNNEST(technologies) AS tech + FROM ${ctx.ref('crawl', 'pages')} AS pages + INNER JOIN pages.technologies AS tech LEFT JOIN tech.info AS version WHERE date = '${pastMonth}' - ${constants.devRankFilter} + ${constants.devRankFilter} AND + tech.technology IS NOT NULL ), version_origins AS ( @@ -30,7 +31,8 @@ version_origins AS ( WHERE version IS NOT NULL GROUP BY client, - technology + technology, + version ), total_origins AS ( From 23864b97848275dfe706f3e05cd254c353981292 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 21:00:07 +0100 Subject: [PATCH 07/24] no retries --- infra/tf/bigquery_export/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/infra/tf/bigquery_export/main.tf b/infra/tf/bigquery_export/main.tf index 13465d7b..95e4f26a 100644 --- a/infra/tf/bigquery_export/main.tf +++ b/infra/tf/bigquery_export/main.tf @@ -32,6 +32,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { deletion_protection = false template { + parallelism = 5 template { containers { image = "${var.location}.gcr.io/${var.project}/cloud-run/${var.function_name}:latest" @@ -48,6 +49,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { } timeout = "3600s" service_account = var.function_identity + max_retries = 1 } } } From 4d4245309fe61fe4ab45f0c76747503162586b32 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 21:40:52 +0100 Subject: [PATCH 08/24] tech_report_* tables --- .../output/reports/cwv_tech_adoption.js | 2 +- .../output/reports/cwv_tech_categories.js | 2 +- .../output/reports/cwv_tech_lighthouse.js | 2 +- .../output/reports/cwv_tech_page_weight.js | 2 +- .../output/reports/cwv_tech_technologies.js | 2 +- definitions/output/reports/tech_crux.js | 11 +- .../output/reports/tech_report_adoption.js | 35 ++++++ .../output/reports/tech_report_categories.js | 94 ++++++++++++++++ .../reports/tech_report_core_web_vitals.js | 103 ++++++++++++++++++ .../output/reports/tech_report_lighthouse.js | 80 ++++++++++++++ .../output/reports/tech_report_page_weight.js | 70 ++++++++++++ .../reports/tech_report_technologies.js | 79 ++++++++++++++ .../output/reports/tech_report_versions.js | 2 +- 13 files changed, 472 insertions(+), 12 deletions(-) create mode 100644 definitions/output/reports/tech_report_adoption.js create mode 100644 definitions/output/reports/tech_report_categories.js create mode 100644 definitions/output/reports/tech_report_core_web_vitals.js create mode 100644 definitions/output/reports/tech_report_lighthouse.js create mode 100644 definitions/output/reports/tech_report_page_weight.js create mode 100644 definitions/output/reports/tech_report_technologies.js diff --git a/definitions/output/reports/cwv_tech_adoption.js b/definitions/output/reports/cwv_tech_adoption.js index a1b77420..1b3e40c1 100644 --- a/definitions/output/reports/cwv_tech_adoption.js +++ b/definitions/output/reports/cwv_tech_adoption.js @@ -8,7 +8,7 @@ publish('cwv_tech_adoption', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['tech_report'] + tags: ['crux_ready'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${pastMonth}'; diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 56534672..ac511770 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth) publish('cwv_tech_categories', { schema: 'reports', type: 'table', - tags: ['tech_report'] + tags: ['crux_ready'] }).query(ctx => ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */ WITH pages AS ( diff --git a/definitions/output/reports/cwv_tech_lighthouse.js b/definitions/output/reports/cwv_tech_lighthouse.js index 0fe98cd0..bba7b2b6 100644 --- a/definitions/output/reports/cwv_tech_lighthouse.js +++ b/definitions/output/reports/cwv_tech_lighthouse.js @@ -8,7 +8,7 @@ publish('cwv_tech_lighthouse', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['tech_report'] + tags: ['crux_ready'] }).preOps(ctx => ` CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE( records ARRAY ` CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT( records ARRAY ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */ WITH pages AS ( diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 2eb70b39..3fdd06db 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -246,11 +246,10 @@ lab_data AS ( SELECT DATE('${pastMonth}') AS date, geo, + client, rank, - ANY_VALUE(category) AS category, - technology AS app, + technology, version, - client, COUNT(DISTINCT origin) AS origins, # CrUX data @@ -290,9 +289,9 @@ FROM lab_data INNER JOIN crux USING (client, origin) GROUP BY - app, - version, geo, + client, rank, - client + technology, + version `) diff --git a/definitions/output/reports/tech_report_adoption.js b/definitions/output/reports/tech_report_adoption.js new file mode 100644 index 00000000..2ebdc58b --- /dev/null +++ b/definitions/output/reports/tech_report_adoption.js @@ -0,0 +1,35 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_adoption', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo', 'client'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS adoption +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_categories.js b/definitions/output/reports/tech_report_categories.js new file mode 100644 index 00000000..bce75b9f --- /dev/null +++ b/definitions/output/reports/tech_report_categories.js @@ -0,0 +1,94 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_categories', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "name": "categories", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + technologies + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), + +category_descriptions AS ( + SELECT + name AS category, + description + FROM ${ctx.ref('wappalyzer', 'categories')} +), + +category_stats AS ( + SELECT + category, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins + FROM ( + SELECT + client, + category, + COUNT(DISTINCT root_page) AS origins + FROM pages + INNER JOIN pages.technologies AS tech + INNER JOIN tech.categories AS category + WHERE + category IS NOT NULL + GROUP BY + client, + category + ) + GROUP BY category +), + +technology_stats AS ( + SELECT + technology, + category_obj AS categories, + SUM(origins) AS total_origins + FROM ${ctx.ref('reports', 'tech_report_technologies')} + GROUP BY + technology, + categories +) + +SELECT + category, + description, + origins, + ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies +FROM category_stats +INNER JOIN technology_stats +ON category_stats.category IN UNNEST(technology_stats.categories) +INNER JOIN category_descriptions +USING (category) +GROUP BY + category, + description, + origins + +UNION ALL + +SELECT + 'ALL' AS category, + NULL AS description, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins, + NULL AS technologies +FROM ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client +) +`) diff --git a/definitions/output/reports/tech_report_core_web_vitals.js b/definitions/output/reports/tech_report_core_web_vitals.js new file mode 100644 index 00000000..2d26f90f --- /dev/null +++ b/definitions/output/reports/tech_report_core_web_vitals.js @@ -0,0 +1,103 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_core_web_vitals', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_VITALS( + records ARRAY>) +RETURNS ARRAY, + mobile STRUCT< + good_number INT64, + tested INT64 +>>> +LANGUAGE js AS ''' +const METRIC_MAP = { + overall: ['origins_with_good_cwv', 'origins_eligible_for_cwv'], + LCP: ['origins_with_good_lcp', 'origins_with_any_lcp'], + CLS: ['origins_with_good_cls', 'origins_with_any_cls'], + FID: ['origins_with_good_fid', 'origins_with_any_fid'], + FCP: ['origins_with_good_fcp', 'origins_with_any_fcp'], + TTFB: ['origins_with_good_ttfb', 'origins_with_any_ttfb'], + INP: ['origins_with_good_inp', 'origins_with_any_inp'] +}; + +// Initialize the vitals map. +const vitals = Object.fromEntries( + Object.keys(METRIC_MAP).map(metricName => { + return [metricName, {name: metricName}] +})); + +// Populate each client record. +records.forEach(record => { + Object.entries(METRIC_MAP).forEach( + ([metricName, [good_number, tested]]) => { + vitals[metricName][record.client] = {good_number: record[good_number], tested: record[tested]} +})}) + +return Object.values(vitals) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "core_web_vitals", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_VITALS(ARRAY_AGG(STRUCT( + client, + origins_with_good_fid, + origins_with_good_cls, + origins_with_good_lcp, + origins_with_good_fcp, + origins_with_good_ttfb, + origins_with_good_inp, + origins_with_any_fid, + origins_with_any_cls, + origins_with_any_lcp, + origins_with_any_fcp, + origins_with_any_ttfb, + origins_with_any_inp, + origins_with_good_cwv, + origins_eligible_for_cwv + ))) AS vitals +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_lighthouse.js b/definitions/output/reports/tech_report_lighthouse.js new file mode 100644 index 00000000..504686c8 --- /dev/null +++ b/definitions/output/reports/tech_report_lighthouse.js @@ -0,0 +1,80 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_lighthouse', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE( + records ARRAY>) +RETURNS ARRAY, + mobile STRUCT< + median_score FLOAT64 +>>> +LANGUAGE js AS ''' +const METRIC_MAP = { + accessibility: 'median_lighthouse_score_accessibility', + best_practices: 'median_lighthouse_score_best_practices', + performance: 'median_lighthouse_score_performance', + pwa: 'median_lighthouse_score_pwa', + seo: 'median_lighthouse_score_seo', +} + +// Initialize the Lighthouse map. +const lighthouse = Object.fromEntries(Object.keys(METRIC_MAP).map(metricName => { + return [metricName, {name: metricName}] +})); + +// Populate each client record. +records.forEach(record => { + Object.entries(METRIC_MAP).forEach(([metricName, median_score]) => { + lighthouse[metricName][record.client] = {median_score: record[median_score]} + }); +}); + +return Object.values(lighthouse) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "lighthouse", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_LIGHTHOUSE(ARRAY_AGG(STRUCT( + client, + median_lighthouse_score_accessibility, + median_lighthouse_score_best_practices, + median_lighthouse_score_performance, + median_lighthouse_score_pwa, + median_lighthouse_score_seo + ))) AS lighthouse +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_page_weight.js b/definitions/output/reports/tech_report_page_weight.js new file mode 100644 index 00000000..31521d51 --- /dev/null +++ b/definitions/output/reports/tech_report_page_weight.js @@ -0,0 +1,70 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_page_weight', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT( + records ARRAY>) +RETURNS ARRAY, + desktop STRUCT< + median_bytes INT64 +>>> +LANGUAGE js AS ''' +const METRICS = ['total', 'js', 'images'] + +// Initialize the page weight map. +const pageWeight = Object.fromEntries(METRICS.map(metricName => { +return [metricName, {name: metricName}] +})) + +// Populate each client record. +records.forEach(record => { + METRICS.forEach(metricName => { + pageWeight[metricName][record.client] = {median_bytes: record[metricName]} + }) +}) + +return Object.values(pageWeight) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "page_weight", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_PAGE_WEIGHT(ARRAY_AGG(STRUCT( + client, + median_bytes_total, + median_bytes_js, + median_bytes_image + ))) AS pageWeight +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_technologies.js b/definitions/output/reports/tech_report_technologies.js new file mode 100644 index 00000000..92e4b687 --- /dev/null +++ b/definitions/output/reports/tech_report_technologies.js @@ -0,0 +1,79 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_technologies', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "name": "technologies", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + tech.technology + FROM ${ctx.ref('crawl', 'pages')} AS pages + INNER JOIN pages.technologies AS tech + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} AND + tech.technology IS NOT NULL +), + +tech_origins AS ( + SELECT + client, + technology, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY + client, + technology +), + +technologies AS ( + SELECT + name AS technology, + description, + STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category, + categories AS category_obj, + NULL AS similar_technologies + FROM ${ctx.ref('wappalyzer', 'technologies')} AS technologies + INNER JOIN technologies.categories AS category + GROUP BY + technology, + description, + categories +), + +total_pages AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client +) + +SELECT + client, + technology, + description, + category, + category_obj, + similar_technologies, + origins +FROM tech_origins +INNER JOIN technologies +USING(technology) + +UNION ALL + +SELECT + client, + 'ALL' AS technology, + NULL AS description, + NULL AS category, + NULL AS category_obj, + NULL AS similar_technologies, + origins +FROM total_pages +`) diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js index da5baf4b..5634b2af 100644 --- a/definitions/output/reports/tech_report_versions.js +++ b/definitions/output/reports/tech_report_versions.js @@ -5,7 +5,7 @@ publish('tech_report_versions', { type: 'table', tags: ['tech_report'] }).query(ctx => ` -/* {"dataform_trigger": "report_cwv_tech_complete", "name": "versions", "type": "dict"} */ +/* {"dataform_trigger": "tech_report_complete", "name": "versions", "type": "dict"} */ WITH pages AS ( SELECT DISTINCT client, From 4dd3f9b6f1f3a27fdba67306f17fa92f0ff8b509 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:53:24 +0100 Subject: [PATCH 09/24] clusters renamed --- definitions/output/reports/tech_crux.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 3fdd06db..84156e7b 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -6,7 +6,7 @@ publish('tech_crux', { protected: true, bigquery: { partitionBy: 'date', - clusterBy: ['geo', 'app', 'rank', 'client'], + clusterBy: ['geo', 'client', 'rank', 'technology'], requirePartitionFilter: true }, tags: ['tech_report'], From 8032aabb5a1cd00417dbfa5e7e1751b9dfa592c3 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 23:23:11 +0100 Subject: [PATCH 10/24] lint --- infra/tf/bigquery_export/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/tf/bigquery_export/main.tf b/infra/tf/bigquery_export/main.tf index 95e4f26a..39814da2 100644 --- a/infra/tf/bigquery_export/main.tf +++ b/infra/tf/bigquery_export/main.tf @@ -49,7 +49,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { } timeout = "3600s" service_account = var.function_identity - max_retries = 1 + max_retries = 1 } } } From a41ed3234fab468d1c1adc21b96e4bf12d2a06d7 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Jan 2025 23:32:10 +0100 Subject: [PATCH 11/24] adjust export config --- infra/bigquery-export/index.js | 2 +- infra/bigquery-export/reports.js | 6 +++--- infra/tf/dataform_trigger/main.tf | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/infra/bigquery-export/index.js b/infra/bigquery-export/index.js index 74d3595f..449ccea4 100644 --- a/infra/bigquery-export/index.js +++ b/infra/bigquery-export/index.js @@ -17,7 +17,7 @@ async function main (exportConfig) { console.log(exportConfig) const reports = new ReportsExporter() await reports.export(exportConfig) - } else if (eventName === 'report_cwv_tech_complete') { + } else if (eventName === 'tech_report_complete') { console.info('Tech Report export') console.log(exportConfig) const techReports = new TechReportsExporter() diff --git a/infra/bigquery-export/reports.js b/infra/bigquery-export/reports.js index 9e80ebc1..b621f199 100644 --- a/infra/bigquery-export/reports.js +++ b/infra/bigquery-export/reports.js @@ -57,7 +57,7 @@ export class TechReportsExporter { } async export (exportConfig) { - if (exportConfig.dataform_trigger !== 'report_cwv_tech_complete') { + if (exportConfig.dataform_trigger !== 'tech_report_complete') { console.error('Invalid dataform trigger') return } @@ -68,13 +68,13 @@ export class TechReportsExporter { SELECT STRING(date) AS date, * EXCEPT(date) -FROM httparchive.reports.cwv_tech_${exportConfig.name} +FROM httparchive.reports.tech_report_${exportConfig.name} WHERE date = '${exportConfig.date}' ` } else if (exportConfig.type === 'dict') { query = ` SELECT * -FROM reports.cwv_tech_${exportConfig.name} +FROM reports.tech_report_${exportConfig.name} ` } else { console.error('Invalid export type') diff --git a/infra/tf/dataform_trigger/main.tf b/infra/tf/dataform_trigger/main.tf index cc1ea438..001c839b 100644 --- a/infra/tf/dataform_trigger/main.tf +++ b/infra/tf/dataform_trigger/main.tf @@ -30,11 +30,11 @@ resource "google_storage_bucket_object" "source" { } resource "google_cloudfunctions2_function" "dataform_trigger" { - name = "dataform-trigger" + name = var.function_name location = var.region build_config { runtime = "nodejs20" - entry_point = "dataform-trigger" + entry_point = var.function_name source { storage_source { bucket = google_storage_bucket_object.source.bucket From 2aec142c09fd401ae4a9c3fa875fec982ed5ac48 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 27 Jan 2025 00:00:36 +0100 Subject: [PATCH 12/24] fix clustering --- definitions/output/reports/tech_report_adoption.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/reports/tech_report_adoption.js b/definitions/output/reports/tech_report_adoption.js index 2ebdc58b..06a464dd 100644 --- a/definitions/output/reports/tech_report_adoption.js +++ b/definitions/output/reports/tech_report_adoption.js @@ -6,7 +6,7 @@ publish('tech_report_adoption', { protected: true, bigquery: { partitionBy: 'date', - clusterBy: ['rank', 'geo', 'client'] + clusterBy: ['rank', 'geo'] }, tags: ['tech_report'] }).preOps(ctx => ` From 396d66442d881811300abf080c31c268254f96b6 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 27 Jan 2025 09:57:35 +0100 Subject: [PATCH 13/24] origin renamed --- definitions/output/reports/tech_crux.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 84156e7b..53073f97 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -35,7 +35,7 @@ WITH pages AS ( SELECT client, page, - root_page AS origin, + root_page, technologies, summary, lighthouse @@ -43,7 +43,9 @@ WITH pages AS ( WHERE date = '${pastMonth}' ${constants.devRankFilter} -), geo_summary AS ( +), + +geo_summary AS ( SELECT \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo, rank, @@ -204,7 +206,7 @@ lab_metrics AS ( SELECT client, page, - origin, + root_page, SAFE.INT64(summary.bytesTotal) AS bytesTotal, SAFE.INT64(summary.bytesJS) AS bytesJS, SAFE.INT64(summary.bytesImg) AS bytesImg, @@ -219,7 +221,7 @@ lab_metrics AS ( lab_data AS ( SELECT client, - origin, + root_page, technology, version, ANY_VALUE(category) AS category, @@ -250,7 +252,7 @@ SELECT rank, technology, version, - COUNT(DISTINCT origin) AS origins, + COUNT(DISTINCT root_page) AS origins, # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, @@ -287,7 +289,7 @@ SELECT FROM lab_data INNER JOIN crux -USING (client, origin) +USING (client, root_page) GROUP BY geo, client, From e9b666e02aefa5de9cc01c742d9b42d1eb555e28 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 27 Jan 2025 12:22:32 +0100 Subject: [PATCH 14/24] deduplicated good_cwv --- definitions/output/reports/tech_crux.js | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 53073f97..9dc2d69a 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -126,14 +126,14 @@ crux AS ( IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, - - (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND - IS_GOOD(small_cls, medium_cls, large_cls) AND - IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024, - - (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND - IS_GOOD(small_cls, medium_cls, large_cls) AND - IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023, + IF('${pastMonth}' < '2024-01-01', + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) + ) AS good_cwv, # WV IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, @@ -267,13 +267,9 @@ SELECT COUNTIF(any_fcp) AS origins_with_any_fcp, COUNTIF(any_ttfb) AS origins_with_any_ttfb, COUNTIF(any_inp) AS origins_with_any_inp, - COUNTIF(good_cwv_2024) AS origins_with_good_cwv, - COUNTIF(good_cwv_2024) AS origins_with_good_cwv_2024, - COUNTIF(good_cwv_2023) AS origins_with_good_cwv_2023, + COUNTIF(good_cwv) AS origins_with_good_cwv, COUNTIF(any_lcp AND any_cls) AS origins_eligible_for_cwv, - SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, - SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024, - SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023, + SAFE_DIVIDE(COUNTIF(good_cwv), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, # Lighthouse data SAFE_CAST(APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_accessibility, From 58eea31a2427ab9fb68e3fc41adf90e145090afd Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 00:05:07 +0100 Subject: [PATCH 15/24] include minor --- definitions/output/reports/tech_crux.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 9dc2d69a..5cee05c3 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -150,7 +150,7 @@ crux AS ( technologies AS ( SELECT tech.technology, - REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version, + REGEXP_EXTRACT(version, r'(?:0|[1-9]\\d*)(?:\\.(?:0|[1-9]\\d*))?') AS version, client, page FROM pages, @@ -159,7 +159,7 @@ technologies AS ( WHERE tech.technology IS NOT NULL AND tech.technology != '' AND - REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL + REGEXP_EXTRACT(version, r'(?:0|[1-9]\\d*)(?:\\.(?:0|[1-9]\\d*))?') IS NOT NULL UNION ALL From c88ef18d78f34972a663df82dcf4add544c6aa09 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 00:10:30 +0100 Subject: [PATCH 16/24] fix --- definitions/output/reports/tech_crux.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 5cee05c3..ad06125a 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -116,7 +116,7 @@ crux AS ( WHEN 10000 THEN 'Top 10k' WHEN 1000 THEN 'Top 1k' END AS rank, - CONCAT(origin, '/') AS origin, + CONCAT(origin, '/') AS root_page, IF(device = 'desktop', 'desktop', 'mobile') AS client, # CWV @@ -240,7 +240,7 @@ lab_data AS ( USING (technology) GROUP BY client, - origin, + root_page, technology, version ) From bd07f78fd098c25db1219b4f562af0f1c5377efc Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 00:25:08 +0100 Subject: [PATCH 17/24] cleanup --- definitions/output/reports/tech_crux.js | 4 ---- 1 file changed, 4 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index ad06125a..ade687b8 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -157,8 +157,6 @@ technologies AS ( UNNEST(technologies) AS tech, UNNEST(tech.info) AS version WHERE - tech.technology IS NOT NULL AND - tech.technology != '' AND REGEXP_EXTRACT(version, r'(?:0|[1-9]\\d*)(?:\\.(?:0|[1-9]\\d*))?') IS NOT NULL UNION ALL @@ -198,8 +196,6 @@ categories AS ( FROM pages, UNNEST(technologies) AS tech, UNNEST(tech.categories) AS category - WHERE - client = 'mobile' ), lab_metrics AS ( From 5967524344e3a39ef4f8048c2898b0eced58f9c0 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 00:28:14 +0100 Subject: [PATCH 18/24] pattern fix --- definitions/output/reports/tech_crux.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index ade687b8..01466a35 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -150,14 +150,14 @@ crux AS ( technologies AS ( SELECT tech.technology, - REGEXP_EXTRACT(version, r'(?:0|[1-9]\\d*)(?:\\.(?:0|[1-9]\\d*))?') AS version, + REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') AS version, client, page FROM pages, UNNEST(technologies) AS tech, UNNEST(tech.info) AS version WHERE - REGEXP_EXTRACT(version, r'(?:0|[1-9]\\d*)(?:\\.(?:0|[1-9]\\d*))?') IS NOT NULL + REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') IS NOT NULL UNION ALL From 7ff91510f204f31d1dd54595c849e672c34ab2b5 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:13:16 +0100 Subject: [PATCH 19/24] tech detections only --- definitions/output/reports/tech_crux.js | 4 ++++ definitions/output/reports/tech_report_versions.js | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 01466a35..3f279755 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -157,6 +157,7 @@ technologies AS ( UNNEST(technologies) AS tech, UNNEST(tech.info) AS version WHERE + WHERE tech.technology IS NOT NULL AND REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') IS NOT NULL UNION ALL @@ -168,6 +169,9 @@ technologies AS ( page FROM pages, UNNEST(technologies) AS tech + WHERE + WHERE tech.technology IS NOT NULL + UNION ALL diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js index 5634b2af..d55d8948 100644 --- a/definitions/output/reports/tech_report_versions.js +++ b/definitions/output/reports/tech_report_versions.js @@ -11,7 +11,7 @@ WITH pages AS ( client, root_page, tech.technology, - REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version + REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') AS version FROM ${ctx.ref('crawl', 'pages')} AS pages INNER JOIN pages.technologies AS tech LEFT JOIN tech.info AS version From 718e3c4ff7481a8b1b1db46189682b17d35cd14e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:16:19 +0100 Subject: [PATCH 20/24] fix --- definitions/output/reports/tech_crux.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 3f279755..6a734503 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -157,7 +157,7 @@ technologies AS ( UNNEST(technologies) AS tech, UNNEST(tech.info) AS version WHERE - WHERE tech.technology IS NOT NULL AND + tech.technology IS NOT NULL AND REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') IS NOT NULL UNION ALL @@ -170,7 +170,7 @@ technologies AS ( FROM pages, UNNEST(technologies) AS tech WHERE - WHERE tech.technology IS NOT NULL + tech.technology IS NOT NULL UNION ALL From 34a4bb769a7d619205a313983b55b8c8b980e35e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 31 Jan 2025 20:29:07 +0100 Subject: [PATCH 21/24] relaxed pattern --- definitions/output/reports/tech_crux.js | 4 ++-- definitions/output/reports/tech_report_versions.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js index 6a734503..4f20a015 100644 --- a/definitions/output/reports/tech_crux.js +++ b/definitions/output/reports/tech_crux.js @@ -150,7 +150,7 @@ crux AS ( technologies AS ( SELECT tech.technology, - REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') AS version, + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') AS version, client, page FROM pages, @@ -158,7 +158,7 @@ technologies AS ( UNNEST(tech.info) AS version WHERE tech.technology IS NOT NULL AND - REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') IS NOT NULL + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') IS NOT NULL UNION ALL diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js index d55d8948..d11fa8f3 100644 --- a/definitions/output/reports/tech_report_versions.js +++ b/definitions/output/reports/tech_report_versions.js @@ -11,7 +11,7 @@ WITH pages AS ( client, root_page, tech.technology, - REGEXP_EXTRACT(version, r'(?:(?:0|[1-9])\\d*)(?:\\.(?:0|[1-9])\\d*)?') AS version + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') AS version FROM ${ctx.ref('crawl', 'pages')} AS pages INNER JOIN pages.technologies AS tech LEFT JOIN tech.info AS version From 330e918dd85f2f3d30f2ada7f002bff45121601b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 1 Feb 2025 21:00:38 +0100 Subject: [PATCH 22/24] remove hashing (#59) --- infra/bigquery-export/firestore.js | 18 ++---------------- infra/bigquery-export/utils.js | 29 ----------------------------- 2 files changed, 2 insertions(+), 45 deletions(-) delete mode 100644 infra/bigquery-export/utils.js diff --git a/infra/bigquery-export/firestore.js b/infra/bigquery-export/firestore.js index 207be4c8..bde22a8e 100644 --- a/infra/bigquery-export/firestore.js +++ b/infra/bigquery-export/firestore.js @@ -1,15 +1,5 @@ import { Firestore } from '@google-cloud/firestore' import { BigQueryExport } from './bigquery.js' -import { technologyHashId } from './utils.js' - -const TECHNOLOGY_QUERY_ID_KEYS = { - adoption: ['date', 'technology', 'geo', 'rank'], - lighthouse: ['date', 'technology', 'geo', 'rank'], - core_web_vitals: ['date', 'technology', 'geo', 'rank'], - page_weight: ['date', 'technology', 'geo', 'rank'], - technologies: ['client', 'technology', 'category'], - categories: ['category'] -} export class FirestoreBatch { constructor () { @@ -26,8 +16,7 @@ export class FirestoreBatch { if (operation === 'delete') { batch.delete(doc.ref) } else if (operation === 'set') { - const docId = technologyHashId(doc, this.collectionName, TECHNOLOGY_QUERY_ID_KEYS) - const docRef = this.firestore.collection(this.collectionName).doc(docId) + const docRef = this.firestore.collection(this.collectionName).doc() batch.set(docRef, doc) } else { throw new Error('Invalid operation') @@ -144,10 +133,7 @@ export class FirestoreBatch { databaseId: 'tech-report-apis-' + exportConfig.environment }) - // Delete all the documents before writing the new ones - if (exportConfig.truncate !== 'false') { - await this.batchDelete() - } + await this.batchDelete() const rowStream = await this.bigquery.queryResultsStream(query) await this.streamFromBigQuery(rowStream) diff --git a/infra/bigquery-export/utils.js b/infra/bigquery-export/utils.js deleted file mode 100644 index 41263a15..00000000 --- a/infra/bigquery-export/utils.js +++ /dev/null @@ -1,29 +0,0 @@ -import crypto from 'crypto' - -/** - * Returns a hashed ID for a set of technology query keys. Keys are sorted alphabetically and joined with a dash. - * The resulting string is hashed using SHA256. - * - * @param {Object} element - The input object containing query data. - * @param {string} queryType - The type of query to generate the hash for. - * @param {Object} keyMap - The mapping of query types to their keys. Defaults to constants.TECHNOLOGY_QUERY_ID_KEYS. - * @returns {string} - The hashed ID. - * @throws {Error} - If the queryType is invalid or if required keys are missing in the element. - */ -export function technologyHashId (element, queryType, keyMap) { - if (!keyMap[queryType]) { - throw new Error(`Invalid query type: ${queryType}`) - } - - const keys = keyMap[queryType].sort() - if (!keys.every(key => key in element)) { - throw new Error(`Missing keys in element ${JSON.stringify(element)} for query type ${queryType}`) - } - - const values = keys.map(key => element[key]) - const hash = crypto.createHash('sha256') - .update(values.join('-')) - .digest('hex') - - return hash -} From 7ae88faa38adf877b2ec2340ce6a36d9e8bbd442 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 1 Feb 2025 21:08:32 +0100 Subject: [PATCH 23/24] dedupe technologies --- .../output/reports/cwv_tech_categories.js | 16 ++++++----- .../reports/tech_report_technologies.js | 28 +++++++++++++------ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index ac511770..6711bc7a 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -55,6 +55,14 @@ technology_stats AS ( GROUP BY technology, categories +), + +total_pages AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client ) SELECT @@ -82,11 +90,5 @@ SELECT COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile ) AS origins, NULL AS technologies -FROM ( - SELECT - client, - COUNT(DISTINCT root_page) AS origins - FROM pages - GROUP BY client -) +FROM total_pages `) diff --git a/definitions/output/reports/tech_report_technologies.js b/definitions/output/reports/tech_report_technologies.js index 92e4b687..c56f28bd 100644 --- a/definitions/output/reports/tech_report_technologies.js +++ b/definitions/output/reports/tech_report_technologies.js @@ -21,13 +21,22 @@ WITH pages AS ( tech_origins AS ( SELECT - client, technology, - COUNT(DISTINCT root_page) AS origins - FROM pages - GROUP BY - client, - technology + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins + FROM ( + SELECT + client, + technology, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY + client, + technology + ) + GROUP BY technology ), technologies AS ( @@ -54,7 +63,6 @@ total_pages AS ( ) SELECT - client, technology, description, category, @@ -68,12 +76,14 @@ USING(technology) UNION ALL SELECT - client, 'ALL' AS technology, NULL AS description, NULL AS category, NULL AS category_obj, NULL AS similar_technologies, - origins + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins FROM total_pages `) From c6f8460411fbfd08cf3bb7badedef7268bf5213b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 3 Feb 2025 00:48:29 +0100 Subject: [PATCH 24/24] cleanup --- definitions/output/reports/tech_report_technologies.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/definitions/output/reports/tech_report_technologies.js b/definitions/output/reports/tech_report_technologies.js index c56f28bd..a77c1a0a 100644 --- a/definitions/output/reports/tech_report_technologies.js +++ b/definitions/output/reports/tech_report_technologies.js @@ -23,8 +23,8 @@ tech_origins AS ( SELECT technology, STRUCT( - COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, - COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + MAX(IF(client = 'desktop', origins, 0)) AS desktop, + MAX(IF(client = 'mobile', origins, 0)) AS mobile ) AS origins FROM ( SELECT @@ -82,8 +82,8 @@ SELECT NULL AS category_obj, NULL AS similar_technologies, STRUCT( - COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, - COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + MAX(IF(client = 'desktop', origins, 0)) AS desktop, + MAX(IF(client = 'mobile', origins, 0)) AS mobile ) AS origins FROM total_pages `)