diff --git a/definitions/output/reports/cwv_tech_adoption.js b/definitions/output/reports/cwv_tech_adoption.js index 65251dd5..1b3e40c1 100644 --- a/definitions/output/reports/cwv_tech_adoption.js +++ b/definitions/output/reports/cwv_tech_adoption.js @@ -8,7 +8,7 @@ publish('cwv_tech_adoption', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['crux_ready', 'tech_report'] + tags: ['crux_ready'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${pastMonth}'; diff --git a/definitions/output/reports/cwv_tech_categories.js b/definitions/output/reports/cwv_tech_categories.js index 2522f35b..6711bc7a 100644 --- a/definitions/output/reports/cwv_tech_categories.js +++ b/definitions/output/reports/cwv_tech_categories.js @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth) publish('cwv_tech_categories', { schema: 'reports', type: 'table', - tags: ['crux_ready', 'tech_report'] + tags: ['crux_ready'] }).query(ctx => ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */ WITH pages AS ( @@ -55,6 +55,14 @@ technology_stats AS ( GROUP BY technology, categories +), + +total_pages AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client ) SELECT @@ -82,11 +90,5 @@ SELECT COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile ) AS origins, NULL AS technologies -FROM ( - SELECT - client, - COUNT(DISTINCT root_page) AS origins - FROM pages - GROUP BY client -) +FROM total_pages `) diff --git a/definitions/output/reports/cwv_tech_core_web_vitals.js b/definitions/output/reports/cwv_tech_core_web_vitals.js index 772f432a..06cae4be 100644 --- a/definitions/output/reports/cwv_tech_core_web_vitals.js +++ b/definitions/output/reports/cwv_tech_core_web_vitals.js @@ -8,7 +8,7 @@ publish('cwv_tech_core_web_vitals', { partitionBy: 'date', clusterBy: ['rank', 'geo'] }, - tags: ['crux_ready', 'tech_report'] + tags: ['tech_report'] }).preOps(ctx => ` CREATE TEMPORARY FUNCTION GET_VITALS( records ARRAY ` CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE( records ARRAY ` CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT( records ARRAY ` /* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */ WITH pages AS ( diff --git a/definitions/output/reports/tech_crux.js b/definitions/output/reports/tech_crux.js new file mode 100644 index 00000000..4f20a015 --- /dev/null +++ b/definitions/output/reports/tech_crux.js @@ -0,0 +1,295 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_crux', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['geo', 'client', 'rank', 'technology'], + requirePartitionFilter: true + }, + tags: ['tech_report'], + dependOnDependencyAssertions: true +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; + +CREATE TEMP FUNCTION IS_GOOD( + good FLOAT64, + needs_improvement FLOAT64, + poor FLOAT64 +) RETURNS BOOL AS ( + SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75 +); + +CREATE TEMP FUNCTION IS_NON_ZERO( + good FLOAT64, + needs_improvement FLOAT64, + poor FLOAT64 +) RETURNS BOOL AS ( + good + needs_improvement + poor > 0 +); +`).query(ctx => ` +WITH pages AS ( + SELECT + client, + page, + root_page, + technologies, + summary, + lighthouse + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), + +geo_summary AS ( + SELECT + \`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls + FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')} + WHERE + yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND + device IN ('desktop', 'phone') + + UNION ALL + + SELECT + 'ALL' AS geo, + rank, + device, + origin, + avg_fcp, + avg_fid, + avg_inp, + avg_lcp, + avg_ttfb, + fast_fcp, + fast_fid, + fast_inp, + fast_lcp, + fast_ttfb, + slow_fcp, + slow_fid, + slow_inp, + slow_lcp, + slow_ttfb, + small_cls, + medium_cls, + large_cls + FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')} + WHERE + date = '${pastMonth}' AND + device IN ('desktop', 'phone') +), + +crux AS ( + SELECT + geo, + CASE _rank + WHEN 100000000 THEN 'ALL' + WHEN 10000000 THEN 'Top 10M' + WHEN 1000000 THEN 'Top 1M' + WHEN 100000 THEN 'Top 100k' + WHEN 10000 THEN 'Top 10k' + WHEN 1000 THEN 'Top 1k' + END AS rank, + CONCAT(origin, '/') AS root_page, + IF(device = 'desktop', 'desktop', 'mobile') AS client, + + # CWV + IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid, + IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid, + IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls, + IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, + IF('${pastMonth}' < '2024-01-01', + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND + IS_GOOD(small_cls, medium_cls, large_cls) AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) + ) AS good_cwv, + + # WV + IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, + IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp, + IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb, + IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb, + IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp, + IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp + FROM geo_summary, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank + WHERE rank <= _rank +), + +technologies AS ( + SELECT + tech.technology, + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') AS version, + client, + page + FROM pages, + UNNEST(technologies) AS tech, + UNNEST(tech.info) AS version + WHERE + tech.technology IS NOT NULL AND + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') IS NOT NULL + + UNION ALL + + SELECT + tech.technology, + 'ALL' AS version, + client, + page + FROM pages, + UNNEST(technologies) AS tech + WHERE + tech.technology IS NOT NULL + + + UNION ALL + + SELECT + 'ALL' AS technology, + 'ALL' AS version, + client, + page + FROM pages +), + +categories AS ( + SELECT + tech.technology, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category + FROM pages, + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category + GROUP BY technology + + UNION ALL + + SELECT + 'ALL' AS technology, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category + FROM pages, + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category +), + +lab_metrics AS ( + SELECT + client, + page, + root_page, + SAFE.INT64(summary.bytesTotal) AS bytesTotal, + SAFE.INT64(summary.bytesJS) AS bytesJS, + SAFE.INT64(summary.bytesImg) AS bytesImg, + SAFE.FLOAT64(lighthouse.categories.accessibility.score) AS accessibility, + SAFE.FLOAT64(lighthouse.categories['best-practices'].score) AS best_practices, + SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance, + SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa, + SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo + FROM pages +), + +lab_data AS ( + SELECT + client, + root_page, + technology, + version, + ANY_VALUE(category) AS category, + AVG(bytesTotal) AS bytesTotal, + AVG(bytesJS) AS bytesJS, + AVG(bytesImg) AS bytesImg, + AVG(accessibility) AS accessibility, + AVG(best_practices) AS best_practices, + AVG(performance) AS performance, + AVG(pwa) AS pwa, + AVG(seo) AS seo + FROM lab_metrics + INNER JOIN technologies + USING (client, page) + INNER JOIN categories + USING (technology) + GROUP BY + client, + root_page, + technology, + version +) + +SELECT + DATE('${pastMonth}') AS date, + geo, + client, + rank, + technology, + version, + COUNT(DISTINCT root_page) AS origins, + + # CrUX data + COUNTIF(good_fid) AS origins_with_good_fid, + COUNTIF(good_cls) AS origins_with_good_cls, + COUNTIF(good_lcp) AS origins_with_good_lcp, + COUNTIF(good_fcp) AS origins_with_good_fcp, + COUNTIF(good_ttfb) AS origins_with_good_ttfb, + COUNTIF(good_inp) AS origins_with_good_inp, + COUNTIF(any_fid) AS origins_with_any_fid, + COUNTIF(any_cls) AS origins_with_any_cls, + COUNTIF(any_lcp) AS origins_with_any_lcp, + COUNTIF(any_fcp) AS origins_with_any_fcp, + COUNTIF(any_ttfb) AS origins_with_any_ttfb, + COUNTIF(any_inp) AS origins_with_any_inp, + COUNTIF(good_cwv) AS origins_with_good_cwv, + COUNTIF(any_lcp AND any_cls) AS origins_eligible_for_cwv, + SAFE_DIVIDE(COUNTIF(good_cwv), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, + + # Lighthouse data + SAFE_CAST(APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_accessibility, + SAFE_CAST(APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_best_practices, + SAFE_CAST(APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_performance, + SAFE_CAST(APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_pwa, + SAFE_CAST(APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_seo, + + # Page weight stats + SAFE_CAST(APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS INT64) AS median_bytes_total, + SAFE_CAST(APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS INT64) AS median_bytes_js, + SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image + +FROM lab_data +INNER JOIN crux +USING (client, root_page) +GROUP BY + geo, + client, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_adoption.js b/definitions/output/reports/tech_report_adoption.js new file mode 100644 index 00000000..06a464dd --- /dev/null +++ b/definitions/output/reports/tech_report_adoption.js @@ -0,0 +1,35 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_adoption', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS adoption +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_categories.js b/definitions/output/reports/tech_report_categories.js new file mode 100644 index 00000000..bce75b9f --- /dev/null +++ b/definitions/output/reports/tech_report_categories.js @@ -0,0 +1,94 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_categories', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "name": "categories", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + technologies + FROM ${ctx.ref('crawl', 'pages')} + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} +), + +category_descriptions AS ( + SELECT + name AS category, + description + FROM ${ctx.ref('wappalyzer', 'categories')} +), + +category_stats AS ( + SELECT + category, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins + FROM ( + SELECT + client, + category, + COUNT(DISTINCT root_page) AS origins + FROM pages + INNER JOIN pages.technologies AS tech + INNER JOIN tech.categories AS category + WHERE + category IS NOT NULL + GROUP BY + client, + category + ) + GROUP BY category +), + +technology_stats AS ( + SELECT + technology, + category_obj AS categories, + SUM(origins) AS total_origins + FROM ${ctx.ref('reports', 'tech_report_technologies')} + GROUP BY + technology, + categories +) + +SELECT + category, + description, + origins, + ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies +FROM category_stats +INNER JOIN technology_stats +ON category_stats.category IN UNNEST(technology_stats.categories) +INNER JOIN category_descriptions +USING (category) +GROUP BY + category, + description, + origins + +UNION ALL + +SELECT + 'ALL' AS category, + NULL AS description, + STRUCT( + COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop, + COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile + ) AS origins, + NULL AS technologies +FROM ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client +) +`) diff --git a/definitions/output/reports/tech_report_core_web_vitals.js b/definitions/output/reports/tech_report_core_web_vitals.js new file mode 100644 index 00000000..2d26f90f --- /dev/null +++ b/definitions/output/reports/tech_report_core_web_vitals.js @@ -0,0 +1,103 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_core_web_vitals', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_VITALS( + records ARRAY>) +RETURNS ARRAY, + mobile STRUCT< + good_number INT64, + tested INT64 +>>> +LANGUAGE js AS ''' +const METRIC_MAP = { + overall: ['origins_with_good_cwv', 'origins_eligible_for_cwv'], + LCP: ['origins_with_good_lcp', 'origins_with_any_lcp'], + CLS: ['origins_with_good_cls', 'origins_with_any_cls'], + FID: ['origins_with_good_fid', 'origins_with_any_fid'], + FCP: ['origins_with_good_fcp', 'origins_with_any_fcp'], + TTFB: ['origins_with_good_ttfb', 'origins_with_any_ttfb'], + INP: ['origins_with_good_inp', 'origins_with_any_inp'] +}; + +// Initialize the vitals map. +const vitals = Object.fromEntries( + Object.keys(METRIC_MAP).map(metricName => { + return [metricName, {name: metricName}] +})); + +// Populate each client record. +records.forEach(record => { + Object.entries(METRIC_MAP).forEach( + ([metricName, [good_number, tested]]) => { + vitals[metricName][record.client] = {good_number: record[good_number], tested: record[tested]} +})}) + +return Object.values(vitals) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "core_web_vitals", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_VITALS(ARRAY_AGG(STRUCT( + client, + origins_with_good_fid, + origins_with_good_cls, + origins_with_good_lcp, + origins_with_good_fcp, + origins_with_good_ttfb, + origins_with_good_inp, + origins_with_any_fid, + origins_with_any_cls, + origins_with_any_lcp, + origins_with_any_fcp, + origins_with_any_ttfb, + origins_with_any_inp, + origins_with_good_cwv, + origins_eligible_for_cwv + ))) AS vitals +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_lighthouse.js b/definitions/output/reports/tech_report_lighthouse.js new file mode 100644 index 00000000..504686c8 --- /dev/null +++ b/definitions/output/reports/tech_report_lighthouse.js @@ -0,0 +1,80 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_lighthouse', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE( + records ARRAY>) +RETURNS ARRAY, + mobile STRUCT< + median_score FLOAT64 +>>> +LANGUAGE js AS ''' +const METRIC_MAP = { + accessibility: 'median_lighthouse_score_accessibility', + best_practices: 'median_lighthouse_score_best_practices', + performance: 'median_lighthouse_score_performance', + pwa: 'median_lighthouse_score_pwa', + seo: 'median_lighthouse_score_seo', +} + +// Initialize the Lighthouse map. +const lighthouse = Object.fromEntries(Object.keys(METRIC_MAP).map(metricName => { + return [metricName, {name: metricName}] +})); + +// Populate each client record. +records.forEach(record => { + Object.entries(METRIC_MAP).forEach(([metricName, median_score]) => { + lighthouse[metricName][record.client] = {median_score: record[median_score]} + }); +}); + +return Object.values(lighthouse) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "lighthouse", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_LIGHTHOUSE(ARRAY_AGG(STRUCT( + client, + median_lighthouse_score_accessibility, + median_lighthouse_score_best_practices, + median_lighthouse_score_performance, + median_lighthouse_score_pwa, + median_lighthouse_score_seo + ))) AS lighthouse +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_page_weight.js b/definitions/output/reports/tech_report_page_weight.js new file mode 100644 index 00000000..31521d51 --- /dev/null +++ b/definitions/output/reports/tech_report_page_weight.js @@ -0,0 +1,70 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_page_weight', { + schema: 'reports', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['rank', 'geo'] + }, + tags: ['tech_report'] +}).preOps(ctx => ` +CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT( + records ARRAY>) +RETURNS ARRAY, + desktop STRUCT< + median_bytes INT64 +>>> +LANGUAGE js AS ''' +const METRICS = ['total', 'js', 'images'] + +// Initialize the page weight map. +const pageWeight = Object.fromEntries(METRICS.map(metricName => { +return [metricName, {name: metricName}] +})) + +// Populate each client record. +records.forEach(record => { + METRICS.forEach(metricName => { + pageWeight[metricName][record.client] = {median_bytes: record[metricName]} + }) +}) + +return Object.values(pageWeight) +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${pastMonth}'; +`).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "page_weight", "type": "report"} */ +SELECT + date, + geo, + rank, + technology, + version, + GET_PAGE_WEIGHT(ARRAY_AGG(STRUCT( + client, + median_bytes_total, + median_bytes_js, + median_bytes_image + ))) AS pageWeight +FROM ${ctx.ref('reports', 'tech_crux')} +WHERE date = '${pastMonth}' +GROUP BY + date, + geo, + rank, + technology, + version +`) diff --git a/definitions/output/reports/tech_report_technologies.js b/definitions/output/reports/tech_report_technologies.js new file mode 100644 index 00000000..a77c1a0a --- /dev/null +++ b/definitions/output/reports/tech_report_technologies.js @@ -0,0 +1,89 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_technologies', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "name": "technologies", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + tech.technology + FROM ${ctx.ref('crawl', 'pages')} AS pages + INNER JOIN pages.technologies AS tech + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} AND + tech.technology IS NOT NULL +), + +tech_origins AS ( + SELECT + technology, + STRUCT( + MAX(IF(client = 'desktop', origins, 0)) AS desktop, + MAX(IF(client = 'mobile', origins, 0)) AS mobile + ) AS origins + FROM ( + SELECT + client, + technology, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY + client, + technology + ) + GROUP BY technology +), + +technologies AS ( + SELECT + name AS technology, + description, + STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category, + categories AS category_obj, + NULL AS similar_technologies + FROM ${ctx.ref('wappalyzer', 'technologies')} AS technologies + INNER JOIN technologies.categories AS category + GROUP BY + technology, + description, + categories +), + +total_pages AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY client +) + +SELECT + technology, + description, + category, + category_obj, + similar_technologies, + origins +FROM tech_origins +INNER JOIN technologies +USING(technology) + +UNION ALL + +SELECT + 'ALL' AS technology, + NULL AS description, + NULL AS category, + NULL AS category_obj, + NULL AS similar_technologies, + STRUCT( + MAX(IF(client = 'desktop', origins, 0)) AS desktop, + MAX(IF(client = 'mobile', origins, 0)) AS mobile + ) AS origins +FROM total_pages +`) diff --git a/definitions/output/reports/tech_report_versions.js b/definitions/output/reports/tech_report_versions.js new file mode 100644 index 00000000..d11fa8f3 --- /dev/null +++ b/definitions/output/reports/tech_report_versions.js @@ -0,0 +1,64 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_versions', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +/* {"dataform_trigger": "tech_report_complete", "name": "versions", "type": "dict"} */ +WITH pages AS ( + SELECT DISTINCT + client, + root_page, + tech.technology, + REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') AS version + FROM ${ctx.ref('crawl', 'pages')} AS pages + INNER JOIN pages.technologies AS tech + LEFT JOIN tech.info AS version + WHERE + date = '${pastMonth}' + ${constants.devRankFilter} AND + tech.technology IS NOT NULL +), + +version_origins AS ( + SELECT + client, + technology, + version, + COUNT(DISTINCT root_page) AS origins + FROM pages + WHERE version IS NOT NULL + GROUP BY + client, + technology, + version +), + +total_origins AS ( + SELECT + client, + technology, + COUNT(DISTINCT root_page) AS origins + FROM pages + GROUP BY + client, + technology +) + +SELECT + client, + technology, + version, + origins +FROM version_origins + +UNION ALL + +SELECT + client, + technology, + 'ALL' AS version, + origins +FROM total_origins +`) diff --git a/infra/bigquery-export/index.js b/infra/bigquery-export/index.js index 74d3595f..449ccea4 100644 --- a/infra/bigquery-export/index.js +++ b/infra/bigquery-export/index.js @@ -17,7 +17,7 @@ async function main (exportConfig) { console.log(exportConfig) const reports = new ReportsExporter() await reports.export(exportConfig) - } else if (eventName === 'report_cwv_tech_complete') { + } else if (eventName === 'tech_report_complete') { console.info('Tech Report export') console.log(exportConfig) const techReports = new TechReportsExporter() diff --git a/infra/bigquery-export/reports.js b/infra/bigquery-export/reports.js index 9e80ebc1..b621f199 100644 --- a/infra/bigquery-export/reports.js +++ b/infra/bigquery-export/reports.js @@ -57,7 +57,7 @@ export class TechReportsExporter { } async export (exportConfig) { - if (exportConfig.dataform_trigger !== 'report_cwv_tech_complete') { + if (exportConfig.dataform_trigger !== 'tech_report_complete') { console.error('Invalid dataform trigger') return } @@ -68,13 +68,13 @@ export class TechReportsExporter { SELECT STRING(date) AS date, * EXCEPT(date) -FROM httparchive.reports.cwv_tech_${exportConfig.name} +FROM httparchive.reports.tech_report_${exportConfig.name} WHERE date = '${exportConfig.date}' ` } else if (exportConfig.type === 'dict') { query = ` SELECT * -FROM reports.cwv_tech_${exportConfig.name} +FROM reports.tech_report_${exportConfig.name} ` } else { console.error('Invalid export type') diff --git a/infra/dataform-trigger/index.js b/infra/dataform-trigger/index.js index 345c623b..2cf05645 100644 --- a/infra/dataform-trigger/index.js +++ b/infra/dataform-trigger/index.js @@ -31,7 +31,7 @@ FROM crux, report; action: 'runDataformRepo', actionArgs: { repoName: 'crawl-data', - tags: ['crux_ready'] + tags: ['tech_report'] } }, crawl_complete: { diff --git a/infra/tf/bigquery_export/main.tf b/infra/tf/bigquery_export/main.tf index 13465d7b..39814da2 100644 --- a/infra/tf/bigquery_export/main.tf +++ b/infra/tf/bigquery_export/main.tf @@ -32,6 +32,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { deletion_protection = false template { + parallelism = 5 template { containers { image = "${var.location}.gcr.io/${var.project}/cloud-run/${var.function_name}:latest" @@ -48,6 +49,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { } timeout = "3600s" service_account = var.function_identity + max_retries = 1 } } } diff --git a/infra/tf/dataform_trigger/main.tf b/infra/tf/dataform_trigger/main.tf index cc1ea438..001c839b 100644 --- a/infra/tf/dataform_trigger/main.tf +++ b/infra/tf/dataform_trigger/main.tf @@ -30,11 +30,11 @@ resource "google_storage_bucket_object" "source" { } resource "google_cloudfunctions2_function" "dataform_trigger" { - name = "dataform-trigger" + name = var.function_name location = var.region build_config { runtime = "nodejs20" - entry_point = "dataform-trigger" + entry_point = var.function_name source { storage_source { bucket = google_storage_bucket_object.source.bucket