Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tech report: Dedupe technology records #60

Open
wants to merge 33 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
cf33544
versions
max-ostapenko Jan 12, 2025
547f63e
tech filter
max-ostapenko Jan 12, 2025
e1f0e60
Merge branch 'main' into central-flyingfish
max-ostapenko Jan 14, 2025
3ec247c
Merge branch 'main' into central-flyingfish
max-ostapenko Jan 21, 2025
65acf68
Merge branch 'main' into central-flyingfish
max-ostapenko Jan 26, 2025
e8580d0
new table with versions
max-ostapenko Jan 26, 2025
bfac4f6
typo
max-ostapenko Jan 26, 2025
ac2f597
versions table
max-ostapenko Jan 26, 2025
bdd46b8
fix
max-ostapenko Jan 26, 2025
23864b9
no retries
max-ostapenko Jan 26, 2025
4d42453
tech_report_* tables
max-ostapenko Jan 26, 2025
4dd3f9b
clusters renamed
max-ostapenko Jan 26, 2025
8032aab
lint
max-ostapenko Jan 26, 2025
a41ed32
adjust export config
max-ostapenko Jan 26, 2025
2aec142
fix clustering
max-ostapenko Jan 26, 2025
396d664
origin renamed
max-ostapenko Jan 27, 2025
e9b666e
deduplicated good_cwv
max-ostapenko Jan 27, 2025
ff2f5a4
Merge branch 'main' into central-flyingfish
max-ostapenko Jan 27, 2025
58eea31
include minor
max-ostapenko Jan 30, 2025
747a18f
Merge branch 'main' into main
max-ostapenko Jan 30, 2025
8c0455c
Merge branch 'central-flyingfish' into central-flyingfish
max-ostapenko Jan 30, 2025
c88ef18
fix
max-ostapenko Jan 30, 2025
3268e28
Merge branch 'central-flyingfish' into central-flyingfish
max-ostapenko Jan 30, 2025
bd07f78
cleanup
max-ostapenko Jan 30, 2025
5967524
pattern fix
max-ostapenko Jan 30, 2025
146978d
Merge branch 'central-flyingfish' into central-flyingfish
max-ostapenko Jan 30, 2025
7ff9151
tech detections only
max-ostapenko Jan 31, 2025
718e3c4
fix
max-ostapenko Jan 31, 2025
34a4bb7
relaxed pattern
max-ostapenko Jan 31, 2025
330e918
remove hashing (#59)
max-ostapenko Feb 1, 2025
7ae88fa
dedupe technologies
max-ostapenko Feb 1, 2025
511983d
Merge branch 'main' into premier-chicken
max-ostapenko Feb 1, 2025
c6f8460
cleanup
max-ostapenko Feb 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_adoption.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ publish('cwv_tech_adoption', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['crux_ready', 'tech_report']
tags: ['crux_ready']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
Expand Down
18 changes: 10 additions & 8 deletions definitions/output/reports/cwv_tech_categories.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth)
publish('cwv_tech_categories', {
schema: 'reports',
type: 'table',
tags: ['crux_ready', 'tech_report']
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
Expand Down Expand Up @@ -55,6 +55,14 @@ technology_stats AS (
GROUP BY
technology,
categories
),

total_pages AS (
SELECT
client,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY client
)

SELECT
Expand Down Expand Up @@ -82,11 +90,5 @@ SELECT
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS origins,
NULL AS technologies
FROM (
SELECT
client,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY client
)
FROM total_pages
`)
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_core_web_vitals.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ publish('cwv_tech_core_web_vitals', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['crux_ready', 'tech_report']
tags: ['tech_report']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_VITALS(
records ARRAY<STRUCT<
Expand Down
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_lighthouse.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ publish('cwv_tech_lighthouse', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['crux_ready', 'tech_report']
tags: ['crux_ready']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE(
records ARRAY<STRUCT<
Expand Down
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_page_weight.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ publish('cwv_tech_page_weight', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['crux_ready', 'tech_report']
tags: ['crux_ready']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT(
records ARRAY<STRUCT<
Expand Down
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth)
publish('cwv_tech_technologies', {
schema: 'reports',
type: 'table',
tags: ['crux_ready', 'tech_report']
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */
WITH pages AS (
Expand Down
295 changes: 295 additions & 0 deletions definitions/output/reports/tech_crux.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
// Month this report run (re)builds, derived from constants.currentMonth via
// constants.fnPastMonth. It is interpolated into the SQL in this file as the
// `date` partition value and is compared there as a 'YYYY-MM-DD' string
// (e.g. against '2024-01-01'), so it is expected to be in that format.
// NOTE(review): exact offset applied by fnPastMonth is defined in `constants`
// and is not visible from this file — confirm there.
const pastMonth = constants.fnPastMonth(constants.currentMonth)

// reports.tech_crux — per-technology CrUX / Lighthouse / page-weight rollup.
//
// Output grain: one row per (date, geo, client, rank, technology, version).
//
// preOps:
//   * Deletes any existing rows for `pastMonth` from the target table, so
//     re-running the same month replaces that partition instead of appending
//     duplicates (the query below always emits date = pastMonth).
//   * IS_GOOD(good, needs_improvement, poor): TRUE when the "good" share of
//     the three buckets is >= 0.75 (SAFE_DIVIDE, so a zero total yields NULL).
//   * IS_NON_ZERO(good, needs_improvement, poor): TRUE when the buckets sum
//     to more than 0, i.e. the origin has any data for that metric.
//
// Query CTEs:
//   * pages        — crawl.pages rows for pastMonth; constants.devRankFilter
//                    is interpolated verbatim into the WHERE clause
//                    (presumably a rank restriction for dev runs — confirm
//                    in `constants`).
//   * geo_summary  — CrUX origin metrics for devices 'desktop' and 'phone':
//                    per-country rows from country_summary (geo resolved via
//                    GET_COUNTRY(country_code)) UNION ALL global rows from
//                    device_summary labelled geo = 'ALL'.
//   * crux         — fans each geo_summary row out to every rank bucket it
//                    falls in (rank <= bucket) and labels the bucket
//                    ('Top 1k' ... 'ALL'); root_page is origin + '/';
//                    device 'desktop' stays 'desktop', anything else becomes
//                    'mobile'. Computes any_*/good_* flags per metric.
//                    good_cwv uses FID when pastMonth < '2024-01-01' and INP
//                    otherwise; a NULL fast_fid / fast_inp does not
//                    disqualify an origin.
//   * technologies — (technology, version, client, page) in three layers:
//                    per extracted version (first "digits[.digits]" match in
//                    each tech.info entry), version 'ALL' per technology,
//                    and technology 'ALL' / version 'ALL' for every page.
//   * categories   — per technology, its distinct categories sorted and
//                    joined with ', '; plus an 'ALL' row holding every
//                    category seen.
//   * lab_metrics  — per page: byte totals from `summary` and Lighthouse
//                    category scores, read with SAFE.INT64 / SAFE.FLOAT64.
//   * lab_data     — averages lab_metrics across pages within each
//                    (client, root_page, technology, version), so every root
//                    page contributes exactly one row per technology/version
//                    to the final aggregation.
//
// Final SELECT joins lab_data to crux on (client, root_page) and aggregates:
// distinct-origin counts, COUNTIF over the CrUX flags, and medians taken as
// APPROX_QUANTILES(x, 1000)[OFFSET(500)].
//
// NOTE(review): lab_data INNER JOINs `categories`, which is built by
// cross-joining UNNEST(tech.categories); a technology that never has a
// category entry would therefore be absent from the output — confirm this
// is intended.
publish('tech_crux', {
schema: 'reports',
// Incremental: each run appends the rows for pastMonth (after the preOps
// DELETE clears that month).
type: 'incremental',
// Dataform option; per Dataform docs this prevents a full refresh from
// rebuilding the table — TODO confirm against the Dataform version in use.
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['geo', 'client', 'rank', 'technology'],
// Readers must filter on the `date` partition column.
requirePartitionFilter: true
},
tags: ['tech_report'],
// Dataform option; per Dataform docs, assertions on upstream dependencies
// become dependencies of this action — TODO confirm.
dependOnDependencyAssertions: true
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';

CREATE TEMP FUNCTION IS_GOOD(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);

CREATE TEMP FUNCTION IS_NON_ZERO(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
`).query(ctx => `
WITH pages AS (
SELECT
client,
page,
root_page,
technologies,
summary,
lighthouse
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
),

geo_summary AS (
SELECT
\`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo,
rank,
device,
origin,
avg_fcp,
avg_fid,
avg_inp,
avg_lcp,
avg_ttfb,
fast_fcp,
fast_fid,
fast_inp,
fast_lcp,
fast_ttfb,
slow_fcp,
slow_fid,
slow_inp,
slow_lcp,
slow_ttfb,
small_cls,
medium_cls,
large_cls
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')}
WHERE
yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND
device IN ('desktop', 'phone')

UNION ALL

SELECT
'ALL' AS geo,
rank,
device,
origin,
avg_fcp,
avg_fid,
avg_inp,
avg_lcp,
avg_ttfb,
fast_fcp,
fast_fid,
fast_inp,
fast_lcp,
fast_ttfb,
slow_fcp,
slow_fid,
slow_inp,
slow_lcp,
slow_ttfb,
small_cls,
medium_cls,
large_cls
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')}
WHERE
date = '${pastMonth}' AND
device IN ('desktop', 'phone')
),

crux AS (
SELECT
geo,
CASE _rank
WHEN 100000000 THEN 'ALL'
WHEN 10000000 THEN 'Top 10M'
WHEN 1000000 THEN 'Top 1M'
WHEN 100000 THEN 'Top 100k'
WHEN 10000 THEN 'Top 10k'
WHEN 1000 THEN 'Top 1k'
END AS rank,
CONCAT(origin, '/') AS root_page,
IF(device = 'desktop', 'desktop', 'mobile') AS client,

# CWV
IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
IF('${pastMonth}' < '2024-01-01',
(IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp),
(IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp)
) AS good_cwv,

# WV
IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
FROM geo_summary,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
WHERE rank <= _rank
),

technologies AS (
SELECT
tech.technology,
REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') AS version,
client,
page
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.info) AS version
WHERE
tech.technology IS NOT NULL AND
REGEXP_EXTRACT(version, r'\\d+(?:\\.\\d+)?') IS NOT NULL

UNION ALL

SELECT
tech.technology,
'ALL' AS version,
client,
page
FROM pages,
UNNEST(technologies) AS tech
WHERE
tech.technology IS NOT NULL


UNION ALL

SELECT
'ALL' AS technology,
'ALL' AS version,
client,
page
FROM pages
),

categories AS (
SELECT
tech.technology,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.categories) AS category
GROUP BY technology

UNION ALL

SELECT
'ALL' AS technology,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.categories) AS category
),

lab_metrics AS (
SELECT
client,
page,
root_page,
SAFE.INT64(summary.bytesTotal) AS bytesTotal,
SAFE.INT64(summary.bytesJS) AS bytesJS,
SAFE.INT64(summary.bytesImg) AS bytesImg,
SAFE.FLOAT64(lighthouse.categories.accessibility.score) AS accessibility,
SAFE.FLOAT64(lighthouse.categories['best-practices'].score) AS best_practices,
SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance,
SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa,
SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo
FROM pages
),

lab_data AS (
SELECT
client,
root_page,
technology,
version,
ANY_VALUE(category) AS category,
AVG(bytesTotal) AS bytesTotal,
AVG(bytesJS) AS bytesJS,
AVG(bytesImg) AS bytesImg,
AVG(accessibility) AS accessibility,
AVG(best_practices) AS best_practices,
AVG(performance) AS performance,
AVG(pwa) AS pwa,
AVG(seo) AS seo
FROM lab_metrics
INNER JOIN technologies
USING (client, page)
INNER JOIN categories
USING (technology)
GROUP BY
client,
root_page,
technology,
version
)

SELECT
DATE('${pastMonth}') AS date,
geo,
client,
rank,
technology,
version,
COUNT(DISTINCT root_page) AS origins,

# CrUX data
COUNTIF(good_fid) AS origins_with_good_fid,
COUNTIF(good_cls) AS origins_with_good_cls,
COUNTIF(good_lcp) AS origins_with_good_lcp,
COUNTIF(good_fcp) AS origins_with_good_fcp,
COUNTIF(good_ttfb) AS origins_with_good_ttfb,
COUNTIF(good_inp) AS origins_with_good_inp,
COUNTIF(any_fid) AS origins_with_any_fid,
COUNTIF(any_cls) AS origins_with_any_cls,
COUNTIF(any_lcp) AS origins_with_any_lcp,
COUNTIF(any_fcp) AS origins_with_any_fcp,
COUNTIF(any_ttfb) AS origins_with_any_ttfb,
COUNTIF(any_inp) AS origins_with_any_inp,
COUNTIF(good_cwv) AS origins_with_good_cwv,
COUNTIF(any_lcp AND any_cls) AS origins_eligible_for_cwv,
SAFE_DIVIDE(COUNTIF(good_cwv), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,

# Lighthouse data
SAFE_CAST(APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_accessibility,
SAFE_CAST(APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_best_practices,
SAFE_CAST(APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_performance,
SAFE_CAST(APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_pwa,
SAFE_CAST(APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_seo,

# Page weight stats
SAFE_CAST(APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS INT64) AS median_bytes_total,
SAFE_CAST(APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS INT64) AS median_bytes_js,
SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image

FROM lab_data
INNER JOIN crux
USING (client, root_page)
GROUP BY
geo,
client,
rank,
technology,
version
`)
Loading