Skip to content

Commit

Permalink
SEO 2024 queries (#3791)
Browse files Browse the repository at this point in the history
* Uploading Image loading property usage SQL

* Added SQL files for SEO analysis in 2024

* Linting

* Update sql/2024/seo/robots-text-size-2024.sql

Co-authored-by: Barry Pollard <[email protected]>

* Update sql/2024/seo/image-loading-property-usage-2024.sql

Co-authored-by: Barry Pollard <[email protected]>

* Update sql/2024/seo/lighthouse-seo-stats-2024.sql

Co-authored-by: Barry Pollard <[email protected]>

* Linting

---------

Co-authored-by: Barry Pollard <[email protected]>
  • Loading branch information
henryp25 and tunetheweb authored Nov 10, 2024
1 parent 7a80150 commit 083de67
Show file tree
Hide file tree
Showing 32 changed files with 2,490 additions and 0 deletions.
66 changes: 66 additions & 0 deletions sql/2024/seo/anchor-rel-attribute-usage-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#standardSQL
# Anchor rel attribute usage
# This query reports if a rel attribute value was ever used on a page, and calculates various statistics.

CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
rel ARRAY<STRING>
> LANGUAGE js AS '''
var result = {rel: []};
// Function to retrieve only keys if value is >0
function getKey(dict){
const arr = [],
obj = Object.keys(dict);
for (var x in obj){
if(dict[obj[x]] > 0){
arr.push(obj[x]);
}
}
return arr;
}
try {
var wpt_bodies = JSON.parse(wpt_bodies_string);
if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
if (wpt_bodies.anchors && wpt_bodies.anchors.rendered && wpt_bodies.anchors.rendered.rel_attributes) {
result.rel = getKey(wpt_bodies.anchors.rendered.rel_attributes);
}
} catch (e) {}
return result;
''';

WITH rel_stats_table AS (
SELECT
client,
root_page,
page,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END
AS is_root_page,
getRelStatsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
)

SELECT
client,
is_root_page,
rel,
COUNT(DISTINCT page) AS sites,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
rel_stats_table,
UNNEST(wpt_bodies_info.rel) AS rel
GROUP BY
client,
is_root_page,
rel
ORDER BY
sites DESC,
rel,
client DESC;
74 changes: 74 additions & 0 deletions sql/2024/seo/anchor-same-site-occurance-stats-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#standardSQL
# Anchor same site occurrence stats
# This query aims to highlight sites with few same-site links, like SPAs.

CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
links_same_site INT64,
links_window_location INT64,
links_window_open INT64,
links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
links_same_site: 0,
links_window_location: 0,
links_window_open: 0,
links_href_javascript: 0
};
try {
var wpt_bodies = JSON.parse(wpt_bodies_string);
if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
var anchors_rendered = wpt_bodies.anchors.rendered;
result.links_same_site = anchors_rendered.same_site || 0;
result.links_window_location = anchors_rendered.same_page.dynamic.onclick_attributes.window_location || 0;
result.links_window_open = anchors_rendered.same_page.dynamic.onclick_attributes.window_open || 0;
result.links_href_javascript = anchors_rendered.same_page.dynamic.href_javascript || 0;
}
} catch (e) {}
return result;
''';

WITH same_links_info AS (
SELECT
client,
root_page,
page,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END
AS is_root_page,
getLinkDesciptionsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
)

SELECT
client,
wpt_bodies_info.links_same_site AS links_same_site,
is_root_page,
COUNT(DISTINCT page) AS sites, -- Counting all occurrences of links_same_site
SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site, -- Percentage of same-site links
AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location,
AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open,
AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript,
AVG(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS avg_links_any,
MAX(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS max_links_any,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct -- Secondary page percentage within group
FROM
same_links_info
GROUP BY
client,
is_root_page,
wpt_bodies_info.links_same_site
ORDER BY
links_same_site ASC;
52 changes: 52 additions & 0 deletions sql/2024/seo/content-language-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_string STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var result = [];
try {
var almanac = JSON.parse(almanac_string);
if (Array.isArray(almanac) || typeof almanac != 'object') return ["NO PAYLOAD"];
if (almanac && almanac["meta-nodes"] && almanac["meta-nodes"].nodes && almanac["meta-nodes"].nodes.filter) {
result = almanac["meta-nodes"].nodes.filter(n => n["http-equiv"] && n["http-equiv"].toLowerCase().trim() == 'content-language' && n.content).map(am => am.content.toLowerCase().trim());
}
if (result.length === 0)
result.push("NO TAG");
} catch (e) {result.push("ERROR "+e);} // results show some issues with the validity of the payload
return result;
''';
WITH content_language_usage AS (
SELECT
client,
root_page,
page,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END AS is_root_page,
getContentLanguagesAlmanac(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS content_languages
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
)
SELECT
client,
is_root_page,
content_language,
COUNT(DISTINCT page) AS sites,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
content_language_usage,
UNNEST(content_languages) AS content_language
GROUP BY
client,
is_root_page,
content_language
ORDER BY
sites DESC,
client DESC;
39 changes: 39 additions & 0 deletions sql/2024/seo/core-web-vitals-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
CREATE TEMP FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
good / (good + needs_improvement + poor) >= 0.75
);
CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
SELECT
date,
device,
SAFE_DIVIDE(
COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_good_lcp,
# Origins with good FID divided by origins with any FID.
SAFE_DIVIDE(
COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)),
COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))) AS pct_good_fid,
# Origins with good CLS divided by origins with any CLS.
SAFE_DIVIDE(
COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_good_cls,
# Origins with good LCP, FID, and CLS dividied by origins with any LCP, FID, and CLS.
SAFE_DIVIDE(
COUNT(DISTINCT IF(
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND
IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
COUNT(DISTINCT IF(
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_good_cwv
FROM
`chrome-ux-report.materialized.device_summary`
WHERE
date BETWEEN '2019-11-01' AND '2024-06-01' AND
device IN ('desktop', 'phone')
GROUP BY
date,
device
ORDER BY
date DESC
59 changes: 59 additions & 0 deletions sql/2024/seo/hreflang-header-usage-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#standardSQL
# hreflang header usage

# Returns all the data we need from _wpt_bodies
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var result = {
hreflangs: []
};
try {
var wpt_bodies = JSON.parse(wpt_bodies_string);
if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.http_header && wpt_bodies.hreflangs.http_header.values) {
result.hreflangs = wpt_bodies.hreflangs.http_header.values.map(v => v); // seems to fix a coercion issue!
}
} catch (e) {}
return result;
''';

WITH hreflang_usage AS (
SELECT
client,
root_page,
page,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END AS is_root_page,
getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'

)
SELECT
client,
is_root_page,
NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
COUNT(DISTINCT page) AS sites,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
hreflang_usage,
UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
GROUP BY
hreflang,
client,
is_root_page
ORDER BY
sites DESC,
client DESC;
57 changes: 57 additions & 0 deletions sql/2024/seo/hreflang-link-tag-usage-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#standardSQL
# hreflang link tag usage

# Returns all the data we need from _wpt_bodies
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var result = {
hreflangs: []
};
try {
var wpt_bodies = JSON.parse(wpt_bodies_string);
if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.rendered && wpt_bodies.hreflangs.rendered.values) {
result.hreflangs = wpt_bodies.hreflangs.rendered.values.map(v => v); // seems to fix a coercion issue!
}
} catch (e) {}
return result;
''';

WITH link_tag AS (
SELECT
client,
root_page,
page,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END AS is_root_page,
getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
)
SELECT
client,
is_root_page,
NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
COUNT(DISTINCT page) AS sites,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
link_tag,
UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
GROUP BY
hreflang,
is_root_page,
client
ORDER BY
client DESC;
35 changes: 35 additions & 0 deletions sql/2024/seo/html-response-content-language-2024.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
WITH subquery AS (
SELECT
client,
page,
request_headers,
CASE
WHEN is_root_page = FALSE THEN 'Secondarypage'
WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END AS is_root_page
FROM
`httparchive.all.requests`
WHERE
date = '2024-06-01'
)

SELECT
client,
is_root_page,
header.name AS request_header_name,
header.value AS request_header_value,
COUNT(DISTINCT page) AS sites,
SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER ()) AS pct
FROM
subquery,
UNNEST(request_headers) AS header
GROUP BY
client,
is_root_page,
header.name,
header.value
ORDER BY
sites DESC,
client;
Loading

0 comments on commit 083de67

Please sign in to comment.