Skip to content

Commit

Permalink
Markup 2024 Chapter (#3815)
Browse files Browse the repository at this point in the history
* Add 2024 markup queries

* Update Markup 2024 queries

* Update Markup queries

* Update markup.md

* Update markdown.md

* Smart quotes

* Generate chapter

* Generate images

* Optimised images with calibre/image-actions

* Add author

* Lint SQL

* Retake hi-res images

* Update contributors file

* Final edits

---------

Co-authored-by: Barry Pollard <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Nov 9, 2024
1 parent 51b5f66 commit f084df4
Show file tree
Hide file tree
Showing 50 changed files with 1,693 additions and 12 deletions.
62 changes: 62 additions & 0 deletions sql/2024/markup/attributes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#standardSQL
# pages almanac metrics grouped by device and element attribute use (frequency)

# Turns the `attributes_used_on_elements` map of the almanac custom metric into
# an array of {name, freq} structs, one per map entry.
# Returns an empty array when the payload is not a JSON object, when the map is
# missing, or when anything throws while reading it.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>>
LANGUAGE js AS '''
  var rows = [];
  try {
    var parsed = JSON.parse(almanac_string);
    var isObject = typeof parsed == 'object' && !Array.isArray(parsed);
    if (isObject && parsed.attributes_used_on_elements) {
      var counts = parsed.attributes_used_on_elements;
      rows = Object.keys(counts).map(function (key) {
        return {name: key, freq: counts[key]};
      });
    }
  } catch (e) {
    // Best effort: a parse or property-access failure produces no rows.
    rows = [];
  }
  return rows;
''';

# Top 1000 (client, attribute name) pairs ranked by the attribute's share of
# all attribute frequencies reported for that client.
WITH totals AS (
  # Row count per client for the crawl date; denominator of pct_pages.
  SELECT
    client,
    COUNT(0) AS total
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
),

attributes AS (
  SELECT
    client,
    attr.name,
    # Pages on which the attribute name appears at least once.
    COUNT(DISTINCT page) AS pages,
    ANY_VALUE(total) AS total_pages,
    COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages,
    # Sum of the per-page freq values, and its share of the client-wide sum.
    SUM(attr.freq) AS freq,
    SUM(SUM(attr.freq)) OVER (PARTITION BY client) AS total,
    SUM(attr.freq) / SUM(SUM(attr.freq)) OVER (PARTITION BY client) AS pct_ratio
  FROM
    `httparchive.all.pages`
  JOIN
    totals
  USING
    (client),
    UNNEST(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS attr
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    attr.name
)

SELECT
  client,
  name,
  pages,
  total_pages,
  pct_pages,
  freq,
  total,
  pct_ratio
FROM
  attributes
ORDER BY
  pct_ratio DESC
LIMIT
  1000
49 changes: 49 additions & 0 deletions sql/2024/markup/buttons.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
-- Lists the button type names found under `buttons.types` of the markup custom
-- metric. Appends the marker 'NO_TYPE' when `buttons.total` exceeds the sum of
-- the per-type values. Returns an empty array if the payload cannot be parsed
-- or the expected properties are missing.
CREATE TEMPORARY FUNCTION get_markup_buttons_info(markup_string STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  try {
    var buttons = JSON.parse(markup_string).buttons;
    var typeCounts = buttons.types;
    var typeNames = Object.keys(typeCounts);

    var typedTotal = 0;
    for (var i = 0; i < typeNames.length; i++) {
      typedTotal = typedTotal + typeCounts[typeNames[i]];
    }

    var extras = buttons.total > typedTotal ? ['NO_TYPE'] : [];
    return typeNames.concat(extras);
  } catch (e) {
    return [];
  }
''';

-- Share of pages per client that use each button type (top 1000 rows).
WITH totals AS (
  -- Row count per client for the crawl date; denominator of pct_pages.
  SELECT
    client,
    COUNT(0) AS total
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

SELECT
  client,
  LOWER(TRIM(raw_button_type)) AS button_type,
  COUNT(DISTINCT page) AS page,
  ANY_VALUE(total) AS total,
  COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages
FROM
  `httparchive.all.pages`
JOIN
  totals
USING
  (client),
  -- The UNNEST alias deliberately differs from the output column name so that
  -- GROUP BY button_type can only resolve to the normalised (trimmed,
  -- lower-cased) value and never to the raw string.
  UNNEST(get_markup_buttons_info(JSON_EXTRACT(custom_metrics, '$.markup'))) AS raw_button_type
WHERE
  date = '2024-06-01'
GROUP BY
  client,
  button_type
ORDER BY
  pct_pages DESC
LIMIT
  1000
22 changes: 22 additions & 0 deletions sql/2024/markup/comments.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Per client: how many pages report at least one HTML comment and at least one
-- conditional comment, as counts and as a fraction of all rows.
WITH wpt AS (
  SELECT
    client,
    JSON_EXTRACT(custom_metrics, '$.wpt_bodies') AS wpt_bodies
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

comments AS (
  SELECT
    client,
    CAST(JSON_VALUE(wpt_bodies, '$.raw_html.comment_count') AS INT64) AS num_comments,
    CAST(JSON_VALUE(wpt_bodies, '$.raw_html.conditional_comment_count') AS INT64) AS num_conditional_comments
  FROM
    wpt
)

SELECT
  client,
  -- A NULL count (metric missing) is not counted by COUNTIF.
  COUNTIF(num_comments > 0) AS num_comments,
  COUNTIF(num_conditional_comments > 0) AS num_conditional_comments,
  COUNT(0) AS total,
  COUNTIF(num_comments > 0) / COUNT(0) AS pct_comments,
  COUNTIF(num_conditional_comments > 0) / COUNT(0) AS pct_conditional_comments
FROM
  comments
GROUP BY
  client
31 changes: 31 additions & 0 deletions sql/2024/markup/content_encoding.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- Returns the value of one response header whose name equals
-- 'content-encoding' (compared case-insensitively), or NULL when no header
-- matches. If several headers match, which one is returned is not specified
-- (LIMIT 1 without ORDER BY).
CREATE TEMPORARY FUNCTION GET_CONTENT_ENCODING(
  response_headers ARRAY<STRUCT<name STRING, value STRING>>
)
RETURNS STRING AS ((
  SELECT
    header.value
  FROM
    UNNEST(response_headers) AS header
  WHERE
    LOWER(header.name) = 'content-encoding'
  LIMIT 1
));

-- Distribution of the Content-Encoding response header across rows flagged
-- is_main_document, per client.
SELECT
  date,
  client,
  GET_CONTENT_ENCODING(response_headers) AS content_encoding,
  COUNT(0) AS freq,
  SUM(COUNT(0)) OVER (PARTITION BY date, client) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY date, client) AS pct
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01' AND
  is_main_document
GROUP BY
  -- date is in the SELECT list, so it must also be a grouping key; the window
  -- partitions include it too so totals stay per-crawl if the filter is widened.
  date,
  client,
  content_encoding
ORDER BY
  pct DESC
23 changes: 23 additions & 0 deletions sql/2024/markup/custom_elements_adoption.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Share of pages per crawl date and client that report at least one custom
-- element name in the wpt_bodies custom metric.
WITH custom_elements AS (
  SELECT
    -- date must be projected here: the outer query selects and groups by it.
    date,
    client,
    page,
    -- A missing metric yields NULL, which is treated as "no custom elements".
    COALESCE(ARRAY_LENGTH(JSON_VALUE_ARRAY(JSON_EXTRACT(custom_metrics, '$.wpt_bodies'), '$.web_components.rendered.customElements.names')) > 0, FALSE) AS has_custom_elements
  FROM
    `httparchive.all.pages`
  WHERE
    date IN ('2022-06-01', '2023-06-01', '2024-06-01')
)

SELECT
  date,
  client,
  COUNT(0) AS total,
  COUNTIF(has_custom_elements) AS freq,
  COUNTIF(has_custom_elements) / COUNT(0) AS pct_custom_elements
FROM
  custom_elements
GROUP BY
  date,
  client
ORDER BY
  date ASC
41 changes: 41 additions & 0 deletions sql/2024/markup/custom_elements_js_bytes_distribution.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- Percentiles (10/25/50/75/90) of JavaScript kilobytes per page, split by
-- client and by whether the page reports any custom element names.
WITH custom_elements AS (
  SELECT
    client,
    page,
    -- A missing metric yields NULL, which is treated as "no custom elements".
    IFNULL(ARRAY_LENGTH(JSON_VALUE_ARRAY(JSON_EXTRACT(custom_metrics, '$.wpt_bodies'), '$.web_components.rendered.customElements.names')) > 0, FALSE) AS has_custom_elements
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

js_bytes AS (
  SELECT
    client,
    page,
    -- summary.bytesJS divided by 1024.
    SAFE_CAST(JSON_EXTRACT(summary, '$.bytesJS') AS INT64) / 1024 AS kbytes_js
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  percentile,
  client,
  has_custom_elements,
  -- 1000 quantile buckets, so percentile p sits at offset p * 10.
  APPROX_QUANTILES(kbytes_js, 1000)[OFFSET(percentile * 10)] AS kbytes_js,
  COUNT(DISTINCT page) AS pages
FROM
  js_bytes
JOIN
  custom_elements
USING
  (client, page)
CROSS JOIN
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client,
  has_custom_elements
ORDER BY
  percentile,
  client,
  has_custom_elements
43 changes: 43 additions & 0 deletions sql/2024/markup/data_attribute_total.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
-- Returns the entries of the almanac metric's `attributes_used_on_elements`
-- map whose key starts with 'data-', as {name, freq} structs.
-- Returns an empty array when the payload is not a JSON object, when the map is
-- missing, or when anything throws while reading it.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>>
LANGUAGE js AS '''
  var matches = [];
  try {
    var almanac = JSON.parse(almanac_string);
    if (Array.isArray(almanac) || typeof almanac != 'object') return matches;

    var counts = almanac.attributes_used_on_elements;
    if (counts) {
      for (var entry of Object.entries(counts)) {
        if (entry[0].startsWith('data-')) {
          matches.push({name: entry[0], freq: entry[1]});
        }
      }
    }
  } catch (e) {
    return [];
  }
  return matches;
''';

-- Per client: number and share of pages that report at least one attribute
-- whose name starts with 'data-'.
WITH totals AS (
  -- Row count per client for the crawl date; denominator of pct_pages.
  SELECT
    client,
    COUNT(0) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
)

SELECT
  client,
  COUNT(DISTINCT page) AS pages,
  ANY_VALUE(total_pages) AS total_pages,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM
  `httparchive.all.pages`
JOIN
  totals
USING
  (client)
CROSS JOIN
  -- Pages for which the function returns an empty array produce no rows here,
  -- so only pages with at least one data-* attribute are counted.
  UNNEST(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS data_attr
WHERE
  date = '2024-06-01'
GROUP BY
  client
57 changes: 57 additions & 0 deletions sql/2024/markup/data_attributes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Returns the entries of the almanac metric's `attributes_used_on_elements`
# map whose key starts with 'data-', as {name, freq} structs.
# Returns an empty array when the payload is not a JSON object, when the map is
# missing, or when anything throws while reading it.
CREATE TEMPORARY FUNCTION get_almanac_attribute_info(almanac_string STRING)
RETURNS ARRAY<STRUCT<name STRING, freq INT64>>
LANGUAGE js AS '''
  try {
    var almanac = JSON.parse(almanac_string);
    if (Array.isArray(almanac) || typeof almanac != 'object') return [];

    var usage = almanac.attributes_used_on_elements;
    if (!usage) return [];

    return Object.keys(usage).reduce(function (acc, attr) {
      if (attr.startsWith('data-')) {
        acc.push({name: attr, freq: usage[attr]});
      }
      return acc;
    }, []);
  } catch (e) {
    return [];
  }
''';

# Top 1000 (client, data-* attribute name) pairs ranked by the attribute's
# share of all data-* attribute frequencies reported for that client.
WITH totals AS (
  # Row count per client for the crawl date; denominator of pct_pages.
  SELECT
    client,
    COUNT(0) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
  GROUP BY
    client
),

data_attrs AS (
  SELECT
    client,
    data_attr.name,
    # Pages on which the attribute name appears at least once.
    COUNT(DISTINCT page) AS pages,
    ANY_VALUE(total_pages) AS total_pages,
    COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages,
    # Sum of the per-page freq values, and its share of the client-wide sum.
    SUM(data_attr.freq) AS freq,
    SUM(SUM(data_attr.freq)) OVER (PARTITION BY client) AS total,
    SUM(data_attr.freq) / SUM(SUM(data_attr.freq)) OVER (PARTITION BY client) AS pct_ratio
  FROM
    `httparchive.all.pages`
  JOIN
    totals
  USING
    (client)
  CROSS JOIN
    UNNEST(get_almanac_attribute_info(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS data_attr
  WHERE
    date = '2024-06-01'
  GROUP BY
    client,
    data_attr.name
)

SELECT
  client,
  name,
  pages,
  total_pages,
  pct_pages,
  freq,
  total,
  pct_ratio
FROM
  data_attrs
ORDER BY
  pct_ratio DESC
LIMIT
  1000
19 changes: 19 additions & 0 deletions sql/2024/markup/distinct_lang.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Number of distinct values of the <html> lang attribute per client, after
-- trimming whitespace and lower-casing. Pages without a lang value are counted
-- together under the single placeholder '(not set)'.
WITH langs AS (
  SELECT
    client,
    -- JSON_VALUE returns the unquoted scalar, so TRIM really strips surrounding
    -- whitespace and a JSON null becomes SQL NULL instead of the text 'null'.
    TRIM(LOWER(JSON_VALUE(custom_metrics, '$.almanac.html_node.lang'))) AS lang
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  COUNT(DISTINCT IFNULL(lang, '(not set)')) AS distinct_lang_count
FROM
  langs
GROUP BY
  client
ORDER BY
  distinct_lang_count DESC;
22 changes: 22 additions & 0 deletions sql/2024/markup/doctype.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Returns the `doctype` field of the page summary JSON as a plain string, or
-- NULL when the field is absent, JSON null, or not a scalar.
-- JSON_VALUE is used so the result carries no JSON double quotes; callers can
-- then TRIM and compare the value directly.
CREATE TEMPORARY FUNCTION EXTRACT_DOCTYPE(summary STRING) RETURNS STRING AS (
  JSON_VALUE(summary, '$.doctype')
);

# Top 100 (client, doctype) pairs by share of that client's rows.
WITH doctypes AS (
  SELECT
    client,
    # Trim, collapse runs of spaces to a single space, then lower-case.
    LOWER(REGEXP_REPLACE(TRIM(EXTRACT_DOCTYPE(summary)), r' +', ' ')) AS doctype
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  doctype,
  COUNT(0) AS pages,
  SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_pages
FROM
  doctypes
GROUP BY
  client,
  doctype
ORDER BY
  pct_pages DESC
LIMIT
  100;
19 changes: 19 additions & 0 deletions sql/2024/markup/document_size_distribution.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Returns the `bytesHtml` field of the page summary JSON as INT64, or NULL when
-- the field is absent or is not an integer.
CREATE TEMPORARY FUNCTION EXTRACT_BYTES_HTML(summary STRING) RETURNS INT64 AS (
  SAFE_CAST(JSON_EXTRACT(summary, '$.bytesHtml') AS INT64)
);

-- Percentiles (10/25/50/75/90) of HTML document size in kilobytes, per client.
SELECT
  percentile,
  client,
  -- Bytes to kilobytes: divide by 1024.
  -- 1000 quantile buckets, so percentile p sits at offset p * 10.
  APPROX_QUANTILES(EXTRACT_BYTES_HTML(summary) / 1024, 1000)[OFFSET(percentile * 10)] AS kb_html
FROM
  `httparchive.all.pages`,
  UNNEST([10, 25, 50, 75, 90]) AS percentile
WHERE
  date = '2024-06-01'
GROUP BY
  percentile,
  client
ORDER BY
  client,
  percentile
Loading

0 comments on commit f084df4

Please sign in to comment.