Skip to content

Commit

Permalink
Cookies sql 2024 (#3741)
Browse files Browse the repository at this point in the history
* query for intermediate table

* Add top first and 3rd party

* CHIPS

* edits

* Top registrable domains setting 3rd party cookies

* websites with most cookies

* add stats cookie table

* Format and lint queries

* Format and lint queries

* Stats

* more stats

* Lint fix

* Update stats_cookies_table.sql

* 10k to 100k

* edit

* new final queries

* Format and lint queries

* Format and lint queries

* adding __Secure- and __Host- prefixes

* lint

* Update sql/2024/cookies/prevalence_type_attributes_per_rank.sql

Co-authored-by: Barry Pollard <[email protected]>

* New almanac table

* Update SQL queries for Cookies 2024 chapter after creation of the httparchive.almanac.cookies table

* Fix linting issue

* more linting issues

---------

Co-authored-by: Barry Pollard <[email protected]>
  • Loading branch information
yohhaan and tunetheweb authored Nov 10, 2024
1 parent 187a571 commit 7a80150
Show file tree
Hide file tree
Showing 19 changed files with 435 additions and 0 deletions.
76 changes: 76 additions & 0 deletions sql/2024/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
-- Extract to the `httparchive.almanac.cookies `table the cookies that were set
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
-- more efficiently in consecutive queries without having to reextract it every
-- time


-- Code used by @tunetheweb to create the table
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262

-- CREATE TABLE `httparchive.almanac.cookies`
-- (
-- date DATE,
-- client STRING,
-- page STRING,
-- root_page STRING,
-- rank INTEGER,
-- startedDateTime STRING,
-- firstPartyCookie BOOL,
-- name STRING,
-- domain STRING,
-- path STRING,
-- expires STRING,
-- size STRING,
-- httpOnly STRING,
-- secure STRING,
-- session STRING,
-- sameSite STRING,
-- sameParty STRING,
-- partitionKey STRING,
-- partitionKeyOpaque STRING
-- )
-- PARTITION BY date
-- CLUSTER BY
-- client, rank, page
-- AS
-- ...


INSERT INTO `httparchive.almanac.cookies`
WITH intermediate_cookie AS (
SELECT
date,
client,
page,
root_page,
rank,
JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
cookie
FROM
`httparchive.all.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
WHERE
date = '2024-06-01'
)

SELECT
date,
client,
page,
root_page,
rank,
startedDateTime,
ENDS_WITH(NET.HOST(page), NET.REG_DOMAIN(JSON_VALUE(cookie, '$.domain'))) AS firstPartyCookie,
JSON_VALUE(cookie, '$.name') AS name,
JSON_VALUE(cookie, '$.domain') AS domain,
JSON_VALUE(cookie, '$.path') AS path,
JSON_VALUE(cookie, '$.expires') AS expires,
JSON_VALUE(cookie, '$.size') AS size,
JSON_VALUE(cookie, '$.httpOnly') AS httpOnly,
JSON_VALUE(cookie, '$.secure') AS secure,
JSON_VALUE(cookie, '$.session') AS session,
JSON_VALUE(cookie, '$.sameSite') AS sameSite,
JSON_VALUE(cookie, '$.sameParty') AS sameParty,
JSON_VALUE(cookie, '$.partitionKey') AS partitionKey,
JSON_VALUE(cookie, '$.partitionKeyOpaque') AS partitionKeyOpaque
FROM intermediate_cookie
18 changes: 18 additions & 0 deletions sql/2024/cookies/CHIPS_top_20_first_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client

SELECT
name,
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie = TRUE AND
partitionKey IS NOT NULL
GROUP BY name
ORDER BY percentWebsites DESC
LIMIT 20
19 changes: 19 additions & 0 deletions sql/2024/cookies/CHIPS_top_20_third_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client

SELECT
name,
domain,
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie = FALSE AND
partitionKey IS NOT NULL
GROUP BY name, domain
ORDER BY percentWebsites DESC
LIMIT 20
6 changes: 6 additions & 0 deletions sql/2024/cookies/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,9 @@
[~google-doc]: https://docs.google.com/document/d/1o2AgdsDq_x3OvthZF7Kb50rUKMVLn7UANT9Stz7ku2I/edit#heading=h.ymg495uvm3yx
[~google-sheets]: https://docs.google.com/spreadsheets/d/1wDGnUkO0rgcU5_V6hmUrhm1pq60VU2XbeMHgYJEEaSM/edit#gid=454016814
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cookies.md

## Note about SQL queries

First execute [`0_create_cookies.sql`](0_create_cookies.sql) to export the
results of the <DATE> crawl specified into the `httparchive.almanac.cookies`
table that will then be used in other SQL queries.
28 changes: 28 additions & 0 deletions sql/2024/cookies/age_expire_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
firstPartyCookie,
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
firstPartyCookie,
MIN(age) AS min,
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
MAX(age) AS max
FROM cookies_age
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2024/cookies/age_expire_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
MIN(age) AS min,
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
MAX(age) AS max
FROM cookies_age
22 changes: 22 additions & 0 deletions sql/2024/cookies/age_expires_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Extract the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
age,
COUNT(0) AS nbCookies
FROM cookies_age
GROUP BY age
ORDER BY age ASC
24 changes: 24 additions & 0 deletions sql/2024/cookies/nb_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Extract the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
firstPartyCookie,
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
firstPartyCookie,
distinctNbCookies,
COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
FROM nb_cookies_per_website
GROUP BY firstPartyCookie, distinctNbCookies
ORDER BY firstPartyCookie, distinctNbCookies ASC
28 changes: 28 additions & 0 deletions sql/2024/cookies/nb_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
firstPartyCookie,
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
firstPartyCookie,
MIN(distinctNbCookies) AS min,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2024/cookies/nb_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY pageFirstPartyHost
)

SELECT
MIN(distinctNbCookies) AS min,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
25 changes: 25 additions & 0 deletions sql/2024/cookies/prevalence_attributes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Prevalence of cookies type and attributes per type of cookie (1st/3rd party)
-- Before running query: edit date and client

SELECT
firstPartyCookie,
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque,
SUM(IF(STARTS_WITH(name, '__Host-'), 1, 0)) / COUNT(0) AS hostPrefix,
SUM(IF(STARTS_WITH(name, '__Secure-'), 1, 0)) / COUNT(0) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL -- just in case
GROUP BY
firstPartyCookie
28 changes: 28 additions & 0 deletions sql/2024/cookies/prevalence_type_attributes_per_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Prevalence of cookies type and attributes per rank grouping
-- Before running query: edit date and client

SELECT
rank_grouping,
SUM(IF(firstPartyCookie = TRUE, 1, 0)) / COUNT(0) AS firstParty,
SUM(IF(firstPartyCookie = FALSE, 1, 0)) / COUNT(0) AS thirdParty,
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque
FROM `httparchive.almanac.cookies`,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE
rank <= rank_grouping AND
date = '2024-06-01' AND
client = 'desktop' AND
firstPartyCookie IS NOT NULL -- just in case
GROUP BY
rank_grouping
ORDER BY
rank_grouping
14 changes: 14 additions & 0 deletions sql/2024/cookies/size_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Extract the size of the cookies
-- Before running query: edit date and client

SELECT
CAST(size AS INT) AS sizeCookie,
COUNT(0) AS nbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY sizeCookie
ORDER BY sizeCookie ASC
19 changes: 19 additions & 0 deletions sql/2024/cookies/size_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Extract stats around the size of the cookies
-- Before running query: edit date and client

SELECT
firstPartyCookie,
MIN(CAST(size AS INT)) AS min,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
MAX(CAST(size AS INT)) AS max
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie
17 changes: 17 additions & 0 deletions sql/2024/cookies/size_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Extract stats around the size of the cookies
-- Before running query: edit date and client

SELECT
MIN(CAST(size AS INT)) AS min,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
MAX(CAST(size AS INT)) AS max
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
Loading

0 comments on commit 7a80150

Please sign in to comment.