-
-
Notifications
You must be signed in to change notification settings - Fork 183
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* query for intermediate table * Add top first and 3rd party * CHIPS * edits * Top registrable domains setting 3rd party cookies * websites with most cookies * add stats cookie table * Format and lint queries * Format and lint queries * Stats * more stats * Lint fix * Update stats_cookies_table.sql * 10k to 100k * edit * new final queries * Format and lint queries * Format and lint queries * adding __Secure- and __Host- prefixes * lint * Update sql/2024/cookies/prevalence_type_attributes_per_rank.sql Co-authored-by: Barry Pollard <[email protected]> * New almanac table * Update SQL queries for Cookies 2024 chapter after creation of the httparchive.almanac.cookies table * Fix linting issue * more linting issues --------- Co-authored-by: Barry Pollard <[email protected]>
- Loading branch information
1 parent
187a571
commit 7a80150
Showing
19 changed files
with
435 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
-- Extract the cookies that were set during the <DATE> crawl on <CLIENT> into
-- the `httparchive.almanac.cookies` table. Data in this table can then be
-- queried more efficiently by subsequent queries without having to re-extract
-- it every time.
|
||
|
||
-- Code used by @tunetheweb to create the table | ||
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262 | ||
|
||
-- CREATE TABLE `httparchive.almanac.cookies` | ||
-- ( | ||
-- date DATE, | ||
-- client STRING, | ||
-- page STRING, | ||
-- root_page STRING, | ||
-- rank INTEGER, | ||
-- startedDateTime STRING, | ||
-- firstPartyCookie BOOL, | ||
-- name STRING, | ||
-- domain STRING, | ||
-- path STRING, | ||
-- expires STRING, | ||
-- size STRING, | ||
-- httpOnly STRING, | ||
-- secure STRING, | ||
-- session STRING, | ||
-- sameSite STRING, | ||
-- sameParty STRING, | ||
-- partitionKey STRING, | ||
-- partitionKeyOpaque STRING | ||
-- ) | ||
-- PARTITION BY date | ||
-- CLUSTER BY | ||
-- client, rank, page | ||
-- AS | ||
-- ... | ||
|
||
|
||
-- Append the cookies observed during the 2024-06-01 crawl to
-- `httparchive.almanac.cookies`. No client filter is applied, so rows for every
-- client present in `httparchive.all.pages` on that date are inserted.
-- This is a plain INSERT: re-running it for the same date appends the same
-- rows a second time.
-- The target columns are listed explicitly so the statement does not depend on
-- the physical column order of the table.
INSERT INTO `httparchive.almanac.cookies` (
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  firstPartyCookie,
  name,
  domain,
  path,
  expires,
  size,
  httpOnly,
  secure,
  session,
  sameSite,
  sameParty,
  partitionKey,
  partitionKeyOpaque
)
WITH intermediate_cookie AS (
  -- One row per entry of the `$.cookies` array in each page's custom metrics
  SELECT
    date,
    client,
    page,
    root_page,
    rank,
    JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
    cookie
  FROM
    `httparchive.all.pages`,
    UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
  WHERE
    date = '2024-06-01'
)

SELECT
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  -- A cookie is first party when the page host is the cookie's registrable
  -- domain or one of its subdomains. Both sides are prefixed with '.' so the
  -- suffix match only succeeds on a label boundary: `badexample.com` must not
  -- match a cookie set for `example.com`. NULL still propagates when the
  -- registrable domain cannot be determined.
  ENDS_WITH(
    CONCAT('.', NET.HOST(page)),
    CONCAT('.', NET.REG_DOMAIN(JSON_VALUE(cookie, '$.domain')))
  ) AS firstPartyCookie,
  JSON_VALUE(cookie, '$.name') AS name,
  JSON_VALUE(cookie, '$.domain') AS domain,
  JSON_VALUE(cookie, '$.path') AS path,
  JSON_VALUE(cookie, '$.expires') AS expires,
  JSON_VALUE(cookie, '$.size') AS size,
  JSON_VALUE(cookie, '$.httpOnly') AS httpOnly,
  JSON_VALUE(cookie, '$.secure') AS secure,
  JSON_VALUE(cookie, '$.session') AS session,
  JSON_VALUE(cookie, '$.sameSite') AS sameSite,
  JSON_VALUE(cookie, '$.sameParty') AS sameParty,
  JSON_VALUE(cookie, '$.partitionKey') AS partitionKey,
  JSON_VALUE(cookie, '$.partitionKeyOpaque') AS partitionKeyOpaque
FROM intermediate_cookie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client
--
-- percentWebsites = websites setting the cookie / all websites present in the
-- cookies table for the SAME date, client and rank range. The denominator
-- therefore only counts websites that set at least one cookie.

WITH crawl_cookies AS (
  -- Single place for the date/client/rank filters so that the numerator and
  -- the denominator are always computed over the same population
  SELECT
    name,
    firstPartyCookie,
    partitionKey,
    NET.HOST(page) AS host
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01' AND
    client = 'desktop' AND
    rank <= 1000000 --2024 results were mainly extracted for the top 1M websites, feel free to remove this and expand in future
),

total_websites AS (
  SELECT COUNT(DISTINCT host) AS nbWebsites
  FROM crawl_cookies
)

SELECT
  name,
  COUNT(DISTINCT host) / (SELECT nbWebsites FROM total_websites) AS percentWebsites
FROM crawl_cookies
WHERE
  firstPartyCookie = TRUE AND
  partitionKey IS NOT NULL
GROUP BY name
-- name is a tie-breaker so the LIMIT is deterministic
ORDER BY percentWebsites DESC, name ASC
LIMIT 20
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
-- Extract the top 20 third party cookies (name + domain) seen across websites
-- that are partitioned (CHIPS).
-- Before running query: edit date and client
--
-- percentWebsites = websites on which the cookie is set / all websites present
-- in the cookies table for the SAME date, client and rank range. The
-- denominator therefore only counts websites that set at least one cookie.

WITH crawl_cookies AS (
  -- Single place for the date/client/rank filters so that the numerator and
  -- the denominator are always computed over the same population
  SELECT
    name,
    domain,
    firstPartyCookie,
    partitionKey,
    NET.HOST(page) AS host
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01' AND
    client = 'desktop' AND
    rank <= 1000000 --2024 results were mainly extracted for the top 1M websites, feel free to remove this and expand in future
),

total_websites AS (
  SELECT COUNT(DISTINCT host) AS nbWebsites
  FROM crawl_cookies
)

SELECT
  name,
  domain,
  COUNT(DISTINCT host) / (SELECT nbWebsites FROM total_websites) AS percentWebsites
FROM crawl_cookies
WHERE
  firstPartyCookie = FALSE AND
  partitionKey IS NOT NULL
GROUP BY name, domain
-- name and domain are tie-breakers so the LIMIT is deterministic
ORDER BY percentWebsites DESC, name ASC, domain ASC
LIMIT 20
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
sql/2024/cookies/age_expire_cookies_per_type_quantiles.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of cookie lifetimes, split by
-- first/third party.
-- Lifetime = expires - startedDateTime, converted from seconds to days and
-- rounded to the nearest day. Rows with a negative `expires` are excluded
-- (the original header describes these as session cookies).
-- Before running query: edit date and client

WITH cookie_lifetime AS (
  SELECT
    firstPartyCookie,
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / 86400) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
    -- this and expand in future
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
    AND CAST(expires AS FLOAT64) >= 0
)

SELECT
  firstPartyCookie,
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookie_lifetime
GROUP BY firstPartyCookie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of cookie lifetimes over all
-- cookies, first and third party together.
-- Lifetime = expires - startedDateTime, converted from seconds to days and
-- rounded to the nearest day. Rows with a negative `expires` are excluded
-- (the original header describes these as session cookies).
-- Before running query: edit date and client

WITH cookie_lifetime AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / 86400) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
    -- this and expand in future
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
    AND CAST(expires AS FLOAT64) >= 0
)

SELECT
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookie_lifetime
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
-- Histogram of cookie lifetimes: number of cookies per lifetime in days.
-- Lifetime = expires - startedDateTime, converted from seconds to days and
-- rounded to the nearest day. Rows with a negative `expires` are excluded
-- (the original header describes these as session cookies).
-- Before running query: edit date and client

WITH cookie_lifetime AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / 86400) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
    -- this and expand in future
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
    AND CAST(expires AS FLOAT64) >= 0
)

SELECT
  age,
  COUNT(*) AS nbCookies
FROM cookie_lifetime
GROUP BY age
ORDER BY age ASC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
-- Histogram of the number of distinct cookies per website, split by
-- first/third party: for each (firstPartyCookie, distinctNbCookies) pair, the
-- number of websites (page hosts) that set exactly that many distinct cookies.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps the concatenation unambiguous: without it, ('a', '.b.com') and
    -- ('a.', 'b.com') would collapse into the same key. ';' is the delimiter
    -- between cookie pairs, so it is not expected inside a cookie name.
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for the top 1M websites, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
  firstPartyCookie,
  distinctNbCookies,
  COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
FROM nb_cookies_per_website
GROUP BY firstPartyCookie, distinctNbCookies
ORDER BY firstPartyCookie, distinctNbCookies ASC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of the number of distinct
-- cookies set per website (page host), split by first/third party.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps the concatenation unambiguous: without it, ('a', '.b.com') and
    -- ('a.', 'b.com') would collapse into the same key. ';' is the delimiter
    -- between cookie pairs, so it is not expected inside a cookie name.
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for the top 1M websites, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
  firstPartyCookie,
  MIN(distinctNbCookies) AS min,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
  MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
GROUP BY firstPartyCookie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of the number of distinct
-- cookies set per website (page host), first and third party together.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps the concatenation unambiguous: without it, ('a', '.b.com') and
    -- ('a.', 'b.com') would collapse into the same key. ';' is the delimiter
    -- between cookie pairs, so it is not expected inside a cookie name.
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2024-06-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for the top 1M websites, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY pageFirstPartyHost
)

SELECT
  MIN(distinctNbCookies) AS min,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
  MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
-- Share of cookies carrying each attribute / name prefix, computed separately
-- for first party and third party cookies. Every ratio is
-- (cookies matching the condition) / (all cookies in the group).
-- Before running query: edit date and client

SELECT
  firstPartyCookie,
  COUNTIF(httpOnly = 'true') / COUNT(*) AS httpOnly,
  COUNTIF(secure = 'true') / COUNT(*) AS secure,
  COUNTIF(session = 'true') / COUNT(*) AS session,
  COUNTIF(sameParty = 'true') / COUNT(*) AS sameParty,
  COUNTIF(sameSite = 'Lax') / COUNT(*) AS sameSiteLax,
  COUNTIF(sameSite = 'None') / COUNT(*) AS sameSiteNone,
  COUNTIF(sameSite = 'Strict') / COUNT(*) AS sameSiteStrict,
  COUNTIF(sameSite IS NULL) / COUNT(*) AS sameSiteNull,
  COUNTIF(partitionKey IS NOT NULL) / COUNT(*) AS partitionKey,
  COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(*) AS partitionKeyOpaque,
  COUNTIF(STARTS_WITH(name, '__Host-')) / COUNT(*) AS hostPrefix,
  COUNTIF(STARTS_WITH(name, '__Secure-')) / COUNT(*) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
  date = '2024-06-01'
  AND client = 'desktop'
  -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
  -- this and expand in future
  AND rank <= 1000000
  -- just in case: drop cookies whose party could not be determined
  AND firstPartyCookie IS NOT NULL
GROUP BY
  firstPartyCookie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
-- Share of cookies that are first/third party and that carry each attribute,
-- per cumulative rank bucket. A cookie is counted in every bucket whose
-- threshold is >= the rank of its page, so each row covers "top N" pages.
-- Before running query: edit date and client

SELECT
  rank_grouping,
  COUNTIF(firstPartyCookie) / COUNT(*) AS firstParty,
  COUNTIF(NOT firstPartyCookie) / COUNT(*) AS thirdParty,
  COUNTIF(httpOnly = 'true') / COUNT(*) AS httpOnly,
  COUNTIF(secure = 'true') / COUNT(*) AS secure,
  COUNTIF(session = 'true') / COUNT(*) AS session,
  COUNTIF(sameParty = 'true') / COUNT(*) AS sameParty,
  COUNTIF(sameSite = 'Lax') / COUNT(*) AS sameSiteLax,
  COUNTIF(sameSite = 'None') / COUNT(*) AS sameSiteNone,
  COUNTIF(sameSite = 'Strict') / COUNT(*) AS sameSiteStrict,
  COUNTIF(sameSite IS NULL) / COUNT(*) AS sameSiteNull,
  COUNTIF(partitionKey IS NOT NULL) / COUNT(*) AS partitionKey,
  COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(*) AS partitionKeyOpaque
FROM `httparchive.almanac.cookies`
CROSS JOIN
  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE
  date = '2024-06-01'
  AND client = 'desktop'
  AND rank <= rank_grouping
  -- just in case: drop cookies whose party could not be determined
  AND firstPartyCookie IS NOT NULL
GROUP BY
  rank_grouping
ORDER BY
  rank_grouping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
-- Histogram of cookie sizes: number of cookies for each value of `size`
-- (cast from its string representation to an integer).
-- Before running query: edit date and client

SELECT
  CAST(size AS INT64) AS sizeCookie,
  COUNT(*) AS nbCookies
FROM `httparchive.almanac.cookies`
WHERE
  date = '2024-06-01'
  AND client = 'desktop'
  -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
  -- this and expand in future
  AND rank <= 1000000
  AND firstPartyCookie IS NOT NULL
GROUP BY sizeCookie
ORDER BY sizeCookie ASC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of cookie sizes, split by
-- first/third party. `size` is stored as a string and cast to an integer.
-- Before running query: edit date and client

SELECT
  firstPartyCookie,
  MIN(CAST(size AS INT64)) AS min,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(99)] AS p99,
  MAX(CAST(size AS INT64)) AS max
FROM `httparchive.almanac.cookies`
WHERE
  date = '2024-06-01'
  AND client = 'desktop'
  -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
  -- this and expand in future
  AND rank <= 1000000
  AND firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
-- Distribution (min, quartiles, p90, p99, max) of cookie sizes over all
-- cookies, first and third party together. `size` is stored as a string and
-- cast to an integer.
-- Before running query: edit date and client

SELECT
  MIN(CAST(size AS INT64)) AS min,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(CAST(size AS INT64), 100)[OFFSET(99)] AS p99,
  MAX(CAST(size AS INT64)) AS max
FROM `httparchive.almanac.cookies`
WHERE
  date = '2024-06-01'
  AND client = 'desktop'
  -- 2024 results were mainly extracted for rank <= 1M, feel free to remove
  -- this and expand in future
  AND rank <= 1000000
  AND firstPartyCookie IS NOT NULL
Oops, something went wrong.