From b90332f3b873203d2bcad819b9d6363a04d2b554 Mon Sep 17 00:00:00 2001
From: Ben Standaert <71239179+bstandaert-wustl@users.noreply.github.com>
Date: Thu, 15 Aug 2024 04:13:04 -0500
Subject: [PATCH] Privacy 2024 queries - CCPA, fingerprinting, cookies (#3720)

* CCPA metrics

* fingerprinting metrics

* cookie metrics

* lint
---
Note: the blob ids on the "index" lines below predate the reformatting of the
queries in this revision of the patch. Regenerate the patch with
`git format-patch` after applying if exact ids are required.

 sql/2024/privacy/ccpa_most_common_phrases.sql        | 31 +++++++++++
 sql/2024/privacy/ccpa_prevalence.sql                 | 30 ++++++++++
 sql/2024/privacy/cookies_top_first_party.sql         | 36 ++++++++++++
 sql/2024/privacy/cookies_top_third_party.sql         | 37 +++++++++++++
 sql/2024/privacy/fingerprinting_most_common_apis.sql | 32 +++++++++++
 .../privacy/fingerprinting_most_common_scripts.sql   | 13 +++++
 sql/2024/privacy/fingerprinting_script_count.sql     | 11 ++++
 7 files changed, 190 insertions(+)
 create mode 100644 sql/2024/privacy/ccpa_most_common_phrases.sql
 create mode 100644 sql/2024/privacy/ccpa_prevalence.sql
 create mode 100644 sql/2024/privacy/cookies_top_first_party.sql
 create mode 100644 sql/2024/privacy/cookies_top_third_party.sql
 create mode 100644 sql/2024/privacy/fingerprinting_most_common_apis.sql
 create mode 100644 sql/2024/privacy/fingerprinting_most_common_scripts.sql
 create mode 100644 sql/2024/privacy/fingerprinting_script_count.sql

diff --git a/sql/2024/privacy/ccpa_most_common_phrases.sql b/sql/2024/privacy/ccpa_most_common_phrases.sql
new file mode 100644
index 00000000000..64185711975
--- /dev/null
+++ b/sql/2024/privacy/ccpa_most_common_phrases.sql
@@ -0,0 +1,31 @@
+-- Most common CCPA link phrases, by number of root pages on which they appear.
+-- Rank groupings are cumulative: a page counts toward every grouping whose threshold is at or above its rank.
+WITH pages AS (
+  SELECT
+    client,
+    rank_grouping,
+    page,
+    JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases
+  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.01 PERCENT)
+  CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page = TRUE AND
+    rank <= rank_grouping
+)
+
+SELECT
+  client,
+  rank_grouping,
+  link_phrase,
+  COUNT(DISTINCT page) AS num_pages
+FROM pages
+CROSS JOIN UNNEST(ccpa_link_phrases) AS link_phrase
+GROUP BY
+  link_phrase,
+  rank_grouping,
+  client
+ORDER BY
+  rank_grouping,
+  client,
+  num_pages DESC
diff --git a/sql/2024/privacy/ccpa_prevalence.sql b/sql/2024/privacy/ccpa_prevalence.sql
new file mode 100644
index 00000000000..29453778b97
--- /dev/null
+++ b/sql/2024/privacy/ccpa_prevalence.sql
@@ -0,0 +1,30 @@
+-- Number of root pages for each value of the hasCCPALink custom metric.
+-- Rank groupings are cumulative: a page counts toward every grouping whose threshold is at or above its rank.
+WITH pages AS (
+  SELECT
+    client,
+    rank_grouping,
+    page,
+    JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link
+  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.0025 PERCENT)
+  CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page = TRUE AND
+    rank <= rank_grouping
+)
+
+SELECT
+  client,
+  rank_grouping,
+  has_ccpa_link,
+  COUNT(DISTINCT page) AS num_pages
+FROM pages
+GROUP BY
+  has_ccpa_link,
+  rank_grouping,
+  client
+ORDER BY
+  rank_grouping,
+  client,
+  has_ccpa_link
diff --git a/sql/2024/privacy/cookies_top_first_party.sql b/sql/2024/privacy/cookies_top_first_party.sql
new file mode 100644
index 00000000000..0f167d02904
--- /dev/null
+++ b/sql/2024/privacy/cookies_top_first_party.sql
@@ -0,0 +1,36 @@
+-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites.
+
+WITH pages AS (
+  SELECT
+    client,
+    root_page,
+    custom_metrics
+  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
+  WHERE date = '2024-06-01'
+),
+
+cookies AS (
+  SELECT
+    client,
+    cookie,
+    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
+    NET.HOST(root_page) AS firstparty_host
+  FROM pages
+  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
+)
+
+SELECT
+  client,
+  COUNT(DISTINCT firstparty_host) AS domain_count,
+  JSON_VALUE(cookie, '$.name') AS cookie_name
+FROM cookies
+-- A cookie is treated as first-party when the page host ends with the cookie host.
+-- ENDS_WITH compares literally; LIKE would treat `_` and `%` in the cookie host as wildcards.
+WHERE ENDS_WITH(firstparty_host, cookie_host)
+GROUP BY
+  client,
+  cookie_name
+ORDER BY
+  domain_count DESC,
+  client DESC
+LIMIT 500
diff --git a/sql/2024/privacy/cookies_top_third_party.sql b/sql/2024/privacy/cookies_top_third_party.sql
new file mode 100644
index 00000000000..cddd1bb54c5
--- /dev/null
+++ b/sql/2024/privacy/cookies_top_third_party.sql
@@ -0,0 +1,37 @@
+-- Most common third-party cookie hosts, by number of pages on which they appear.
+WITH pages AS (
+  SELECT
+    page,
+    client,
+    root_page,
+    custom_metrics
+  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
+  WHERE date = '2024-06-01'
+),
+
+cookies AS (
+  SELECT
+    client,
+    page,
+    cookie,
+    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
+    NET.HOST(root_page) AS firstparty_host
+  FROM pages
+  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
+)
+
+SELECT
+  client,
+  cookie_host,
+  COUNT(DISTINCT page) AS page_count
+FROM cookies
+-- A cookie is treated as third-party when the page host does not end with the cookie host.
+-- ENDS_WITH compares literally; LIKE would treat `_` and `%` in the cookie host as wildcards.
+WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
+GROUP BY
+  client,
+  cookie_host
+ORDER BY
+  page_count DESC,
+  client
+LIMIT 500
diff --git a/sql/2024/privacy/fingerprinting_most_common_apis.sql b/sql/2024/privacy/fingerprinting_most_common_apis.sql
new file mode 100644
index 00000000000..e34d5bbe2a2
--- /dev/null
+++ b/sql/2024/privacy/fingerprinting_most_common_apis.sql
@@ -0,0 +1,32 @@
+-- Number of pages on which each fingerprinting type (a key of the privacy.fingerprinting.counts custom metric) appears.
+CREATE TEMP FUNCTION getFingerprintingTypes(input STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+// Returns the keys of privacy.fingerprinting.counts, or an empty array when
+// the payload is null, is not valid JSON, or lacks that object.
+try {
+  return Object.keys(JSON.parse(input).privacy?.fingerprinting?.counts || {});
+} catch (e) {
+  return [];
+}
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    fingerprinting_type
+  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.001 PERCENT)
+  CROSS JOIN UNNEST(getFingerprintingTypes(custom_metrics)) AS fingerprinting_type
+  WHERE date = '2024-06-01'
+)
+
+SELECT
+  client,
+  fingerprinting_type,
+  COUNT(DISTINCT page) AS page_count
+FROM pages
+GROUP BY
+  client,
+  fingerprinting_type
+ORDER BY page_count DESC
diff --git a/sql/2024/privacy/fingerprinting_most_common_scripts.sql b/sql/2024/privacy/fingerprinting_most_common_scripts.sql
new file mode 100644
index 00000000000..04925f4d82a
--- /dev/null
+++ b/sql/2024/privacy/fingerprinting_most_common_scripts.sql
@@ -0,0 +1,13 @@
+-- Most common entries of the likelyFingerprintingScripts custom metric, by number of pages on which they appear.
+SELECT
+  client,
+  script,
+  COUNT(DISTINCT page) AS page_count
+FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.001 PERCENT)
+CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script
+WHERE date = '2024-06-01'
+GROUP BY
+  client,
+  script
+ORDER BY page_count DESC
+LIMIT 100;
diff --git a/sql/2024/privacy/fingerprinting_script_count.sql b/sql/2024/privacy/fingerprinting_script_count.sql
new file mode 100644
index 00000000000..47bde476208
--- /dev/null
+++ b/sql/2024/privacy/fingerprinting_script_count.sql
@@ -0,0 +1,11 @@
+-- Distribution of pages by the length of the likelyFingerprintingScripts custom metric array, per client.
+SELECT
+  client,
+  ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count,
+  COUNT(DISTINCT page) AS page_count
+FROM `httparchive.all.pages`
+WHERE date = '2024-06-01'
+GROUP BY
+  script_count,
+  client
+ORDER BY script_count ASC;