Skip to content

Commit

Permalink
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org…
Browse files Browse the repository at this point in the history
… into production
  • Loading branch information
tunetheweb committed Dec 29, 2024
2 parents eef92d1 + e4deb9b commit 58026bf
Show file tree
Hide file tree
Showing 57 changed files with 1,344 additions and 465 deletions.
2 changes: 1 addition & 1 deletion sql/2020/security/iframe_attributes_usage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# usage of allow and sandbox attribute of iframe elements, per page and over all iframe elements
SELECT
client,
COUNT(0) AS total_iframes,
COUNT(0) AS total_iframes, # Note: These are not the total number of iframes but only the number of iframes with allow/sandbox + 1 for each website without such iframes
COUNTIF(allow IS NOT NULL) AS freq_allow,
COUNTIF(allow IS NOT NULL) / COUNT(0) AS pct_allow_frames,
COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
Expand Down
2 changes: 1 addition & 1 deletion sql/2021/security/iframe_attributes_usage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# usage of allow and sandbox attribute of iframe elements, per page and over all iframe elements
SELECT
client,
COUNT(0) AS total_iframes,
COUNT(0) AS total_iframes, # Note: These are not the total number of iframes but only the number of iframes with allow/sandbox + 1 for each website without such iframes
COUNTIF(allow IS NOT NULL) AS freq_allow,
COUNTIF(allow IS NOT NULL) / COUNT(0) AS pct_allow_frames,
COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
Expand Down
2 changes: 1 addition & 1 deletion sql/2022/security/iframe_attributes_usage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# usage of allow and sandbox attribute of iframe elements, per page and over all iframe elements
SELECT
client,
COUNT(0) AS total_iframes,
COUNT(0) AS total_iframes, # Note: These are not the total number of iframes but only the number of iframes with allow/sandbox + 1 for each website without such iframes
COUNTIF(allow IS NOT NULL) AS freq_allow,
COUNTIF(allow IS NOT NULL) / COUNT(0) AS pct_allow_frames,
COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
Expand Down
20 changes: 0 additions & 20 deletions sql/2024/css/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion sql/2024/security/coep_header_prevalence.sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#standardSQL
# Section: Attack Preventions - Preventing attacks using Cross-Origin policies
# Question: Which are the most common COEP values?
# Note: Considers headers of main document responses
# Note: Considers headers of main document responses only
SELECT
client,
coep_header,
Expand Down
2 changes: 1 addition & 1 deletion sql/2024/security/coop_header_prevalence.sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#standardSQL
# Section: Attack Preventions - Preventing attacks using Cross-Origin policies
# Question: Which are the most common COOP values?
# Note: Considers headers of main document responses
# Note: Considers headers of main document responses only
SELECT
client,
coop_header,
Expand Down
5 changes: 3 additions & 2 deletions sql/2024/security/csp_number_of_allowed_hosts.sql
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#standardSQL
# Section: Attack Preventions - Preventing attacks using CSP
# Question: CSP on home pages: number of unique headers, header length and number of allowed HTTP(S) hosts in all directives
# Note: For CSP we checked whether the header value is NULL (possibly meaning empty); 99.65% of CSP headers are not NULL on desktop. TODO: confirm whether the same check should be applied to other headers.
# Counts the distinct http(s):// source expressions found in a CSP header value.
# A source ends at whitespace, a semicolon or the end of the string, so the last
# source of a header without a trailing delimiter is counted as well.
# Matching is case-insensitive, but the extracted values are compared as-is, so
# sources that differ only in letter case are counted separately.
CREATE TEMP FUNCTION getNumUniqueHosts(str STRING) AS (
  (
    SELECT
      COUNT(DISTINCT x)
    FROM
      # `https?` accepts exactly http:// or https://; `[^\s;]+` is greedy and stops
      # at the first delimiter or at the end of the input.
      UNNEST(REGEXP_EXTRACT_ALL(str, r'(?i)(https?://[^\s;]+)')) AS x
  )
);

SELECT
client,
percentile,
COUNT(0) AS total_requests,
COUNTIF(csp_header IS NOT NULL) AS total_csp_headers,
COUNT(0) AS total_csp_headers,
COUNTIF(csp_header IS NOT NULL) AS total_non_null_csp_headers,
COUNTIF(csp_header IS NOT NULL) / COUNT(0) AS pct_csp_headers,
COUNT(DISTINCT csp_header) AS num_unique_csp_headers,
APPROX_QUANTILES(LENGTH(csp_header), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS csp_header_length,
Expand Down
4 changes: 2 additions & 2 deletions sql/2024/security/csp_script_source_list_keywords.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Question: usage of default/script-src, and within the directive usage of strict-dynamic, nonce values, unsafe-inline and unsafe-eval
SELECT
client,
total_pages,
total_pages_with_csp,
freq_csp,
freq_default_script_src,
SAFE_DIVIDE(freq_default_script_src, freq_csp) AS pct_default_script_src_over_csp,
Expand All @@ -22,7 +22,7 @@ SELECT
FROM (
SELECT
client,
COUNT(0) AS total_pages,
COUNT(0) AS total_pages_with_csp,
COUNTIF(csp_header IS NOT NULL) AS freq_csp,
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src')) AS freq_default_script_src,
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+strict-dynamic')) AS freq_strict_dynamic,
Expand Down
8 changes: 4 additions & 4 deletions sql/2024/security/iframe_attribute_popular_hosts.sql
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ SELECT
client,
policy_type,
hostname,
total_iframes,
total_iframes_with_allow_or_sandbox,
COUNTIF(has_policy) AS freq,
COUNTIF(has_policy) / total_iframes AS pct
COUNTIF(has_policy) / total_iframes_with_allow_or_sandbox AS pct
FROM (
SELECT
client,
Expand All @@ -37,7 +37,7 @@ FROM (
JOIN (
SELECT
client,
SUM(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox'))) AS total_iframes
SUM(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox'))) AS total_iframes_with_allow_or_sandbox
FROM
`httparchive.all.pages`
WHERE
Expand All @@ -49,7 +49,7 @@ USING
(client)
GROUP BY
client,
total_iframes,
total_iframes_with_allow_or_sandbox,
policy_type,
hostname
HAVING
Expand Down
26 changes: 20 additions & 6 deletions sql/2024/security/iframe_attributes_usage.sql
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
#standardSQL
# Section: Content Inclusion - Iframe Sandbox/Permissions Policy
# Question: How often are the allow and sandbox attributes used on iframes? Both per page and over all iframe elements
# Question: How often are the allow and sandbox attributes used on iframes? Both per page (used in at least one iframe on a page) and over all iframe elements
# Sums the num_iframes custom metric over all root pages, per client and crawl date.
WITH total_iframe_count AS (
  SELECT
    client,
    date,
    SUM(SAFE_CAST(JSON_EXTRACT(custom_metrics, '$.num_iframes') AS INT64)) AS total_iframes
  FROM
    `httparchive.all.pages`
  WHERE
    is_root_page AND
    date IN ('2022-06-01', '2023-06-01', '2023-12-01', '2024-03-01', '2024-04-01', '2024-05-01', '2024-06-01')
  GROUP BY
    client,
    date
)

SELECT
client,
date,
COUNT(0) AS total_iframes,
total_iframes,
COUNTIF(allow IS NOT NULL) AS freq_allow,
COUNTIF(allow IS NOT NULL) / COUNT(0) AS pct_allow_frames,
COUNTIF(allow IS NOT NULL) / total_iframes AS pct_allow_frames,
COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
COUNTIF(sandbox IS NOT NULL) / COUNT(0) AS pct_sandbox_frames,
COUNTIF(sandbox IS NOT NULL) / total_iframes AS pct_sandbox_frames,
COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) AS freq_both_frames,
COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) / COUNT(0) AS pct_both_frames,
COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) / total_iframes AS pct_both_frames,
COUNT(DISTINCT url) AS total_urls,
COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)) AS allow_freq_urls,
COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)) / COUNT(DISTINCT url) AS allow_pct_urls,
Expand All @@ -36,8 +49,9 @@ FROM (
is_root_page
)
LEFT JOIN UNNEST(iframeAttrs) AS iframeAttr
)
) JOIN total_iframe_count USING (client, date)
GROUP BY
total_iframes,
client,
date
ORDER BY
Expand Down
58 changes: 58 additions & 0 deletions sql/2024/security/iframe_attributes_usage_fix.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#standardSQL
# Section: Content Inclusion - Iframe Sandbox/Permissions Policy
# Question: How often are the allow and sandbox attributes used on iframes? Both per page (used in at least one iframe on a page) and over all iframe elements
# Note: Ratios use SAFE_DIVIDE so that a zero denominator yields NULL instead of aborting the query with a division-by-zero error.

# Total number of iframes per client and crawl date, summed from the num_iframes custom metric.
WITH total_iframe_count AS (
  SELECT
    client,
    date,
    SUM(SAFE.INT64(custom_metrics.other.num_iframes)) AS total_iframes
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN ('2020-08-01', '2021-07-01', '2022-06-01') AND
    is_root_page
  GROUP BY
    client,
    date
),

# Root pages of the selected crawls together with their iframe-allow-sandbox metric array.
root_pages AS (
  SELECT
    client,
    date,
    page AS url,
    JSON_EXTRACT_ARRAY(custom_metrics.security.`iframe-allow-sandbox`) AS iframeAttrs
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN ('2020-08-01', '2021-07-01', '2022-06-01') AND
    is_root_page
),

# One row per entry of the metric array. The LEFT JOIN keeps pages that produce no
# array entries (with NULL allow/sandbox) so they still count towards total_urls.
iframe_attributes AS (
  SELECT
    client,
    date,
    url,
    SAFE.STRING(iframeAttr.allow) AS allow,
    SAFE.STRING(iframeAttr.sandbox) AS sandbox
  FROM
    root_pages
  LEFT JOIN
    UNNEST(iframeAttrs) AS iframeAttr
)

SELECT
  client,
  date,
  total_iframes,
  # Per-iframe usage: the denominator is the overall iframe total, not the number of rows in this group.
  COUNTIF(allow IS NOT NULL) AS freq_allow,
  SAFE_DIVIDE(COUNTIF(allow IS NOT NULL), total_iframes) AS pct_allow_frames,
  COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
  SAFE_DIVIDE(COUNTIF(sandbox IS NOT NULL), total_iframes) AS pct_sandbox_frames,
  COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) AS freq_both_frames,
  SAFE_DIVIDE(COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL), total_iframes) AS pct_both_frames,
  # Per-page usage: a page counts once if at least one of its iframes carries the attribute.
  COUNT(DISTINCT url) AS total_urls,
  COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)) AS allow_freq_urls,
  SAFE_DIVIDE(COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)), COUNT(DISTINCT url)) AS allow_pct_urls,
  COUNT(DISTINCT IF(sandbox IS NOT NULL, url, NULL)) AS sandbox_freq_urls,
  SAFE_DIVIDE(COUNT(DISTINCT IF(sandbox IS NOT NULL, url, NULL)), COUNT(DISTINCT url)) AS sandbox_pct_urls
FROM
  iframe_attributes
INNER JOIN
  total_iframe_count
USING
  (client, date)
GROUP BY
  total_iframes,
  client,
  date
ORDER BY
  date,
  client
21 changes: 19 additions & 2 deletions sql/2024/security/meta_csp_disallowed_directives.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,24 @@
# Section: Security misconfigurations - CSP directives that are ignored in <meta>
# Question: How many pages use invalid CSP directives in <meta>?
# Note: uses the old payload._almanac metric location instead of custom_metrics.almanac (also the meta-nodes metric is in the generic almanac.js custom metric)
# Total number of root pages per client in the 2024-06-01 crawl.
# Counted from the pages table (one row per crawled page). The requests table holds
# one row per request, so counting its rows would report requests rather than pages.
WITH totals AS (
  SELECT
    client,
    COUNT(0) AS total_pages
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01' AND
    is_root_page
  GROUP BY
    client
)


SELECT
client,
COUNT(DISTINCT page) AS total_pages,
total_pages,
COUNT(DISTINCT page) AS total_pages_with_csp_meta,
COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)frame-ancestors') THEN page END) AS count_frame_ancestors,
COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)frame-ancestors') THEN page END) / COUNT(DISTINCT page) AS pct_frame_ancestors,
COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)sandbox( allow-[a-z]+)*;') THEN page END) AS count_sandbox,
Expand All @@ -22,7 +37,9 @@ FROM (
),
UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node,
UNNEST(['Content-Security-Policy']) AS policy
JOIN totals USING (client)
WHERE
LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = 'content-security-policy' OR LOWER(JSON_VALUE(meta_node, '$.name')) = 'content-security-policy'
GROUP BY
client
client,
total_pages
20 changes: 0 additions & 20 deletions sql/2024/webassembly/README.md

This file was deleted.

Loading

0 comments on commit 58026bf

Please sign in to comment.