Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…hive.org into production
  • Loading branch information
max-ostapenko committed Dec 3, 2024
2 parents a58c704 + 3f52951 commit 89c51be
Show file tree
Hide file tree
Showing 46 changed files with 854 additions and 427 deletions.
64 changes: 0 additions & 64 deletions sql/2024/privacy/ads_accounts_distribution.sql

This file was deleted.

114 changes: 0 additions & 114 deletions sql/2024/privacy/ads_and_sellers_graph.sql

This file was deleted.

45 changes: 0 additions & 45 deletions sql/2024/privacy/ads_lines_distribution.sql

This file was deleted.

29 changes: 0 additions & 29 deletions sql/2024/privacy/common_ads_variables.sql

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,83 +8,74 @@ WITH redirect_requests AS (
index,
response_headers,
page
FROM `httparchive.all.requests`
FROM `httparchive.crawl.requests`
WHERE
date = '2024-06-01' AND
is_root_page = TRUE AND
type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
LEFT(JSON_VALUE(summary, '$.status'), 1) = '3' AND
ROUND(INT64(summary.status) / 100) = 3 AND
index <= 2
), navigation_redirect AS (
-- Find the first navigation redirect
SELECT
client,
url,
page,
headers.value AS navigation_redirect_location
response_header.value AS navigation_redirect_location
FROM redirect_requests,
UNNEST(response_headers) AS headers
UNNEST(response_headers) AS response_header
WHERE
index = 1 AND
LOWER(headers.name) = 'location' AND
NET.REG_DOMAIN(page) != NET.REG_DOMAIN(headers.value)
LOWER(response_header.name) = 'location' AND
NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
), bounce_redirect AS (
-- Find the second navigation redirect
SELECT
client,
url,
page,
headers.value AS bounce_redirect_location,
response_header.value AS bounce_redirect_location,
response_headers
FROM redirect_requests,
UNNEST(response_headers) AS headers
UNNEST(response_headers) AS response_header
WHERE
index = 2 AND
LOWER(headers.name) = 'location' AND
NET.REG_DOMAIN(headers.value) = NET.REG_DOMAIN(page)
), bounce_redirect_with_cookies AS (
-- Find the cookies set during the second navigation redirect
SELECT
client,
url,
page,
bounce_redirect_location
--response_headers.value AS bounce_tracking_cookies
FROM bounce_redirect,
UNNEST(response_headers) AS response_headers
WHERE
LOWER(response_headers.name) = 'set-cookie'
LOWER(response_header.name) = 'location'
), bounce_sequences AS (
-- Combine the first and second navigation redirects
SELECT
nav.client,
nav.page,
nav.url AS navigation_url,
nav.navigation_redirect_location,
bounce.bounce_redirect_location
NET.REG_DOMAIN(navigation_redirect_location) AS bounce_hostname,
COUNT(DISTINCT nav.page) AS number_of_pages
--ARRAY_AGG(bounce.bounce_tracking_cookies) AS bounce_tracking_cookies
FROM navigation_redirect AS nav
LEFT JOIN bounce_redirect_with_cookies AS bounce
LEFT JOIN bounce_redirect AS bounce
ON
nav.client = bounce.client AND
nav.page = bounce.page AND
nav.navigation_redirect_location = bounce.url
WHERE bounce_redirect_location IS NOT NULL
GROUP BY
nav.client,
page,
navigation_url,
navigation_redirect_location,
bounce_redirect_location
bounce_hostname
), pages_total AS (
-- Denominator for pct_pages: number of distinct root pages per client
-- in the 2024-06-01 crawl.
SELECT
client,
COUNT(DISTINCT page) AS total_pages
FROM `httparchive.crawl.pages`
WHERE date = '2024-06-01' AND
is_root_page
GROUP BY client
)

-- Count the number of websites with bounce tracking per bounce hostname
SELECT
client,
NET.HOST(navigation_redirect_location) AS bounce_hostname,
COUNT(DISTINCT page) AS number_of_pages
--ARRAY_AGG(page LIMIT 2) AS page_examples
bounce_hostname,
number_of_pages,
number_of_pages / total_pages AS pct_pages
FROM bounce_sequences
GROUP BY client, bounce_hostname
JOIN pages_total
USING (client)
ORDER BY number_of_pages DESC
LIMIT 100
Loading

0 comments on commit 89c51be

Please sign in to comment.