diff --git a/bootcamp/materials/4-applying-analytical-patterns/README.md b/bootcamp/materials/4-applying-analytical-patterns/README.md new file mode 100644 index 00000000..e052ce31 --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/README.md @@ -0,0 +1,7 @@ +# 4 Applying Analytics Patterns and Advanced SQL + +We are going to be talking about a few different analytical patterns + +- State Change Tracking +- Smoothing of trend lines +- J curve analysis diff --git a/bootcamp/materials/4-applying-analytical-patterns/homework/homework.md b/bootcamp/materials/4-applying-analytical-patterns/homework/homework.md new file mode 100644 index 00000000..30c2765a --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/homework/homework.md @@ -0,0 +1,25 @@ +# Week 4 Applying Analytical Patterns +The homework this week will be using the `players`, `players_scd`, and `player_seasons` tables from week 1 + +- A query that does state change tracking for `players` + - A player entering the league should be `New` + - A player leaving the league should be `Retired` + - A player staying in the league should be `Continued Playing` + - A player that comes out of retirement should be `Returned from Retirement` + - A player that stays out of the league should be `Stayed Retired` + +- A query that uses `GROUPING SETS` to do efficient aggregations of `game_details` data + - Aggregate this dataset along the following dimensions + - player and team + - Answer questions like who scored the most points playing for one team? + - player and season + - Answer questions like who scored the most points in one season? + - team + - Answer questions like which team has won the most games? + +- A query that uses window functions on `game_details` to find out the following things: + - What is the most games a team has won in a 90 game stretch? + - How many games in a row did LeBron James score over 10 points a game? + + +Please add these queries into a folder `homework/` \ No newline at end of file diff --git a/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/funnel_analysis.sql b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/funnel_analysis.sql new file mode 100644 index 00000000..d3445b85 --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/funnel_analysis.sql @@ -0,0 +1,30 @@ +WITH deduped_events AS ( + SELECT + url, host, user_id,event_time + FROM events + GROUP BY 1,2,3,4 +), + clean_events AS ( + SELECT *, DATE(event_time) as event_date FROM deduped_events +WHERE user_id IS NOT NULL +ORDER BY user_id, event_time + ), + converted AS ( + SELECT ce1.user_id, + ce1.event_time, + ce1.url, + COUNT(DISTINCT CASE WHEN ce2.url = '/api/v1/user' THEN ce2.url END) as converted + FROM clean_events ce1 + JOIN clean_events ce2 + ON ce2.user_id = ce1.user_id + AND ce2.event_date = ce1.event_date + AND ce2.event_time > ce1.event_time + + GROUP BY 1, 2,3 + ) + +SELECT url, COUNT(1), CAST(SUM(converted) AS REAL)/COUNT(1) +FROM converted +GROUP BY 1 +HAVING CAST(SUM(converted) AS REAL)/COUNT(1) > 0 +AND COUNT(1) > 100 diff --git a/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/grouping_sets.sql b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/grouping_sets.sql new file mode 100644 index 00000000..b33bfefc --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/grouping_sets.sql @@ -0,0 +1,34 @@ +CREATE TABLE device_hits_dashboard AS + +WITH events_augmented AS ( + SELECT COALESCE(d.os_type, 'unknown') AS os_type, + COALESCE(d.device_type, 'unknown') AS device_type, + COALESCE(d.browser_type, 'unknown') AS browser_type, + url, + user_id + FROM events e + JOIN devices d on e.device_id = d.device_id +) + +SELECT + CASE + WHEN GROUPING(os_type) = 0 + AND GROUPING(device_type) = 0 + AND GROUPING(browser_type) = 0 + THEN 'os_type__device_type__browser' + WHEN GROUPING(browser_type) = 0 THEN 'browser_type' + WHEN GROUPING(device_type) = 0 THEN 'device_type' + WHEN GROUPING(os_type) = 0 THEN 'os_type' + END as aggregation_level, + COALESCE(os_type, '(overall)') as os_type, + COALESCE(device_type, '(overall)') as device_type, + COALESCE(browser_type, '(overall)') as browser_type, + COUNT(1) as number_of_hits +FROM events_augmented +GROUP BY GROUPING SETS ( + (browser_type, device_type, os_type), + (browser_type), + (os_type), + (device_type) + ) +ORDER BY COUNT(1) DESC \ No newline at end of file diff --git a/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/growth_accounting.sql b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/growth_accounting.sql new file mode 100644 index 00000000..5176ecba --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/growth_accounting.sql @@ -0,0 +1,46 @@ +WITH yesterday AS ( + SELECT * FROM users_growth_accounting + WHERE date = DATE('2023-03-09') +), + today AS ( + SELECT + CAST(user_id AS TEXT) as user_id, + DATE_TRUNC('day', event_time::timestamp) as today_date, + COUNT(1) + FROM events + WHERE DATE_TRUNC('day', event_time::timestamp) = DATE('2023-03-10') + AND user_id IS NOT NULL + GROUP BY user_id, DATE_TRUNC('day', event_time::timestamp) + ) + + SELECT COALESCE(t.user_id, y.user_id) as user_id, + COALESCE(y.first_active_date, t.today_date) AS first_active_date, + COALESCE(t.today_date, y.last_active_date) AS last_active_date, + CASE + WHEN y.user_id IS NULL THEN 'New' + WHEN y.last_active_date = t.today_date - Interval '1 day' THEN 'Retained' + WHEN y.last_active_date < t.today_date - Interval '1 day' THEN 'Resurrected' + WHEN t.today_date IS NULL AND y.last_active_date = y.date THEN 'Churned' + ELSE 'Stale' + END as daily_active_state, + CASE + WHEN y.user_id IS NULL THEN 'New' + WHEN y.last_active_date < t.today_date - Interval '7 day' THEN 'Resurrected' + WHEN + t.today_date IS NULL + AND y.last_active_date = y.date - interval '7 day' THEN 'Churned' + WHEN COALESCE(t.today_date, y.last_active_date) + INTERVAL '7 day' >= y.date THEN 'Retained' + ELSE 'Stale' + END as weekly_active_state, + COALESCE(y.dates_active, + ARRAY []::DATE[]) + || CASE + WHEN + t.user_id IS NOT NULL + THEN ARRAY [t.today_date] + ELSE ARRAY []::DATE[] + END AS date_list, + COALESCE(t.today_date, y.date + Interval '1 day') as date + FROM today t + FULL OUTER JOIN yesterday y + ON t.user_id = y.user_id diff --git a/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/retention_analysis.sql b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/retention_analysis.sql new file mode 100644 index 00000000..280cd1f9 --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/retention_analysis.sql @@ -0,0 +1,7 @@ +SELECT + date - first_active_date AS days_since_first_active, + CAST(COUNT(CASE + WHEN daily_active_state + IN ('Retained', 'Resurrected', 'New') THEN 1 END) AS REAL)/COUNT(1) as pct_active, + COUNT(1) FROM users_growth_accounting +GROUP BY date - first_active_date; diff --git a/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/window_based_analysis.sql b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/window_based_analysis.sql new file mode 100644 index 00000000..1a93447c --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/lecture-lab/window_based_analysis.sql @@ -0,0 +1,68 @@ +WITH events_augmented AS ( + SELECT COALESCE(d.os_type, 'unknown') AS os_type, + COALESCE(d.device_type, 'unknown') AS device_type, + COALESCE(d.browser_type, 'unknown') AS browser_type, + url, + user_id, + CASE + WHEN referrer like '%linkedin%' THEN 'Linkedin' + WHEN referrer like '%t.co%' THEN 'Twitter' + WHEN referrer like '%google%' THEN 'Google' + WHEN referrer like '%lnkd%' THEN 'Linkedin' + WHEN referrer like '%eczachly%' THEN 'On Site' + WHEN referrer LIKE '%zachwilson%' THEN 'On Site' + ELSE referrer + END as referrer, + DATE(event_time) AS event_date + FROM events e + JOIN devices d on e.device_id = d.device_id +), + aggregated AS ( + SELECT url, referrer, event_date, COUNT(1) as count + FROM events_augmented + GROUP BY url, referrer, event_date + ), + windowed AS ( + SELECT referrer, + url, + event_date, + count, + SUM(count) OVER ( + PARTITION BY referrer, url, DATE_TRUNC('month', event_date) + ORDER BY event_date + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS monthly_cumulative_sum, + SUM(count) OVER ( + PARTITION BY referrer, url + ORDER BY event_date + ) AS rolling_cumulative_sum, + SUM(count) OVER ( + PARTITION BY referrer, url + ORDER BY event_date + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS total_cumulative_sum, + SUM(count) OVER ( + PARTITION BY referrer, url + ORDER BY event_date + ROWS BETWEEN 6 preceding AND CURRENT ROW + ) AS weekly_rolling_count, + SUM(count) OVER ( + PARTITION BY referrer, url + ORDER BY event_date + ROWS BETWEEN 13 preceding AND 6 preceding + ) AS previous_weekly_rolling_count + FROM aggregated + ORDER BY referrer, url, event_date + ) + +SELECT referrer, + url, + event_date, + count, + weekly_rolling_count, + previous_weekly_rolling_count, + CAST(count AS REAL) / monthly_cumulative_sum as pct_of_month, + CAST(count AS REAL) / total_cumulative_sum as pct_of_total +FROM windowed +WHERE total_cumulative_sum > 500 + AND referrer IS NOT NULL \ No newline at end of file diff --git a/bootcamp/materials/4-applying-analytical-patterns/tables/user_growth_accounting.sql b/bootcamp/materials/4-applying-analytical-patterns/tables/user_growth_accounting.sql new file mode 100644 index 00000000..c9ca9507 --- /dev/null +++ b/bootcamp/materials/4-applying-analytical-patterns/tables/user_growth_accounting.sql @@ -0,0 +1,10 @@ + CREATE TABLE users_growth_accounting ( + user_id TEXT, + first_active_date DATE, + last_active_date DATE, + daily_active_state TEXT, + weekly_active_state TEXT, + dates_active DATE[], + date DATE, + PRIMARY KEY (user_id, date) + ); \ No newline at end of file