From 91e466ccd7458dddd22a48e415a5e427a83b32b7 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 16 Apr 2024 11:42:53 +0200 Subject: [PATCH 01/18] initial commit --- infra_setup/init.sql | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/infra_setup/init.sql b/infra_setup/init.sql index b947479..c640d6d 100644 --- a/infra_setup/init.sql +++ b/infra_setup/init.sql @@ -19,7 +19,17 @@ FROM '/data/products.csv' DELIMITER ',' CSV HEADER; -- TODO: Provide the DDL statment to create this table ALT_SCHOOL.CUSTOMERS +create table if not exists ALT_SCHOOL.CUSTOMERS +( + customer_id uuid primary key, + device_id uuid NOT NULL, + "location" varchar(50) NOT NULL, + currency varchar(10) NULL +); + -- TODO: provide the command to copy the customers data in the /data folder into ALT_SCHOOL.CUSTOMERS +COPY ALT_SCHOOL.CUSTOMERS (customer_id, device_id, "location", currency) +FROM '/data/customers.csv' DELIMITER ',' CSV HEADER; @@ -27,31 +37,40 @@ FROM '/data/products.csv' DELIMITER ',' CSV HEADER; create table if not exists ALT_SCHOOL.ORDERS ( order_id uuid not null primary key, - -- provide the other fields + customer_id uuid not null, + "status" varchar(50) not null, + checked_out_at timestamp not null, ); - -- provide the command to copy orders data into POSTGRES - +COPY ALT_SCHOOL.ORDERS (order_id, customer_id, "status", checked_out_at) +FROM '/data/orders.csv' DELIMITER ',' CSV HEADER; create table if not exists ALT_SCHOOL.LINE_ITEMS ( line_item_id serial primary key, - -- provide the remaining fields + order_id uuid NOT NULL, + item_id int8 NOT NULL, + quantity int8 NOT NULL ); - -- provide the command to copy ALT_SCHOOL.LINE_ITEMS data into POSTGRES - +COPY ALT_SCHOOL.LINE_ITEMS (line_item_id,order_id, item_id, quantity) +FROM '/data/line_items.csv' DELIMITER ',' CSV HEADER; -- setup the events table following the examle provided create table if not exists ALT_SCHOOL.EVENTS ( - -- TODO: PROVIDE THE FIELDS + event_id serial primary key, + customer_id uuid NOT NULL, + event_data jsonb NOT NULL, + event_timestamp timestamp NOT NULL ); -- TODO: provide the command to copy ALT_SCHOOL.EVENTS data into POSTGRES +COPY ALT_SCHOOL.EVENTS (event_id,customer_id, event_data, event_timestamp) +FROM '/data/events.csv' DELIMITER ',' CSV HEADER; From f73c856c2f6ba9e0282ca3498d4925c9df9c8bbe Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Thu, 18 Apr 2024 20:46:48 +0200 Subject: [PATCH 02/18] initial commit --- infra_setup/init.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infra_setup/init.sql b/infra_setup/init.sql index c640d6d..feb369d 100644 --- a/infra_setup/init.sql +++ b/infra_setup/init.sql @@ -23,7 +23,7 @@ create table if not exists ALT_SCHOOL.CUSTOMERS ( customer_id uuid primary key, device_id uuid NOT NULL, - "location" varchar(50) NOT NULL, + "location" varchar(255) NOT NULL, currency varchar(10) NULL ); @@ -39,7 +39,7 @@ create table if not exists ALT_SCHOOL.ORDERS order_id uuid not null primary key, customer_id uuid not null, "status" varchar(50) not null, - checked_out_at timestamp not null, + checked_out_at timestamp not null ); -- provide the command to copy orders data into POSTGRES From 40f815c48fbe9827d2c50d1a56b4e37a860c0650 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Thu, 18 Apr 2024 20:48:30 +0200 Subject: [PATCH 03/18] add answer to question 2b.1 --- questions/answers.sql | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index e69de29..5ba546e 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -0,0 +1,31 @@ +-- Question 2a.1 + +with location_count as ( +select + location, + count(1) as checkout_count +from + alt_school.events e +join alt_school.customers + using(customer_id) +where + e.event_data ->> 'event_type' = 'checkout' + and e.event_data ->> 'status' = 'success' +group by + location), +location_count_rank as ( +select + location, + checkout_count, + rank() over( + order by checkout_count desc) row_rank +from + location_count) + +select + location, + checkout_count +from + location_count_rank +where + row_rank = 1 \ No newline at end of file From 3387ad742064aa293e567d2a14356b3b091b83dd Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 10:23:47 +0200 Subject: [PATCH 04/18] add answer to question 2b.2 --- questions/answers.sql | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 5ba546e..8457593 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -1,4 +1,4 @@ --- Question 2a.1 +-- Question 2b.1 with location_count as ( select @@ -28,4 +28,40 @@ select from location_count_rank where - row_rank = 1 \ No newline at end of file + row_rank = 1 + + + +-- Question 2b.2 +with event_group as ( +select + customer_id, + e.event_data ->> 'event_type' as event_type, + e.event_data ->> 'status' as status, + count(1)as event_count +from + alt_school.events e +group by + customer_id, + e.event_data ->> 'event_type', + e.event_data ->> 'status' +) +select + customer_id, + sum(event_count)as num_events +from + event_group +join alt_school.customers c + using(customer_id) +where + customer_id not in ( + select + distinct customer_id + from + event_group + where + event_type = 'checkout' + and status = 'success') + and event_type != 'visit' +group by + customer_id \ No newline at end of file From be4ab153cd02393b2c84a023b4bb01c4073eab8b Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 10:33:09 +0200 Subject: [PATCH 05/18] add answer to question 2b.3 --- questions/answers.sql | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/questions/answers.sql b/questions/answers.sql index 8457593..56aa92f 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -64,4 +64,32 @@ where and status = 'success') and event_type != 'visit' group by - customer_id \ No newline at end of file + customer_id + + + +-- question 2b.3 +with event_group as ( +select + customer_id, + count( distinct e.event_data ->> 'timestamp')as event_count +from + alt_school.events e +where + customer_id in ( + select + distinct customer_id + from + alt_school.events e + where + e.event_data ->> 'status' = 'success') + and e.event_data ->> 'event_type' = 'visit' +group by + customer_id +) + +select + avg(event_count):: numeric(5, + 2) as average_visits +from + event_group; \ No newline at end of file From 79ecb6610ace5479d75444f12fd9b9969106da56 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 10:47:36 +0200 Subject: [PATCH 06/18] add answer to question 2a.1 --- questions/answers.sql | 61 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index 56aa92f..4e51915 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -1,3 +1,64 @@ +-- Question 2a.1 + +with most_ordered_items as( +select + id as product_id, + name as product_name, + sum(quantity)as num_times_in_successful_orders +from + alt_school.orders o +join alt_school.line_items li + using(order_id) +join alt_school.products p +on + li.item_id = p.id +where + status = 'success' +group by + id, + name, + status), +most_ordered_items_rank as ( +select + *, + rank() over( +order by + num_times_in_successful_orders desc) row_rank +from + most_ordered_items) + +select + product_id, + product_name, + num_times_in_successful_orders +from + most_ordered_items_rank +where + row_rank = 1 + + + + + + + + + + + + + + + + + + + + + + + + -- Question 2b.1 with location_count as ( From f086c79cf1d61370da2a2f3f427478a035748685 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 11:15:00 +0200 Subject: [PATCH 07/18] add answer to question 2a.2 --- questions/answers.sql | 70 ++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 4e51915..9258fdb 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -37,25 +37,59 @@ where row_rank = 1 +-- Question 2a.2 - - - - - - - - - - - - - - - - - - +with order_quantity as ( +select + customer_id, + e.event_data ->> 'event_type' as event_type, + e.event_data ->> 'item_id' as item_id, + e.event_data ->> 'quantity' as quantity +from + alt_school.events e +where + customer_id in ( + select + distinct customer_id + from + alt_school.events e + where + e.event_data ->> 'status' = 'success') + and e.event_data ->> 'event_type' not in ('checkout', 'visit') +), +spender as ( +select + customer_id, + sum(quantity::int * price) as total_spend +from + order_quantity o +join alt_school.products p on + o.item_id :: int = p.id +where + quantity is not null +group by + customer_id ), +spender_rank as ( +select + customer_id, + location, + total_spend, + rank() over( +order by + total_spend desc) ROW_RANK +from + spender +join alt_school.customers + using (customer_id) +) +select + customer_id, + location, + total_spend +from + spender_rank +where + row_rank <= 5 From 3f269646b1eb7f9c33ebcc480f5acba7d94e931a Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 13:51:20 +0200 Subject: [PATCH 08/18] docs(infra_setup): remove todos --- infra_setup/init.sql | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/infra_setup/init.sql b/infra_setup/init.sql index feb369d..cdc8c3b 100644 --- a/infra_setup/init.sql +++ b/infra_setup/init.sql @@ -16,9 +16,6 @@ COPY ALT_SCHOOL.PRODUCTS (id, name, price) FROM '/data/products.csv' DELIMITER ',' CSV HEADER; -- setup customers table following the example above - --- TODO: Provide the DDL statment to create this table ALT_SCHOOL.CUSTOMERS - create table if not exists ALT_SCHOOL.CUSTOMERS ( customer_id uuid primary key, @@ -27,13 +24,10 @@ create table if not exists ALT_SCHOOL.CUSTOMERS currency varchar(10) NULL ); --- TODO: provide the command to copy the customers data in the /data folder into ALT_SCHOOL.CUSTOMERS +-- copy the customers data in the /data folder into ALT_SCHOOL.CUSTOMERS COPY ALT_SCHOOL.CUSTOMERS (customer_id, device_id, "location", currency) FROM '/data/customers.csv' DELIMITER ',' CSV HEADER; - - --- TODO: complete the table DDL statement create table if not exists ALT_SCHOOL.ORDERS ( order_id uuid not null primary key, @@ -42,7 +36,7 @@ create table if not exists ALT_SCHOOL.ORDERS checked_out_at timestamp not null ); --- provide the command to copy orders data into POSTGRES +-- copy orders data into POSTGRES COPY ALT_SCHOOL.ORDERS (order_id, customer_id, "status", checked_out_at) FROM '/data/orders.csv' DELIMITER ',' CSV HEADER; @@ -54,11 +48,11 @@ create table if not exists ALT_SCHOOL.LINE_ITEMS quantity int8 NOT NULL ); --- provide the command to copy ALT_SCHOOL.LINE_ITEMS data into POSTGRES +-- copy ALT_SCHOOL.LINE_ITEMS data into POSTGRES COPY ALT_SCHOOL.LINE_ITEMS (line_item_id,order_id, item_id, quantity) FROM '/data/line_items.csv' DELIMITER ',' CSV HEADER; --- setup the events table following the examle provided +-- setup the events table create table if not exists ALT_SCHOOL.EVENTS ( event_id serial primary key, @@ -66,8 +60,7 @@ create table if not exists ALT_SCHOOL.EVENTS event_data jsonb NOT NULL, event_timestamp timestamp NOT NULL ); - --- TODO: provide the command to copy ALT_SCHOOL.EVENTS data into POSTGRES +-- copy ALT_SCHOOL.EVENTS data into POSTGRES COPY ALT_SCHOOL.EVENTS (event_id,customer_id, event_data, event_timestamp) FROM '/data/events.csv' DELIMITER ',' CSV HEADER; From 1e670ec40dc2f8c3b48115f2a26200959c1265ad Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 14:28:58 +0200 Subject: [PATCH 09/18] docs(question 2a.1): add thought process and explanation --- questions/answers.sql | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index 9258fdb..1b01792 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -1,4 +1,16 @@ -- Question 2a.1 +/* +what is the most ordered item based on the number of times it appears in an order cart that checked out successfully? + +To get the most ordered item, created 2 CTEs, most_ordered_items and most_ordered_items_rank. +1. most_ordered_items: in this CTE, only product that are successfully checked out are selected. It is also worth nothing that the sum of 'quantity' of each product is used to determine + the number of times it appears in an order cart that checked out successfully. +2. most_ordered_items_rank: in this CTE, the rank() function is used to rank the products based on the number of times they appear in an order cart that checked out successfully. + The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. + +Note: Do not use order by and limit in the final query, as it will not work in the case of ties. + + */ with most_ordered_items as( select From 73e48680a5cf2b6b517b82e4591ae0b79c4acb73 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 14:36:43 +0200 Subject: [PATCH 10/18] style: format sql query --- questions/answers.sql | 263 +++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 159 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 1b01792..b856e88 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -13,190 +13,135 @@ Note: Do not use order by and limit in the final query, as it will not work in t */ with most_ordered_items as( -select - id as product_id, - name as product_name, - sum(quantity)as num_times_in_successful_orders -from - alt_school.orders o -join alt_school.line_items li - using(order_id) -join alt_school.products p -on - li.item_id = p.id -where - status = 'success' -group by - id, - name, - status), + select id as product_id, + name as product_name, + sum(quantity) as num_times_in_successful_orders + from alt_school.orders o + join alt_school.line_items li using(order_id) + join alt_school.products p on li.item_id = p.id + where status = 'success' + group by id, + name, + status +), most_ordered_items_rank as ( -select - *, - rank() over( -order by - num_times_in_successful_orders desc) row_rank -from - most_ordered_items) - -select - product_id, - product_name, - num_times_in_successful_orders -from - most_ordered_items_rank -where - row_rank = 1 + select *, + rank() over( + order by num_times_in_successful_orders desc + ) row_rank + from most_ordered_items +) +select product_id, + product_name, + num_times_in_successful_orders +from most_ordered_items_rank +where row_rank = 1 -- Question 2a.2 with order_quantity as ( -select - customer_id, - e.event_data ->> 'event_type' as event_type, - e.event_data ->> 'item_id' as item_id, - e.event_data ->> 'quantity' as quantity -from - alt_school.events e -where - customer_id in ( - select - distinct customer_id - from - alt_school.events e - where - e.event_data ->> 'status' = 'success') - and e.event_data ->> 'event_type' not in ('checkout', 'visit') + select customer_id, + e.event_data->>'event_type' as event_type, + e.event_data->>'item_id' as item_id, + e.event_data->>'quantity' as quantity + from alt_school.events e + where customer_id in ( + select distinct customer_id + from alt_school.events e + where e.event_data->>'status' = 'success' + ) + and e.event_data->>'event_type' not in ('checkout', 'visit') ), spender as ( -select - customer_id, - sum(quantity::int * price) as total_spend -from - order_quantity o -join alt_school.products p on - o.item_id :: int = p.id -where - quantity is not null -group by - customer_id ), + select customer_id, + sum(quantity::int * price) as total_spend + from order_quantity o + join alt_school.products p on o.item_id::int = p.id + where quantity is not null + group by customer_id +), spender_rank as ( -select - customer_id, - location, - total_spend, - rank() over( -order by - total_spend desc) ROW_RANK -from - spender -join alt_school.customers - using (customer_id) + select customer_id, + location, + total_spend, + rank() over( + order by total_spend desc + ) ROW_RANK + from spender + join alt_school.customers using (customer_id) ) -select - customer_id, - location, - total_spend -from - spender_rank -where - row_rank <= 5 - +select customer_id, + location, + total_spend +from spender_rank +where row_rank <= 5 -- Question 2b.1 with location_count as ( -select - location, - count(1) as checkout_count -from - alt_school.events e -join alt_school.customers - using(customer_id) -where - e.event_data ->> 'event_type' = 'checkout' - and e.event_data ->> 'status' = 'success' -group by - location), + select location, + count(1) as checkout_count + from alt_school.events e + join alt_school.customers using(customer_id) + where e.event_data->>'event_type' = 'checkout' + and e.event_data->>'status' = 'success' + group by location +), location_count_rank as ( -select - location, - checkout_count, - rank() over( - order by checkout_count desc) row_rank -from - location_count) - -select - location, - checkout_count -from - location_count_rank -where - row_rank = 1 + select location, + checkout_count, + rank() over( + order by checkout_count desc + ) row_rank + from location_count +) +select location, + checkout_count +from location_count_rank +where row_rank = 1 -- Question 2b.2 with event_group as ( -select - customer_id, - e.event_data ->> 'event_type' as event_type, - e.event_data ->> 'status' as status, - count(1)as event_count -from - alt_school.events e -group by - customer_id, - e.event_data ->> 'event_type', - e.event_data ->> 'status' + select customer_id, + e.event_data->>'event_type' as event_type, + e.event_data->>'status' as status, + count(1) as event_count + from alt_school.events e + group by customer_id, + e.event_data->>'event_type', + e.event_data->>'status' ) -select - customer_id, - sum(event_count)as num_events -from - event_group -join alt_school.customers c - using(customer_id) -where - customer_id not in ( - select - distinct customer_id - from - event_group - where - event_type = 'checkout' - and status = 'success') - and event_type != 'visit' -group by - customer_id +select customer_id, + sum(event_count) as num_events +from event_group + join alt_school.customers c using(customer_id) +where customer_id not in ( + select distinct customer_id + from event_group + where event_type = 'checkout' + and status = 'success' + ) + and event_type != 'visit' +group by customer_id -- question 2b.3 with event_group as ( -select - customer_id, - count( distinct e.event_data ->> 'timestamp')as event_count -from - alt_school.events e -where - customer_id in ( - select - distinct customer_id - from - alt_school.events e - where - e.event_data ->> 'status' = 'success') - and e.event_data ->> 'event_type' = 'visit' -group by - customer_id + select customer_id, + count(distinct e.event_data->>'timestamp') as event_count + from alt_school.events e + where customer_id in ( + select distinct customer_id + from alt_school.events e + where e.event_data->>'status' = 'success' + ) + and e.event_data->>'event_type' = 'visit' + group by customer_id ) - -select - avg(event_count):: numeric(5, - 2) as average_visits -from - event_group; \ No newline at end of file +select avg(event_count)::numeric(5, 2) as average_visits +from event_group; \ No newline at end of file From 091a1eeec1109ac77950b1ddb78b35d331b87896 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 14:41:26 +0200 Subject: [PATCH 11/18] doc(question 2b.1): add thought process and explanation --- questions/answers.sql | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index b856e88..4cf3cc6 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -79,6 +79,13 @@ where row_rank <= 5 -- Question 2b.1 +/* +Determine the most common location (country) where successful checkouts occurred +To get the most common location where successful checkouts occurred, created 2 CTEs, location_count and location_count_rank. +1. location_count: in this CTE, only successful checkouts are selected. The count of successful checkouts is grouped by location. +2. location_count_rank: in this CTE, the rank() function is used to rank the locations based on the number of successful checkouts. + The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. +*/ with location_count as ( select location, From dc6619f402c2f3a2911370413c45bd9c68125595 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 14:59:02 +0200 Subject: [PATCH 12/18] docs(question 2b.2): add thought process and explanation --- questions/answers.sql | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index 4cf3cc6..20f0f82 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -112,6 +112,16 @@ where row_rank = 1 -- Question 2b.2 +/* +Identify the customers who abandoned their carts and count the number of events (excluding visits) that occurred before the abandonment + +To identify the customers who abandoned their carts, created a CTE. +1. event_group: in this CTE, the count of all events, grouped by customer_id, event_type, and status. +2. In the final query, get the unique list of customers (id) who successfully checked out (a subquery is used to get this list). + filter by the result of the subquery, by excluding customers who successfully checked out, the result will be customers who abandoned their carts. + In addition, exclude events of type 'visit' from customers who abandoned their carts. Sum the count of remaining events that occurred grouped by customer_id, this result to the num_of_events before the abandonment. + +*/ with event_group as ( select customer_id, e.event_data->>'event_type' as event_type, From fdae7e39c9a912c6905edeafb7b2ee2eea3fc317 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 15:11:32 +0200 Subject: [PATCH 13/18] docs(question 2b.3): add thought process and explanation --- questions/answers.sql | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index 20f0f82..22b2ad8 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -148,6 +148,14 @@ group by customer_id -- question 2b.3 +/* +Find the average number of visits per customer, considering only customers who completed + +To find the average number of visits per customer, created a CTE. +1. event_group: in this CTE, the count distinct event of all event_data[timestamp] events, grouped by customer_id. +filter for customers who successfully checked out by using a subquery returns a list, and considering only event_type is 'visit'. +In the final query, get the average of the event_count. +*/ with event_group as ( select customer_id, count(distinct e.event_data->>'timestamp') as event_count From ed0354bd95db6d079ab00a2f99ce010f9bfd9c13 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 15:25:58 +0200 Subject: [PATCH 14/18] style: format query and comments --- questions/answers.sql | 207 +++++++++++++++++++----------------------- 1 file changed, 92 insertions(+), 115 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 22b2ad8..971d28b 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -1,79 +1,64 @@ -- Question 2a.1 -/* +/* what is the most ordered item based on the number of times it appears in an order cart that checked out successfully? To get the most ordered item, created 2 CTEs, most_ordered_items and most_ordered_items_rank. -1. most_ordered_items: in this CTE, only product that are successfully checked out are selected. It is also worth nothing that the sum of 'quantity' of each product is used to determine - the number of times it appears in an order cart that checked out successfully. +1. most_ordered_items: in this CTE, only product that are successfully checked out are selected. It is also worth nothing that the sum of 'quantity' of each product is used to determine + the number of times it appears in an order cart that checked out successfully. 2. most_ordered_items_rank: in this CTE, the rank() function is used to rank the products based on the number of times they appear in an order cart that checked out successfully. - The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. + The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. Note: Do not use order by and limit in the final query, as it will not work in the case of ties. - + */ -with most_ordered_items as( - select id as product_id, - name as product_name, - sum(quantity) as num_times_in_successful_orders - from alt_school.orders o - join alt_school.line_items li using(order_id) - join alt_school.products p on li.item_id = p.id - where status = 'success' - group by id, - name, - status -), -most_ordered_items_rank as ( - select *, - rank() over( - order by num_times_in_successful_orders desc - ) row_rank - from most_ordered_items -) +with most_ordered_items as + ( select id as product_id, name as product_name, sum(quantity) as num_times_in_successful_orders + from alt_school.orders o + join alt_school.line_items li using(order_id) + join alt_school.products p on li.item_id = p.id + where status = 'success' + group by id, name, status), + most_ordered_items_rank as + ( select *, + rank() over(order by num_times_in_successful_orders desc ) row_rank + from most_ordered_items) select product_id, - product_name, - num_times_in_successful_orders + product_name, + num_times_in_successful_orders from most_ordered_items_rank where row_rank = 1 - -- Question 2a.2 -with order_quantity as ( - select customer_id, - e.event_data->>'event_type' as event_type, - e.event_data->>'item_id' as item_id, - e.event_data->>'quantity' as quantity - from alt_school.events e - where customer_id in ( - select distinct customer_id - from alt_school.events e - where e.event_data->>'status' = 'success' - ) - and e.event_data->>'event_type' not in ('checkout', 'visit') -), -spender as ( - select customer_id, - sum(quantity::int * price) as total_spend - from order_quantity o - join alt_school.products p on o.item_id::int = p.id - where quantity is not null - group by customer_id -), -spender_rank as ( - select customer_id, - location, - total_spend, - rank() over( - order by total_spend desc - ) ROW_RANK - from spender - join alt_school.customers using (customer_id) -) +with order_quantity as + ( select customer_id, + e.event_data->>'event_type' as event_type, + e.event_data->>'item_id' as item_id, + e.event_data->>'quantity' as quantity + from alt_school.events e + where customer_id in + ( select distinct customer_id + from alt_school.events e + where e.event_data->>'status' = 'success' ) + and e.event_data->>'event_type' not in ('checkout','visit') ), + spender as + ( select customer_id, + sum(quantity::int * price) as total_spend + from order_quantity o + join alt_school.products p on o.item_id::int = p.id + where quantity is not null + group by customer_id), + spender_rank as + ( select customer_id, + location, + total_spend, + rank() over(order by total_spend desc ) ROW_RANK + from spender + join alt_school.customers using (customer_id)) select customer_id, - location, - total_spend + location, + total_spend from spender_rank where row_rank <= 5 @@ -84,28 +69,24 @@ Determine the most common location (country) where successful checkouts occurred To get the most common location where successful checkouts occurred, created 2 CTEs, location_count and location_count_rank. 1. location_count: in this CTE, only successful checkouts are selected. The count of successful checkouts is grouped by location. 2. location_count_rank: in this CTE, the rank() function is used to rank the locations based on the number of successful checkouts. - The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. + The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. */ -with location_count as ( - select location, - count(1) as checkout_count - from alt_school.events e - join alt_school.customers using(customer_id) - where e.event_data->>'event_type' = 'checkout' - and e.event_data->>'status' = 'success' - group by location -), -location_count_rank as ( - select location, - checkout_count, - rank() over( - order by checkout_count desc - ) row_rank - from location_count -) +with location_count as + ( select location, + count(1) as checkout_count + from alt_school.events e + join alt_school.customers using(customer_id) + where e.event_data->>'event_type' = 'checkout' + and e.event_data->>'status' = 'success' + group by location), + location_count_rank as + ( select location, + checkout_count, + rank() over(order by checkout_count desc ) row_rank + from location_count) select location, - checkout_count + checkout_count from location_count_rank where row_rank = 1 @@ -118,55 +99,51 @@ Identify the customers who abandoned their carts and count the number of events To identify the customers who abandoned their carts, created a CTE. 1. event_group: in this CTE, the count of all events, grouped by customer_id, event_type, and status. 2. In the final query, get the unique list of customers (id) who successfully checked out (a subquery is used to get this list). - filter by the result of the subquery, by excluding customers who successfully checked out, the result will be customers who abandoned their carts. - In addition, exclude events of type 'visit' from customers who abandoned their carts. Sum the count of remaining events that occurred grouped by customer_id, this result to the num_of_events before the abandonment. + filter by the result of the subquery, by excluding customers who successfully checked out, the result will be customers who abandoned their carts. + In addition, exclude events of type 'visit' from customers who abandoned their carts. + Sum the count of remaining events that occurred grouped by customer_id, this result to the num_of_events before the abandonment. */ -with event_group as ( - select customer_id, - e.event_data->>'event_type' as event_type, - e.event_data->>'status' as status, - count(1) as event_count - from alt_school.events e - group by customer_id, - e.event_data->>'event_type', - e.event_data->>'status' -) +with event_group as + ( select customer_id, + e.event_data->>'event_type' as event_type, + e.event_data->>'status' as status, + count(1) as event_count + from alt_school.events e + group by customer_id, + e.event_data->>'event_type', + e.event_data->>'status') select customer_id, - sum(event_count) as num_events + sum(event_count) as num_events from event_group - join alt_school.customers c using(customer_id) -where customer_id not in ( - select distinct customer_id - from event_group - where event_type = 'checkout' - and status = 'success' - ) - and event_type != 'visit' +join alt_school.customers c using(customer_id) +where customer_id not in + ( select distinct customer_id + from event_group + where event_type = 'checkout' + and status = 'success' ) + and event_type != 'visit' group by customer_id - -- question 2b.3 /* Find the average number of visits per customer, considering only customers who completed To find the average number of visits per customer, created a CTE. 1. event_group: in this CTE, the count distinct event of all event_data[timestamp] events, grouped by customer_id. -filter for customers who successfully checked out by using a subquery returns a list, and considering only event_type is 'visit'. -In the final query, get the average of the event_count. + filter for customers who successfully checked out by using a subquery returns a list, and considering only event_type is 'visit'. + In the final query, get the average of the event_count. */ -with event_group as ( - select customer_id, - count(distinct e.event_data->>'timestamp') as event_count - from alt_school.events e - where customer_id in ( - select distinct customer_id - from alt_school.events e - where e.event_data->>'status' = 'success' - ) - and e.event_data->>'event_type' = 'visit' - group by customer_id -) +with event_group as + ( select customer_id, + count(distinct e.event_data->>'timestamp') as event_count + from alt_school.events e + where customer_id in + ( select distinct customer_id + from alt_school.events e + where e.event_data->>'status' = 'success' ) + and e.event_data->>'event_type' = 'visit' + group by customer_id) select avg(event_count)::numeric(5, 2) as average_visits from event_group; \ No newline at end of file From 3e5d4010b8a6bb582d1e3c91ee4cdabc33168d0a Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 17:47:09 +0200 Subject: [PATCH 15/18] style: format query for improved readability --- questions/answers.sql | 263 +++++++++++++++++++++++++++--------------- 1 file changed, 171 insertions(+), 92 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 971d28b..a0b3da5 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -12,55 +12,96 @@ Note: Do not use order by and limit in the final query, as it will not work in t */ -with most_ordered_items as - ( select id as product_id, name as product_name, sum(quantity) as num_times_in_successful_orders - from alt_school.orders o - join alt_school.line_items li using(order_id) - join alt_school.products p on li.item_id = p.id - where status = 'success' - group by id, name, status), - most_ordered_items_rank as - ( select *, - rank() over(order by num_times_in_successful_orders desc ) row_rank - from most_ordered_items) -select product_id, +with + most_ordered_items as ( + select + id as product_id, + name as product_name, + sum(quantity) as num_times_in_successful_orders + from + alt_school.orders o + join alt_school.line_items li using (order_id) + join alt_school.products p on li.item_id = p.id + where + status = 'success' + group by + id, + name, + status + ), + most_ordered_items_rank as ( + select + *, + rank() over ( + order by + num_times_in_successful_orders desc + ) row_rank + from + most_ordered_items + ) +select + product_id, product_name, num_times_in_successful_orders -from most_ordered_items_rank -where row_rank = 1 - +from + most_ordered_items_rank +where + row_rank = 1; -- Question 2a.2 -with order_quantity as - ( select customer_id, - e.event_data->>'event_type' as event_type, - e.event_data->>'item_id' as item_id, - e.event_data->>'quantity' as quantity - from alt_school.events e - where customer_id in - ( select distinct customer_id - from alt_school.events e - where e.event_data->>'status' = 'success' ) - and e.event_data->>'event_type' not in ('checkout','visit') ), - spender as - ( select customer_id, - sum(quantity::int * price) as total_spend - from order_quantity o - join alt_school.products p on o.item_id::int = p.id - where quantity is not null - group by customer_id), - spender_rank as - ( select customer_id, - location, - total_spend, - rank() over(order by total_spend desc ) ROW_RANK - from spender - join alt_school.customers using (customer_id)) -select customer_id, +with + order_quantity as ( + select + customer_id, + e.event_data - > > 'event_type' as event_type, + e.event_data - > > 'item_id' as item_id, + e.event_data - > > 'quantity' as quantity + from + alt_school.events e + where + customer_id in ( + select distinct + customer_id + from + alt_school.events e + where + e.event_data - > > 'status' = 'success' + ) + and e.event_data - > > 'event_type' not in ('checkout', 'visit') + ), + spender as ( + select + customer_id, + sum(cast(quantity as integer) * price) as total_spend + from + order_quantity o + join alt_school.products p on cast(o.item_id as integer) = p.id + where + quantity is not null + group by + customer_id + ), + spender_rank as ( + select + customer_id, + location, + total_spend, + rank() over ( + order by + total_spend desc + ) ROW_RANK + from + spender + join alt_school.customers using (customer_id) + ) +select + customer_id, location, total_spend -from spender_rank -where row_rank <= 5 +from + spender_rank +where + row_rank <= 5; -- Question 2b.1 @@ -72,24 +113,38 @@ To get the most common location where successful checkouts occurred, created 2 C The rank is done in descending order, finally, filter from this CTE where the row_rank is equal to 1. */ -with location_count as - ( select location, - count(1) as checkout_count - from alt_school.events e - join alt_school.customers using(customer_id) - where e.event_data->>'event_type' = 'checkout' - and e.event_data->>'status' = 'success' - group by location), - location_count_rank as - ( select location, - checkout_count, - rank() over(order by checkout_count desc ) row_rank - from location_count) -select location, +with + location_count as ( + select + location, + count(1) as checkout_count + from + alt_school.events e + join alt_school.customers using (customer_id) + where + e.event_data - > > 'event_type' = 'checkout' + and e.event_data - > > 'status' = 'success' + group by + location + ), + location_count_rank as ( + select + location, + checkout_count, + rank() over ( + order by + checkout_count desc + ) row_rank + from + location_count + ) +select + location, checkout_count -from location_count_rank -where row_rank = 1 - +from + location_count_rank +where + row_rank = 1; -- Question 2b.2 @@ -104,27 +159,39 @@ To identify the customers who abandoned their carts, created a CTE. Sum the count of remaining events that occurred grouped by customer_id, this result to the num_of_events before the abandonment. */ -with event_group as - ( select customer_id, - e.event_data->>'event_type' as event_type, - e.event_data->>'status' as status, - count(1) as event_count - from alt_school.events e - group by customer_id, - e.event_data->>'event_type', - e.event_data->>'status') -select customer_id, +with + event_group as ( + select + customer_id, + e.event_data - > > 'event_type' as event_type, + e.event_data - > > 'status' as status, + count(1) as event_count + from + alt_school.events e + group by + customer_id, + e.event_data - > > 'event_type', + e.event_data - > > 'status' + ) +select + customer_id, sum(event_count) as num_events -from event_group -join alt_school.customers c using(customer_id) -where customer_id not in - ( select distinct customer_id - from event_group - where event_type = 'checkout' - and status = 'success' ) - and event_type != 'visit' -group by customer_id - +from + event_group + join alt_school.customers c using (customer_id) +where + customer_id not in ( + select distinct + customer_id + from + event_group + where + event_type = 'checkout' + and status = 'success' + ) + and event_type != 'visit' +group by + customer_id; -- question 2b.3 /* @@ -135,15 +202,27 @@ To find the average number of visits per customer, created a CTE. filter for customers who successfully checked out by using a subquery returns a list, and considering only event_type is 'visit'. In the final query, get the average of the event_count. */ -with event_group as - ( select customer_id, - count(distinct e.event_data->>'timestamp') as event_count - from alt_school.events e - where customer_id in - ( select distinct customer_id - from alt_school.events e - where e.event_data->>'status' = 'success' ) - and e.event_data->>'event_type' = 'visit' - group by customer_id) -select avg(event_count)::numeric(5, 2) as average_visits -from event_group; \ No newline at end of file +with + event_group as ( + select + customer_id, + count(distinct e.event_data - > > 'timestamp') as event_count + from + alt_school.events e + where + customer_id in ( + select distinct + customer_id + from + alt_school.events e + where + e.event_data - > > 'status' = 'success' + ) + and e.event_data - > > 'event_type' = 'visit' + group by + customer_id + ) +select + round(avg(event_count), 2) as average_visits +from + event_group; \ No newline at end of file From 4b70e242a241780a3d0c28fb87e180ca93f63a5a Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 17:52:48 +0200 Subject: [PATCH 16/18] style: format query to improve readability --- infra_setup/init.sql | 103 ++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/infra_setup/init.sql b/infra_setup/init.sql index cdc8c3b..0094c76 100644 --- a/infra_setup/init.sql +++ b/infra_setup/init.sql @@ -1,72 +1,73 @@ - -- Create schema CREATE SCHEMA IF NOT EXISTS ALT_SCHOOL; - -- create and populate tables -create table if not exists ALT_SCHOOL.PRODUCTS -( - id serial primary key, - name varchar not null, - price numeric(10, 2) not null -); - +create table + if not exists ALT_SCHOOL.PRODUCTS ( + id serial primary key, + name varchar not null, + price numeric(10, 2) not null + ); COPY ALT_SCHOOL.PRODUCTS (id, name, price) -FROM '/data/products.csv' DELIMITER ',' CSV HEADER; +FROM + '/data/products.csv' DELIMITER ',' CSV HEADER; -- setup customers table following the example above -create table if not exists ALT_SCHOOL.CUSTOMERS -( - customer_id uuid primary key, - device_id uuid NOT NULL, - "location" varchar(255) NOT NULL, - currency varchar(10) NULL -); +create table + if not exists ALT_SCHOOL.CUSTOMERS ( + customer_id uuid primary key, + device_id uuid NOT NULL, + "location" varchar(255) NOT NULL, + currency varchar(10) NULL + ); -- copy the customers data in the /data folder into ALT_SCHOOL.CUSTOMERS COPY ALT_SCHOOL.CUSTOMERS (customer_id, device_id, "location", currency) -FROM '/data/customers.csv' DELIMITER ',' CSV HEADER; +FROM + '/data/customers.csv' DELIMITER ',' CSV HEADER; -create table if not exists ALT_SCHOOL.ORDERS -( - order_id uuid not null primary key, - customer_id uuid not null, - "status" varchar(50) not null, - checked_out_at timestamp not null -); +create table + if not exists ALT_SCHOOL.ORDERS ( + order_id uuid not null primary key, + customer_id uuid not null, + "status" varchar(50) not null, + checked_out_at timestamp not null + ); -- copy orders data into POSTGRES COPY ALT_SCHOOL.ORDERS (order_id, customer_id, "status", checked_out_at) -FROM '/data/orders.csv' DELIMITER ',' CSV HEADER; +FROM + '/data/orders.csv' DELIMITER ',' CSV HEADER; -create table if not exists ALT_SCHOOL.LINE_ITEMS -( - line_item_id serial primary key, - order_id uuid NOT NULL, - item_id int8 NOT NULL, - quantity int8 NOT NULL -); +create table + if not exists ALT_SCHOOL.LINE_ITEMS ( + line_item_id serial primary key, + order_id uuid NOT NULL, + item_id int8 NOT NULL, + quantity int8 NOT NULL + ); -- copy ALT_SCHOOL.LINE_ITEMS data into POSTGRES -COPY ALT_SCHOOL.LINE_ITEMS (line_item_id,order_id, item_id, quantity) -FROM '/data/line_items.csv' DELIMITER ',' CSV HEADER; +COPY ALT_SCHOOL.LINE_ITEMS (line_item_id, order_id, item_id, quantity) +FROM + '/data/line_items.csv' DELIMITER ',' CSV HEADER; -- setup the events table -create table if not exists ALT_SCHOOL.EVENTS -( - event_id serial primary key, - customer_id uuid NOT NULL, - event_data jsonb NOT NULL, - event_timestamp timestamp NOT NULL -); --- copy ALT_SCHOOL.EVENTS data into POSTGRES - -COPY ALT_SCHOOL.EVENTS (event_id,customer_id, event_data, event_timestamp) -FROM '/data/events.csv' DELIMITER ',' CSV HEADER; - - - - - +create table + if not exists ALT_SCHOOL.EVENTS ( + event_id serial primary key, + customer_id uuid NOT NULL, + event_data jsonb NOT NULL, + event_timestamp timestamp NOT NULL + ); +-- copy ALT_SCHOOL.EVENTS data into POSTGRES +COPY ALT_SCHOOL.EVENTS ( + event_id, + customer_id, + event_data, + event_timestamp +) +FROM + '/data/events.csv' DELIMITER ',' CSV HEADER; \ No newline at end of file From 0a7f29484d73033f2935182b562a668239faf877 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 17:59:19 +0200 Subject: [PATCH 17/18] docs(question 2a.2): add thought process and explanation --- questions/answers.sql | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/questions/answers.sql b/questions/answers.sql index a0b3da5..45d895f 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -47,8 +47,19 @@ from most_ordered_items_rank where row_rank = 1; + -- Question 2a.2 +/* +without considering currency, and without using the line_item table, find the top 5 spenders +To find the top 5 spenders, created 3 CTEs, order_quantity, spender, and spender_rank. +1. order_quantity: in this CTE, only customers who successfully checked out are selected. +2. spender: in this CTE, the total spend of each customer is calculated by multiplying the quantity of each product by the price of the product. +3. spender_rank: in this CTE, the rank() function is used to rank the customers based on the total spend. + The rank is done in descending order, finally, filter from this CTE where the row_rank is less than or equal to 5. + +Note: Do not use order by and limit 5 in the final query, as it will not work in the case of ties. +*/ with order_quantity as ( select From d22c1e4695b5f4a0498e68ddd70f6e3e108246d3 Mon Sep 17 00:00:00 2001 From: Sophia Lawal Date: Tue, 23 Apr 2024 19:37:30 +0200 Subject: [PATCH 18/18] fix: jsonb operator --- questions/answers.sql | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/questions/answers.sql b/questions/answers.sql index 45d895f..ea76cfa 100644 --- a/questions/answers.sql +++ b/questions/answers.sql @@ -64,9 +64,9 @@ with order_quantity as ( select customer_id, - e.event_data - > > 'event_type' as event_type, - e.event_data - > > 'item_id' as item_id, - e.event_data - > > 'quantity' as quantity + e.event_data ->> 'event_type' as event_type, + e.event_data ->> 'item_id' as item_id, + e.event_data ->> 'quantity' as quantity from alt_school.events e where @@ -76,9 +76,9 @@ with from alt_school.events e where - e.event_data - > > 'status' = 'success' + e.event_data ->> 'status' = 'success' ) - and e.event_data - > > 'event_type' not in ('checkout', 'visit') + and e.event_data ->> 'event_type' not in ('checkout', 'visit') ), spender as ( select @@ -133,8 +133,8 @@ with alt_school.events e join alt_school.customers using (customer_id) where - e.event_data - > > 'event_type' = 'checkout' - and e.event_data - > > 'status' = 'success' + e.event_data ->> 'event_type' = 'checkout' + and e.event_data ->> 'status' = 'success' group by location ), @@ -174,15 +174,15 @@ with event_group as ( select customer_id, - e.event_data - > > 'event_type' as event_type, - e.event_data - > > 'status' as status, + e.event_data ->> 'event_type' as event_type, + e.event_data ->> 'status' as status, count(1) as event_count from alt_school.events e group by customer_id, - e.event_data - > > 'event_type', - e.event_data - > > 'status' + e.event_data ->> 'event_type', + e.event_data ->> 'status' ) select customer_id, @@ -217,7 +217,7 @@ with event_group as ( select customer_id, - count(distinct e.event_data - > > 'timestamp') as event_count + count(distinct e.event_data ->> 'timestamp') as event_count from alt_school.events e where @@ -227,9 +227,9 @@ with from alt_school.events e where - e.event_data - > > 'status' = 'success' + e.event_data ->> 'status' = 'success' ) - and e.event_data - > > 'event_type' = 'visit' + and e.event_data ->> 'event_type' = 'visit' group by customer_id )