ClickHouse Developer Lab Review March 2024
Main changes
- standardize responses
- fix some answers
- fix some typos
- add extra solutions to existing questions
- add solutions to new questions
pmusa committed Mar 20, 2024
1 parent b198733 commit e8f512f
Showing 19 changed files with 245 additions and 78 deletions.
22 changes: 9 additions & 13 deletions developer/01_getting_started/lab_1.2.sql
@@ -3,37 +3,33 @@ SELECT *
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 LIMIT 100;
 
-
 --Step 2:
 SELECT count()
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
 
 --Step 3:
 SELECT formatReadableQuantity(count())
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
 
 --Step 4:
 SELECT
     avg(volume)
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 WHERE
     crypto_name = 'Bitcoin';
 
 --Step 5:
 SELECT
     crypto_name,
     count() AS count
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 GROUP BY crypto_name
 ORDER BY crypto_name;
 
 --Step 6:
 SELECT
-    crypto_name,
-    count() AS count
+    trim(crypto_name) as name,
+    count() AS count
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
-GROUP BY crypto_name
-ORDER BY trim(crypto_name);
+GROUP BY name
+ORDER BY name;
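Side note on Step 6: the switch to trim(crypto_name) suggests some crypto_name values carry leading or trailing whitespace. A quick way to confirm that (a sketch, not part of the lab) is:

SELECT DISTINCT crypto_name
FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
WHERE crypto_name != trim(crypto_name)
LIMIT 10;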
81 changes: 72 additions & 9 deletions developer/02_clickhouse_architecture/lab_2.1.sql
@@ -1,3 +1,27 @@
+--Step 2:
+SELECT *
+FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet')
+LIMIT 10;
+
+--Step 3:
+SELECT count()
+FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet');
+
+--Step 4:
+CREATE TABLE pypi (
+    TIMESTAMP DateTime,
+    COUNTRY_CODE String,
+    URL String,
+    PROJECT String
+)
+ENGINE = MergeTree
+PRIMARY KEY TIMESTAMP;
+
+--Step 5:
+INSERT INTO pypi
+    SELECT TIMESTAMP, COUNTRY_CODE, URL, PROJECT
+    FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet');
+
 --Step 6:
 SELECT
     PROJECT,
@@ -8,9 +32,12 @@ ORDER BY c DESC
 LIMIT 100;
 
 --Step 7:
--- All of the rows were read, because the query had no WHERE clause - so ClickHouse needed to process every granule.
+/*
+ * All of the rows were read, because the query had no WHERE clause - so
+ * ClickHouse needed to process every granule.
+ */
 
---Step 9:
+--Step 8:
 SELECT
     PROJECT,
     count() AS c
@@ -20,14 +47,50 @@ GROUP BY PROJECT
 ORDER BY c DESC
 LIMIT 100;
 
+SELECT
+    PROJECT,
+    count() AS c
+FROM pypi
+WHERE TIMESTAMP >= toDate('2023-04-01') AND TIMESTAMP < toDate('2023-05-01')
+GROUP BY PROJECT
+ORDER BY c DESC
+LIMIT 100;
+
+--Step 9:
+/*
+ * Your answer may vary by a granule or two, but the query only has to process
+ * 565,248 rows, which is exactly 8,192 x 69. So the query processed 69
+ * granules instead of performing a scan of the entire table. Why? Because the
+ * primary key is the TIMESTAMP column, which allows ClickHouse to skip about
+ * 1/3 of the data.
+ */
+
 --Step 10:
---Your answer may vary by a granule or two, but the query only has to process 565,248 rows, which is exactly 8,192 x 69. So the query processed 69 granules instead of performing a scan of the entire table. Why? Because the primary key is the TIMESTAMP column, which allows ClickHouse to skip about 1/4 of the data.
+SELECT
+    PROJECT,
+    count() AS c
+FROM pypi
+WHERE PROJECT LIKE 'boto%'
+GROUP BY PROJECT
+ORDER BY c DESC;
 
---Step 12:
---The PROJECT column is not in the primary key, so the primary index is no help in skipping granules.
+--Step 11:
+/*
+ * The PROJECT column is not in the primary key, so the primary index is no
+ * help in skipping granules.
+ */
 
---Step 14:
---None. Even though PROJECT was added to the primary key, it did not allow ClickHouse to skip any granules. Why? Because the TIMESTAMP has a high cardinality that is making any subsequent columns in the primary key difficult to be useful.
+--Step 13:
+/*
+ * None. Even though PROJECT was added to the primary key, it did not allow
+ * ClickHouse to skip any granules. Why? Because TIMESTAMP has such high
+ * cardinality that it makes any subsequent columns in the primary key much
+ * less useful.
+ */
 
---Step 16:
---The first column of the primary key is an important and powerful design decision. By putting PROJECT first, we are assuring that our queries that filter by PROJECT will process a minimum amount of rows.
+--Step 15:
+/*
+ * The first column of the primary key is an important and powerful design
+ * decision. By putting PROJECT first, we ensure that queries that filter by
+ * PROJECT will process a minimal number of rows.
+ */
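To verify the granule counts quoted in Steps 9, 11, 13, and 15, EXPLAIN with indexes = 1 reports how many parts and granules the primary index selects (a sketch using the pypi table created in Step 4):

EXPLAIN indexes = 1
SELECT count()
FROM pypi
WHERE TIMESTAMP >= toDate('2023-04-01') AND TIMESTAMP < toDate('2023-05-01');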
13 changes: 2 additions & 11 deletions developer/02_clickhouse_architecture/lab_2.2.sql
@@ -6,16 +6,7 @@ CREATE TABLE test_pypi (
     PROJECT String
 )
 ENGINE = MergeTree
-PRIMARY KEY (PROJECT, TIMESTAMP)
-ORDER BY (PROJECT, TIMESTAMP, COUNTRY_CODE);
+PRIMARY KEY (PROJECT, COUNTRY_CODE, TIMESTAMP);
 
 INSERT INTO test_pypi
-SELECT * FROM pypi2;
-
-
---Step 5:
-SELECT
-    formatReadableSize(sum(data_compressed_bytes)) AS compressed_size,
-    count() AS num_of_active_parts
-FROM system.parts
-WHERE (active = 1) AND (table = 'test_pypi');
+    SELECT * FROM pypi2;
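Side note on the new definition: when a MergeTree table specifies only PRIMARY KEY, ORDER BY defaults to the same tuple, so it is equivalent to spelling the ORDER BY out. A hypothetical equivalent (the name test_pypi_alt is illustrative, and the column list assumes test_pypi mirrors pypi):

CREATE TABLE test_pypi_alt (
    TIMESTAMP DateTime,
    COUNTRY_CODE String,
    URL String,
    PROJECT String
)
ENGINE = MergeTree
ORDER BY (PROJECT, COUNTRY_CODE, TIMESTAMP);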
19 changes: 14 additions & 5 deletions developer/03_modeling_data/lab_3.1.sql
@@ -1,18 +1,27 @@
---Step 3:
+--Step 2:
 SELECT uniqExact(COUNTRY_CODE)
 FROM pypi;
 
--- You will notice there are only 186 unique values of the country code, which makes it a great candidate for LowCardinality.
+/*
+ * You will notice there are only 186 unique values of the country code, which
+ * makes it a great candidate for LowCardinality.
+ */
 
---Step 4:
+--Step 3:
 SELECT
     uniqExact(PROJECT),
     uniqExact(URL)
 FROM pypi;
 
--- There are over 24,000 unique values of PROJECT, which is large - but not too large. We will try LowCardinality on this column as well and see if it improves storage and query performance. The URL has over 79,000 unique values, and we can assume that a URL could have a lot of different values, so it is probably a bad choice for LowCardinality.
+/*
+ * There are over 24,000 unique values of PROJECT, which is large - but not
+ * too large. We will try LowCardinality on this column as well and see if it
+ * improves storage and query performance. URL has over 79,000 unique values,
+ * and new URLs can appear without bound, so it is probably a bad choice for
+ * LowCardinality.
+ */
 
---Step 5:
+--Step 4:
 CREATE TABLE pypi3 (
     TIMESTAMP DateTime,
     COUNTRY_CODE LowCardinality(String),
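To check whether LowCardinality actually pays off, one option is to compare per-column sizes (a sketch; assumes pypi3 has been created and populated as in the lab):

SELECT
    name,
    formatReadableSize(data_compressed_bytes) AS compressed,
    formatReadableSize(data_uncompressed_bytes) AS uncompressed
FROM system.columns
WHERE database = currentDatabase() AND table = 'pypi3';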
11 changes: 10 additions & 1 deletion developer/03_modeling_data/lab_3.2.sql
@@ -24,14 +24,23 @@ SELECT count()
 FROM crypto_prices
 WHERE volume >= 1_000_000;
 
+/*
+ * It read all of the rows because volume is not part of the primary key.
+ */
+
 --Step 6:
 SELECT
     avg(price)
 FROM crypto_prices
 WHERE crypto_name = 'Bitcoin';
 
+/*
+ * Only a single granule was processed. As crypto_name is part of the primary
+ * key, ClickHouse uses it to optimize the query.
+ */
+
 --Step 7:
 SELECT
     avg(price)
 FROM crypto_prices
 WHERE crypto_name LIKE 'B%';
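The rows-read claims above can be double-checked against system.query_log, if query logging is enabled (a sketch; the log is flushed asynchronously, so very recent queries may take a few seconds to appear):

SELECT event_time, read_rows, query
FROM system.query_log
WHERE type = 'QueryFinish' AND query ILIKE '%crypto_prices%'
ORDER BY event_time DESC
LIMIT 5;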
31 changes: 22 additions & 9 deletions developer/04_inserting_data/lab_4.2.sql
@@ -1,5 +1,7 @@
 --Step 1:
-DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/uk_property_prices.snappy.parquet');
+DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/uk_property_prices.snappy.parquet')
+SETTINGS
+    schema_inference_make_columns_nullable=false;
 
 --Step 2:
 CREATE TABLE uk_price_paid
@@ -8,9 +10,9 @@ CREATE TABLE uk_price_paid
     date Date,
     postcode1 LowCardinality(String),
     postcode2 LowCardinality(String),
-    type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
+    type Enum('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
     is_new UInt8,
-    duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
+    duration Enum('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
     addr1 String,
     addr2 String,
     street LowCardinality(String),
@@ -36,22 +38,33 @@ SELECT avg(price)
 FROM uk_price_paid
 WHERE postcode1 = 'LU1' AND postcode2 = '5FT';
 
--- The primary key contains postcode1 and postcode2 as the first two columns, so filtering by both allows ClickHouse to skip the most granules.
+/*
+ * The primary key contains postcode1 and postcode2 as the first two columns,
+ * so filtering by both allows ClickHouse to skip the most granules.
+ */
 
 --Step 6:
 SELECT avg(price)
 FROM uk_price_paid
 WHERE postcode2 = '5FT';
 
-- The postcode2 column is the second column in the primary key, which allows ClickHouse to avoid about 1/3 of the table.
-- Not bad, but note that the second value of a primary key is not as helpful in our dataset as the first column of the primary key.
-- This all depends on your dataset, but this query gives you an idea of how you should think through and test if a column will be useful before adding it to the primary key.
-- In this example, postcode2 seems beneficial (assuming we need to filter by postcode2 regularly.)
+/*
+ * The postcode2 column is the second column in the primary key, which allows
+ * ClickHouse to avoid about 1/3 of the table. Not bad, but note that the
+ * second column of a primary key is not as helpful in our dataset as the
+ * first. This all depends on your dataset, but this query gives you an idea
+ * of how you should think through and test whether a column will be useful
+ * before adding it to the primary key. In this example, postcode2 seems
+ * beneficial (assuming we regularly filter by postcode2).
+ */
 
 --Step 7:
 SELECT avg(price)
 FROM uk_price_paid
 WHERE town = 'YORK';
 
--- The town column is not a part of the primary key, so the primary index does not provide any skipping of granules.
+/*
+ * The town column is not part of the primary key, so the primary index does
+ * not provide any skipping of granules.
+ */
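On the Enum8 to Enum change in Step 2: a plain Enum lets ClickHouse choose the narrowest underlying width, which still resolves to Enum8 here since both columns have fewer than 256 values. A quick check:

SELECT toTypeName(type), toTypeName(duration)
FROM uk_price_paid
LIMIT 1;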

14 changes: 9 additions & 5 deletions developer/04_inserting_data/lab_4.3.sql
@@ -9,11 +9,15 @@ FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.cs
 SETTINGS format_csv_delimiter = '~';
 
 --Step 3:
-SELECT formatReadableQuantity(sum(approve_amount))
+SELECT formatReadableQuantity(sum(approved_amount))
 FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
 SETTINGS format_csv_delimiter = '~';
 
--- You get an exception telling you that trying to sum a String column is not allowed. Apparently, the approved_amount column is not entirely numeric data, and ClickHouse inferred that column as a String.
+/*
+ * You get an exception telling you that trying to sum a String column is not
+ * allowed. Apparently, the approved_amount column is not entirely numeric
+ * data, and ClickHouse inferred that column as a String.
+ */
 
 --Step 4:
 DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
@@ -32,7 +36,7 @@ CREATE TABLE operating_budget (
     service LowCardinality(String),
     department LowCardinality(String),
     program LowCardinality(String),
-    program_code UInt32,
+    program_code LowCardinality(String),
     description String,
     item_category LowCardinality(String),
     approved_amount UInt32,
@@ -53,7 +57,7 @@ INSERT INTO operating_budget
     c2 AS service,
     c3 AS department,
     result[1] AS program,
-    toUInt32OrZero(splitByChar(')',result[2])[1]) AS program_code,
+    splitByChar(')',result[2])[1] AS program_code,
     c5 AS description,
     c6 AS item_category,
     toUInt32OrZero(c7) AS approved_amount,
@@ -92,4 +96,4 @@ WHERE fiscal_year = '2022';
 SELECT sum(actual_amount)
 FROM operating_budget
 WHERE fiscal_year = '2022'
-AND program_code = 31;
+AND program_code = '031';
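If you only want the Step 3 sum to succeed without building the table first, one workaround is to cast in place with the same toUInt32OrZero used by the INSERT (non-numeric values count as zero):

SELECT formatReadableQuantity(sum(toUInt32OrZero(approved_amount)))
FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
SETTINGS format_csv_delimiter = '~';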
4 changes: 4 additions & 0 deletions developer/05_analyzing_data/lab_5.1.sql
@@ -61,6 +61,10 @@ SELECT
     avgIf(price, type = 'other')
 FROM uk_price_paid;
 
+SELECT type, avg(price) as avg_price
+FROM uk_price_paid
+GROUP BY type;
+
 --Step 9:
 SELECT
     formatReadableQuantity(sum(price))
5 changes: 4 additions & 1 deletion developer/06_materialized_views/lab_6.2.sql
@@ -70,7 +70,10 @@ SELECT
 FROM prices_by_year_dest
 WHERE toYear(date) = '2020';
 
--- The query only needs to read 886,642 rows, which is exactly how many properties were sold in the UK in 2020.
+/*
+ * The query only needs to read 886,642 rows, which is exactly how many
+ * properties were sold in the UK in 2020.
+ */
 
 --Step 11:
 SELECT
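The 886,642 figure can be cross-checked from system.parts (a sketch; assumes prices_by_year_dest is partitioned by year, as its name suggests):

SELECT partition, sum(rows) AS rows
FROM system.parts
WHERE database = currentDatabase() AND table = 'prices_by_year_dest' AND active
GROUP BY partition
ORDER BY partition;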