ClickHouse Developer Lab Review March 2024
Main changes
- standardize responses
- fix some answers
- fix some typos
- add extra solutions to existing questions
- add solutions to new questions
pmusa committed Mar 20, 2024
1 parent b198733 commit e8f512f
Showing 19 changed files with 245 additions and 78 deletions.
22 changes: 9 additions & 13 deletions developer/01_getting_started/lab_1.2.sql
@@ -3,37 +3,33 @@ SELECT *
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 LIMIT 100;
 
-
 --Step 2:
 SELECT count()
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
 
 --Step 3:
 SELECT formatReadableQuantity(count())
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet');
 
 --Step 4:
 SELECT
     avg(volume)
-FROM s3(
-    'https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
+FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 WHERE
     crypto_name = 'Bitcoin';
 
 --Step 5:
 SELECT
     crypto_name,
     count() AS count
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
 GROUP BY crypto_name
 ORDER BY crypto_name;
 
 --Step 6:
 SELECT
-    crypto_name,
-    count() AS count
+    trim(crypto_name) as name,
+    count() AS count
 FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
-GROUP BY crypto_name
-ORDER BY trim(crypto_name);
+GROUP BY name
+ORDER BY name;
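Side note on Step 6: the switch to trim(crypto_name) suggests some crypto_name values carry leading or trailing whitespace. A quick way to confirm that (a sketch, not part of the lab) is:

SELECT DISTINCT crypto_name
FROM s3('https://learnclickhouse.s3.us-east-2.amazonaws.com/datasets/crypto_prices.parquet')
WHERE crypto_name != trim(crypto_name)
LIMIT 10;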
81 changes: 72 additions & 9 deletions developer/02_clickhouse_architecture/lab_2.1.sql
@@ -1,3 +1,27 @@
+--Step 2:
+SELECT *
+FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet')
+LIMIT 10;
+
+--Step 3:
+SELECT count()
+FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet');
+
+--Step 4:
+CREATE TABLE pypi (
+    TIMESTAMP DateTime,
+    COUNTRY_CODE String,
+    URL String,
+    PROJECT String
+)
+ENGINE = MergeTree
+PRIMARY KEY TIMESTAMP;
+
+--Step 5:
+INSERT INTO pypi
+    SELECT TIMESTAMP, COUNTRY_CODE, URL, PROJECT
+    FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2023/pypi_0_7_34.snappy.parquet');
+
 --Step 6:
 SELECT
     PROJECT,
@@ -8,9 +32,12 @@ ORDER BY c DESC
 LIMIT 100;
 
 --Step 7:
--- All of the rows were read, because the query had no WHERE clause - so ClickHouse needed to process every granule.
+/*
+ * All of the rows were read, because the query had no WHERE clause - so
+ * ClickHouse needed to process every granule.
+ */
 
---Step 9:
+--Step 8:
 SELECT
     PROJECT,
     count() AS c
@@ -20,14 +47,50 @@ GROUP BY PROJECT
 ORDER BY c DESC
 LIMIT 100;
 
+SELECT
+    PROJECT,
+    count() AS c
+FROM pypi
+WHERE TIMESTAMP >= toDate('2023-04-01') AND TIMESTAMP < toDate('2023-05-01')
+GROUP BY PROJECT
+ORDER BY c DESC
+LIMIT 100;
+
+--Step 9:
+/*
+ * Your answer may vary by a granule or two, but the query only has to process
+ * 565,248 rows, which is exactly 8,192 x 69. So the query processed 69
+ * granules instead of performing a scan of the entire table. Why? Because the
+ * primary key is the TIMESTAMP column, which allows ClickHouse to skip about
+ * 1/3 of the data.
+ */
+
 --Step 10:
---Your answer may vary by a granule or two, but the query only has to process 565,248 rows, which is exactly 8,192 x 69. So the query processed 69 granules instead of performing a scan of the entire table. Why? Because the primary key is the TIMESTAMP column, which allows ClickHouse to skip about 1/4 of the data.
+SELECT
+    PROJECT,
+    count() AS c
+FROM pypi
+WHERE PROJECT LIKE 'boto%'
+GROUP BY PROJECT
+ORDER BY c DESC;
 
---Step 12:
---The PROJECT column is not in the primary key, so the primary index is no help in skipping granules.
+--Step 11:
+/*
+ * The PROJECT column is not in the primary key, so the primary index is no
+ * help in skipping granules.
+ */
 
---Step 14:
---None. Even though PROJECT was added to the primary key, it did not allow ClickHouse to skip any granules. Why? Because the TIMESTAMP has a high cardinality that is making any subsequent columns in the primary key difficult to be useful.
+--Step 13:
+/*
+ * None. Even though PROJECT was added to the primary key, it did not allow
+ * ClickHouse to skip any granules. Why? Because TIMESTAMP has such high
+ * cardinality that it makes any subsequent columns in the primary key much
+ * less useful.
+ */
 
---Step 16:
---The first column of the primary key is an important and powerful design decision. By putting PROJECT first, we are assuring that our queries that filter by PROJECT will process a minimum amount of rows.
+--Step 15:
+/*
+ * The first column of the primary key is an important and powerful design
+ * decision. By putting PROJECT first, we ensure that queries that filter by
+ * PROJECT will process a minimal number of rows.
+ */
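To verify the granule counts quoted in Steps 9, 11, 13, and 15, EXPLAIN with indexes = 1 reports how many parts and granules the primary index selects (a sketch using the pypi table created in Step 4):

EXPLAIN indexes = 1
SELECT count()
FROM pypi
WHERE TIMESTAMP >= toDate('2023-04-01') AND TIMESTAMP < toDate('2023-05-01');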
13 changes: 2 additions & 11 deletions developer/02_clickhouse_architecture/lab_2.2.sql
@@ -6,16 +6,7 @@ CREATE TABLE test_pypi (
     PROJECT String
 )
 ENGINE = MergeTree
-PRIMARY KEY (PROJECT, TIMESTAMP)
-ORDER BY (PROJECT, TIMESTAMP, COUNTRY_CODE);
+PRIMARY KEY (PROJECT, COUNTRY_CODE, TIMESTAMP);
 
 INSERT INTO test_pypi
-SELECT * FROM pypi2;
-
-
---Step 5:
-SELECT
-    formatReadableSize(sum(data_compressed_bytes)) AS compressed_size,
-    count() AS num_of_active_parts
-FROM system.parts
-WHERE (active = 1) AND (table = 'test_pypi');
+    SELECT * FROM pypi2;
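Side note on the new definition: when a MergeTree table specifies only PRIMARY KEY, ORDER BY defaults to the same tuple, so it is equivalent to spelling the ORDER BY out. A hypothetical equivalent (the name test_pypi_alt is illustrative, and the column list assumes test_pypi mirrors pypi):

CREATE TABLE test_pypi_alt (
    TIMESTAMP DateTime,
    COUNTRY_CODE String,
    URL String,
    PROJECT String
)
ENGINE = MergeTree
ORDER BY (PROJECT, COUNTRY_CODE, TIMESTAMP);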
19 changes: 14 additions & 5 deletions developer/03_modeling_data/lab_3.1.sql
@@ -1,18 +1,27 @@
---Step 3:
+--Step 2:
 SELECT uniqExact(COUNTRY_CODE)
 FROM pypi;
 
--- You will notice there are only 186 unique values of the country code, which makes it a great candidate for LowCardinality.
+/*
+ * You will notice there are only 186 unique values of the country code, which
+ * makes it a great candidate for LowCardinality.
+ */
 
---Step 4:
+--Step 3:
 SELECT
     uniqExact(PROJECT),
     uniqExact(URL)
 FROM pypi;
 
--- There are over 24,000 unique values of PROJECT, which is large - but not too large. We will try LowCardinality on this column as well and see if it improves storage and query performance. The URL has over 79,000 unique values, and we can assume that a URL could have a lot of different values, so it is probably a bad choice for LowCardinality.
+/*
+ * There are over 24,000 unique values of PROJECT, which is large - but not
+ * too large. We will try LowCardinality on this column as well and see if it
+ * improves storage and query performance. URL has over 79,000 unique values,
+ * and new URLs can appear without bound, so it is probably a bad choice for
+ * LowCardinality.
+ */
 
---Step 5:
+--Step 4:
 CREATE TABLE pypi3 (
     TIMESTAMP DateTime,
     COUNTRY_CODE LowCardinality(String),
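To check whether LowCardinality actually pays off, one option is to compare per-column sizes (a sketch; assumes pypi3 has been created and populated as in the lab):

SELECT
    name,
    formatReadableSize(data_compressed_bytes) AS compressed,
    formatReadableSize(data_uncompressed_bytes) AS uncompressed
FROM system.columns
WHERE database = currentDatabase() AND table = 'pypi3';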
11 changes: 10 additions & 1 deletion developer/03_modeling_data/lab_3.2.sql
@@ -24,14 +24,23 @@ SELECT count()
 FROM crypto_prices
 WHERE volume >= 1_000_000;
 
+/*
+ * It read all of the rows because volume is not part of the primary key.
+ */
+
 --Step 6:
 SELECT
     avg(price)
 FROM crypto_prices
 WHERE crypto_name = 'Bitcoin';
 
+/*
+ * Only a single granule was processed. As crypto_name is part of the primary
+ * key, ClickHouse uses it to optimize the query.
+ */
+
 --Step 7:
 SELECT
     avg(price)
 FROM crypto_prices
 WHERE crypto_name LIKE 'B%';
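The rows-read claims above can be double-checked against system.query_log, if query logging is enabled (a sketch; the log is flushed asynchronously, so very recent queries may take a few seconds to appear):

SELECT event_time, read_rows, query
FROM system.query_log
WHERE type = 'QueryFinish' AND query ILIKE '%crypto_prices%'
ORDER BY event_time DESC
LIMIT 5;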
31 changes: 22 additions & 9 deletions developer/04_inserting_data/lab_4.2.sql
@@ -1,5 +1,7 @@
 --Step 1:
-DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/uk_property_prices.snappy.parquet');
+DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/uk_property_prices.snappy.parquet')
+SETTINGS
+    schema_inference_make_columns_nullable=false;
 
 --Step 2:
 CREATE TABLE uk_price_paid
@@ -8,9 +10,9 @@ CREATE TABLE uk_price_paid
     date Date,
     postcode1 LowCardinality(String),
     postcode2 LowCardinality(String),
-    type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
+    type Enum('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
     is_new UInt8,
-    duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
+    duration Enum('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
     addr1 String,
     addr2 String,
     street LowCardinality(String),
@@ -36,22 +38,33 @@ SELECT avg(price)
 FROM uk_price_paid
 WHERE postcode1 = 'LU1' AND postcode2 = '5FT';
 
--- The primary key contains postcode1 and postcode2 as the first two columns, so filtering by both allows ClickHouse to skip the most granules.
+/*
+ * The primary key contains postcode1 and postcode2 as the first two columns,
+ * so filtering by both allows ClickHouse to skip the most granules.
+ */
 
 --Step 6:
 SELECT avg(price)
 FROM uk_price_paid
 WHERE postcode2 = '5FT';
 
-- The postcode2 column is the second column in the primary key, which allows ClickHouse to avoid about 1/3 of the table.
-- Not bad, but note that the second value of a primary key is not as helpful in our dataset as the first column of the primary key.
-- This all depends on your dataset, but this query gives you an idea of how you should think through and test if a column will be useful before adding it to the primary key.
-- In this example, postcode2 seems beneficial (assuming we need to filter by postcode2 regularly.)
+/*
+ * The postcode2 column is the second column in the primary key, which allows
+ * ClickHouse to avoid about 1/3 of the table. Not bad, but note that the
+ * second column of a primary key is not as helpful in our dataset as the
+ * first. This all depends on your dataset, but this query gives you an idea
+ * of how you should think through and test whether a column will be useful
+ * before adding it to the primary key. In this example, postcode2 seems
+ * beneficial (assuming we regularly filter by postcode2).
+ */
 
 --Step 7:
 SELECT avg(price)
 FROM uk_price_paid
 WHERE town = 'YORK';
 
--- The town column is not a part of the primary key, so the primary index does not provide any skipping of granules.
+/*
+ * The town column is not part of the primary key, so the primary index does
+ * not provide any skipping of granules.
+ */
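On the Enum8 to Enum change in Step 2: a plain Enum lets ClickHouse choose the narrowest underlying width, which still resolves to Enum8 here since both columns have fewer than 256 values. A quick check:

SELECT toTypeName(type), toTypeName(duration)
FROM uk_price_paid
LIMIT 1;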

14 changes: 9 additions & 5 deletions developer/04_inserting_data/lab_4.3.sql
@@ -9,11 +9,15 @@ FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.cs
 SETTINGS format_csv_delimiter = '~';
 
 --Step 3:
-SELECT formatReadableQuantity(sum(approve_amount))
+SELECT formatReadableQuantity(sum(approved_amount))
 FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
 SETTINGS format_csv_delimiter = '~';
 
--- You get an exception telling you that trying to sum a String column is not allowed. Apparently, the approved_amount column is not entirely numeric data, and ClickHouse inferred that column as a String.
+/*
+ * You get an exception telling you that trying to sum a String column is not
+ * allowed. Apparently, the approved_amount column is not entirely numeric
+ * data, and ClickHouse inferred that column as a String.
+ */
 
 --Step 4:
 DESCRIBE s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
@@ -32,7 +36,7 @@ CREATE TABLE operating_budget (
     service LowCardinality(String),
     department LowCardinality(String),
     program LowCardinality(String),
-    program_code UInt32,
+    program_code LowCardinality(String),
     description String,
     item_category LowCardinality(String),
     approved_amount UInt32,
@@ -53,7 +57,7 @@ INSERT INTO operating_budget
     c2 AS service,
     c3 AS department,
     result[1] AS program,
-    toUInt32OrZero(splitByChar(')',result[2])[1]) AS program_code,
+    splitByChar(')',result[2])[1] AS program_code,
     c5 AS description,
     c6 AS item_category,
     toUInt32OrZero(c7) AS approved_amount,
@@ -92,4 +96,4 @@ WHERE fiscal_year = '2022';
 SELECT sum(actual_amount)
 FROM operating_budget
 WHERE fiscal_year = '2022'
-AND program_code = 31;
+AND program_code = '031';
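If you only want the Step 3 sum to succeed without building the table first, one workaround is to cast in place with the same toUInt32OrZero used by the INSERT (non-numeric values count as zero):

SELECT formatReadableQuantity(sum(toUInt32OrZero(approved_amount)))
FROM s3('https://learn-clickhouse.s3.us-east-2.amazonaws.com/operating_budget.csv')
SETTINGS format_csv_delimiter = '~';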
4 changes: 4 additions & 0 deletions developer/05_analyzing_data/lab_5.1.sql
@@ -61,6 +61,10 @@ SELECT
     avgIf(price, type = 'other')
 FROM uk_price_paid;
 
+SELECT type, avg(price) as avg_price
+FROM uk_price_paid
+GROUP BY type;
+
 --Step 9:
 SELECT
     formatReadableQuantity(sum(price))
5 changes: 4 additions & 1 deletion developer/06_materialized_views/lab_6.2.sql
@@ -70,7 +70,10 @@ SELECT
 FROM prices_by_year_dest
 WHERE toYear(date) = '2020';
 
--- The query only needs to read 886,642 rows, which is exactly how many properties were sold in the UK in 2020.
+/*
+ * The query only needs to read 886,642 rows, which is exactly how many
+ * properties were sold in the UK in 2020.
+ */
 
 --Step 11:
 SELECT
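The 886,642 figure can be cross-checked from system.parts (a sketch; assumes prices_by_year_dest is partitioned by year, as its name suggests):

SELECT partition, sum(rows) AS rows
FROM system.parts
WHERE database = currentDatabase() AND table = 'prices_by_year_dest' AND active
GROUP BY partition
ORDER BY partition;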