diff --git a/_modules/week-11.md b/_modules/week-11.md index 1b028f3..a9e658e 100644 --- a/_modules/week-11.md +++ b/_modules/week-11.md @@ -11,7 +11,7 @@ Nov 5 : [Mini-Lecture](https://youtu.be/bp7-OwxdGwg){:target="_blank"}, [Solutions](https://drive.google.com/file/d/1qhvi8H4nmnJnZlStYYNxmY5ZMf_B47ER/view?usp=sharing){:target="_blank"} Nov 7 -: **Lecture 21**{: .label .label-lecture} SQL II +: **Lecture 21**{: .label .label-lecture} [SQL II](lecture/lec21) Nov 8 diff --git a/lecture/lec21.md b/lecture/lec21.md index c4b58bd..4222a75 100644 --- a/lecture/lec21.md +++ b/lecture/lec21.md @@ -6,12 +6,12 @@ nav_exclude: true # Lecture 21 – SQL II -Presented by Joseph E. Gonzalez +Presented by Narges Norouzi Content by many dedicated Data 100 instructors at UC Berkeley. See our [Acknowledgments](../../acks) page. -- [slides](https://docs.google.com/presentation/d/1jJx4Qmsg9Vm2ZF5F4oXYC1f6DdlzbVWW3nKj9DNdLIs/edit?usp=sharing){:target="_blank"} -- [code](https://data100.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FDS-100%2Fsp24-student&urlpath=lab%2Ftree%2Fsp24-student%2Flecture%2Flec21%2Flec21.ipynb&branch=main){:target="_blank"} +- [slides](https://docs.google.com/presentation/d/1FYM0rOhev0kJCuJAC9b8NtZ3tUYMcGk-upWCvxEPRFM/edit?usp=share_link){:target="_blank"} +- [code](https://data100.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FDS-100%2Ffa24-student&urlpath=lab%2Ftree%2Ffa24-student%2Flecture%2Flec21%2Flec21.ipynb&branch=main){:target="_blank"} - [code HTML](../../resources/assets/lectures/lec21/lec21.html){:target="_blank"} -- [recording](https://youtu.be/ZIWQ5YGf-oY){:target="_blank"} \ No newline at end of file + \ No newline at end of file diff --git a/resources/assets/lectures/lec21/lec21.html b/resources/assets/lectures/lec21/lec21.html index b984f31..8e77e74 100644 --- a/resources/assets/lectures/lec21/lec21.html +++ b/resources/assets/lectures/lec21/lec21.html @@ -7516,8 +7516,8 @@
%load_ext sql
+
Add support for reading sqlite tables using duckdb
+Add support for reading sqlite
tables using duckdb.
import duckdb
@@ -7598,20 +7636,32 @@ Loading the Data
+
%sql duckdb:///data/basic_examples.db --alias basic
+%sql duckdb:///data/basic_examples.db --alias duckdb
Get the large IMDB database file
+Get the large IMDB database file for other examples in this demo.
from sqlalchemy import create_engine
@@ -7678,13 +7728,723 @@ Loading the Data
+
+
+
+
+
+More on Basic Queries¶
+
+
+
+
ORDER BY
¶%%sql duckdb
+SELECT *
+FROM Dragon
+ORDER BY cute DESC;  -- DESC sorts from the largest cute value down to the smallest
+
name | +year | +cute | +
---|---|---|
hiccup | +2010 | +10 | +
dragon 2 | +2019 | +0 | +
drogon | +2011 | +-100 | +
LIMIT
and OFFSET
¶%%sql duckdb
+SELECT *
+FROM Dragon
+LIMIT 2;  -- return at most two rows
+
name | +year | +cute | +
---|---|---|
hiccup | +2010 | +10 | +
drogon | +2011 | +-100 | +
%%sql duckdb
+SELECT *
+FROM Dragon
+LIMIT 2  -- return at most two rows ...
+OFFSET 1;  -- ... after skipping the first row
+
name | +year | +cute | +
---|---|---|
drogon | +2011 | +-100 | +
dragon 2 | +2019 | +0 | +
What if we wanted a random sample?
+%%sql duckdb
+SELECT *
+FROM Dragon
+ORDER BY RANDOM()  -- shuffle the rows; no seed is set in this cell
+LIMIT 2;  -- keep the first two rows of the shuffled order
+
name | +year | +cute | +
---|---|---|
dragon 2 | +2019 | +0 | +
hiccup | +2010 | +10 | +
%%sql duckdb
+SELECT *
+FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);  -- reservoir-sample exactly 2 rows; REPEATABLE (100) sets the sampling seed
+
name | +year | +cute | +
---|---|---|
hiccup | +2010 | +10 | +
drogon | +2011 | +-100 | +
GROUP BY
¶%%sql duckdb
+SELECT *
+FROM Dish;  -- show every row and column of Dish before grouping
+
name | +type | +cost | +
---|---|---|
ravioli | +entree | +10 | +
ramen | +entree | +7 | +
taco | +entree | +7 | +
edamame | +appetizer | +4 | +
fries | +appetizer | +4 | +
potsticker | +appetizer | +4 | +
ice cream | +dessert | +5 | +
%%sql duckdb
+SELECT type
+FROM Dish;  -- one output row per dish, so each type repeats
+
type | +
---|
entree | +
entree | +
entree | +
appetizer | +
appetizer | +
appetizer | +
dessert | +
%%sql duckdb
+SELECT type
+FROM Dish
+GROUP BY type;  -- collapse the rows to one per distinct type
+
type | +
---|
entree | +
dessert | +
appetizer | +
%%sql duckdb
+SELECT type, SUM(cost)  -- total cost of the dishes within each type
+FROM Dish
+GROUP BY type;
+
type | +sum("cost") | +
---|---|
entree | +24 | +
dessert | +5 | +
appetizer | +12 | +
%%sql duckdb
+SELECT type,
+ SUM(cost),  -- total cost within the type
+ MIN(cost),  -- lowest cost within the type
+ MAX(name)  -- MAX on text picks the name that sorts last within the type
+FROM Dish
+GROUP BY type;
+
type | +sum("cost") | +min("cost") | +max("name") | +
---|---|---|---|
entree | +24 | +7 | +taco | +
dessert | +5 | +5 | +ice cream | +
appetizer | +12 | +4 | +potsticker | +
%%sql duckdb
+SELECT year, COUNT(cute)  -- COUNT(column) counts only rows where cute is not NULL
+FROM Dragon
+GROUP BY year;
+
year | +count(cute) | +
---|---|
2011 | +1 | +
2019 | +1 | +
2010 | +1 | +
%%sql duckdb
+SELECT year, COUNT(*)  -- COUNT(*) counts every row in the group, NULLs included
+FROM Dragon
+GROUP BY year;
+
year | +count_star() | +
---|---|
2011 | +1 | +
2010 | +1 | +
2019 | +1 | +
HAVING
%%sql basic
+%%sql duckdb
SELECT *
FROM Dish;  -- show the full Dish table again before filtering groups with HAVING
@@ -7710,7 +8470,7 @@ Filtering Groups Using HAVING
-Out[6]:
+Out[19]:
@@ -7758,7 +8518,6 @@ Filtering Groups Using HAVING
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -7768,10 +8527,10 @@ Filtering Groups Using HAVING
-In [7]:
+In [20]:
-%%sql basic
+%%sql duckdb
SELECT type, COUNT(*)
FROM Dish
@@ -7787,7 +8546,7 @@ Filtering Groups Using HAVING
-Out[7]:
+Out[20]:
@@ -7807,7 +8566,6 @@ Filtering Groups Using HAVING
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -7817,10 +8575,10 @@ Filtering Groups Using HAVING
-In [8]:
+In [21]:
-%%sql basic
+%%sql duckdb
SELECT type, COUNT(*)
FROM Dish
@@ -7836,7 +8594,7 @@ Filtering Groups Using HAVING
-Out[8]:
+Out[21]:
@@ -7860,7 +8618,6 @@ Filtering Groups Using HAVING
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -7870,10 +8627,11 @@ Filtering Groups Using HAVING
-In [9]:
+In [22]:
-%%sql basic
+%%sql duckdb
+
SELECT type, MAX(name)
FROM DishDietary
WHERE notes == 'gf'
@@ -7889,7 +8647,7 @@ Filtering Groups Using HAVING
-Out[9]:
+Out[22]:
@@ -7900,16 +8658,15 @@ Filtering Groups Using HAVING
-entree
-taco
-
-
appetizer
fries
+
+entree
+taco
+
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -7921,7 +8678,7 @@ Filtering Groups Using HAVING
-EDA in SQL¶
Our typical workflow when working with "big data" is:
+EDA in SQL¶
Our typical workflow when working with "big data" is:
- Use SQL to query data from a database
- Use Python (with
pandas
) to analyze this data in detail
@@ -7935,13 +8692,12 @@ EDA in SQL¶
-In [10]:
+In [23]:
%%sql imdb
SELECT setseed(0.42); -- Setting the random number seed
-
SELECT *
FROM Title
ORDER BY RANDOM()
@@ -7955,8 +8711,14 @@ EDA in SQL¶
+
+
+
+Switching to connection 'imdb'
+
+
-Out[10]:
+Out[23]:
@@ -8007,48 +8769,59 @@ EDA in SQL¶
Comedy
-tt0288008
+tt2518926
movie
-Fini Henriques
-Fini Henriques
+Age of Dinosaurs
+Age of Dinosaurs
False
-1929
+2013
None
-None
-Documentary
+88
+Action,Adventure,Sci-Fi
-tt4963886
+tt1041794
movie
-Cage Kings
-Cage Kings
+Le miracle de la foi
+Le miracle de la foi
False
+2005
None
+111
+Drama
+
+
+tt8480486
+movie
+Çetin Ceviz 2
+Çetin Ceviz 2
+False
+2016
None
-None
-Action
+114
+Comedy
-tt2092523
+tt5502918
movie
-Bombora: The Story of Australian Surfing
-Bombora: The Story of Australian Surfing
+Ek Full Chaar Half
+Ek Full Chaar Half
False
-2009
+1991
+None
None
-110
-Documentary,Sport
+Comedy
-tt1195480
-tvMovie
-George Beverly Shea: The Wonder of it All
-George Beverly Shea: The Wonder of it All
+tt0392297
+movie
+Krishna Kausalya
+Krishna Kausalya
False
-1999
+1929
+None
+None
None
-60
-Biography
tt3062740
@@ -8062,30 +8835,18 @@ EDA in SQL¶
Documentary
-tt8506910
+tt19245950
movie
-Twisted Games
-Twisted Games
+Squad of Girls
+Squad of Girls
False
+2022
None
None
-None
-Horror
-
-
-tt0251270
-movie
-A Midsummer Night's Dream
-A Midsummer Night's Dream
-False
-1968
-None
-124
-Comedy,Fantasy,Romance
+Drama,War
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -8097,7 +8858,7 @@ EDA in SQL¶
@@ -8106,7 +8867,7 @@ Matching Text Using LIKE
-In [11]:
+In [24]:
%%sql imdb
@@ -8124,7 +8885,7 @@ Matching Text Using LIKE
-Out[11]:
+Out[24]:
@@ -8536,8 +9297,7 @@ Matching Text Using LIKE
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
-Truncated to displaylimit of 100
If you want to see more, please visit displaylimit configuration
+Truncated to displaylimit of 100.
@@ -8547,7 +9307,7 @@ Matching Text Using LIKE
-In [12]:
+In [25]:
%%sql imdb
@@ -8565,7 +9325,7 @@ Matching Text Using LIKE
-Out[12]:
+Out[25]:
@@ -8977,8 +9737,7 @@ Matching Text Using LIKE
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
-Truncated to displaylimit of 100
If you want to see more, please visit displaylimit configuration
+Truncated to displaylimit of 100.
@@ -8988,7 +9747,7 @@ Matching Text Using LIKE
-In [13]:
+In [26]:
%%sql imdb
@@ -9006,7 +9765,7 @@ Matching Text Using LIKE
-Out[13]:
+Out[26]:
@@ -9026,7 +9785,6 @@ Matching Text Using LIKE
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -9038,7 +9796,7 @@ Matching Text Using LIKE
@@ -9047,7 +9805,7 @@ Converting Data Types Using CAST
-In [14]:
+In [27]:
%%sql imdb
@@ -9065,7 +9823,7 @@ Converting Data Types Using CAST
-Out[14]:
+Out[27]:
@@ -9117,7 +9875,6 @@ Converting Data Types Using CAST
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -9129,7 +9886,7 @@ Converting Data Types Using CAST
-Applying Conditions With CASE
¶
Here, we return a random order so we can see the various movie ages (otherwise, the top few entries happen to all be old movies).
+Applying Conditions With CASE
¶
Here, we return a random order so we can see the various movie ages (otherwise, the top few entries happen to all be old movies).
@@ -9138,7 +9895,7 @@ Applying Conditions With CASE
-In [15]:
+In [28]:
%%sql imdb
@@ -9163,7 +9920,7 @@ Applying Conditions With CASE
-Out[15]:
+Out[28]:
@@ -9226,7 +9983,6 @@ Applying Conditions With CASE
-ResultSet
: to convert to pandas, call .DataFrame()
or to polars, call .PolarsDataFrame()
@@ -9238,7 +9994,7 @@ Applying Conditions With CASE
-
+
Joining Tables¶
We combine data from multiple tables by performing a join. We will explore joins using the cats database, which includes two tables: s
and t
.
@@ -9249,10 +10005,10 @@ Joining Tables
-In [16]:
+In [29]:
+
+
+
+Switching to connection 'duckdb'
+
+
-Out[16]:
+Out[29]: