diff --git a/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json b/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json index dba4afd..f7b778c 100644 --- a/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json +++ b/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json @@ -1,8 +1,10 @@ { - "hash": "741bd535116c5b43069f2373bcc57e78", + "hash": "53f610ff8cc8524ff8fdda04614a7b6f", "result": { - "markdown": "---\ntitle: \"Hello Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1155795912\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year %in% 2014:2017) |> \n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 4 × 4\n year all_trips shared_trips pct_shared\n \n1 2014 165114361 48816505 29.6\n2 2015 146112989 43081091 29.5\n3 2016 131165043 38163870 29.1\n4 2017 113495512 32296166 28.5\n```\n:::\n:::\n\n\n::: {#exercise-hello-nyc-taxi .callout-tip}\n## Exercises: First {dplyr} pipeline with Arrow\n\n::: panel-tabset\n## Problems\n\n1. Calculate the total number of rides for every month in 2019\n2. 
About how long did this query of 1.15 billion rows take?\n\n## Solution 1\n\nTotal number of rides for every month in 2019:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n count(month) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month n\n \n 1 1 7667255\n 2 12 6895933\n 3 11 6877463\n 4 10 7213588\n 5 2 7018750\n 6 3 7832035\n 7 4 7432826\n 8 5 7564884\n 9 6 6940489\n10 7 6310134\n11 8 6072851\n12 9 6567396\n```\n:::\n:::\n\n\n## Solution 2\n\nCompute time for querying the 1.15 billion rows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.962 0.209 0.364 \n```\n:::\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month longest_trip\n \n 1 1 832.\n 2 2 702.\n 3 3 237.\n 4 4 831.\n 5 5 401.\n 6 6 45977.\n 7 7 312.\n 8 8 602.\n 9 9 604.\n10 10 308.\n11 11 701.\n12 12 19130.\n```\n:::\n\n```{.r .cell-code}\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.324 sec elapsed\n```\n:::\n:::\n\n:::\n:::\n", - "supporting": [], + "markdown": "---\ntitle: \"Hello Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1155795912\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year %in% 2014:2017) |> \n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 4 × 4\n year all_trips shared_trips pct_shared\n \n1 2014 165114361 48816505 29.6\n2 2015 146112989 43081091 29.5\n3 2016 131165043 38163870 29.1\n4 2017 113495512 32296166 28.5\n```\n:::\n:::\n\n\n::: {#exercise-hello-nyc-taxi .callout-tip}\n## Exercises: First {dplyr} pipeline with Arrow\n\n::: panel-tabset\n## Problems\n\n1. Calculate the total number of rides for every month in 2019\n2. 
About how long did this query of 1.15 billion rows take?\n\n## Solution 1\n\nTotal number of rides for every month in 2019:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n count(month) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month n\n \n 1 1 7667255\n 2 11 6877463\n 3 12 6895933\n 4 10 7213588\n 5 2 7018750\n 6 3 7832035\n 7 4 7432826\n 8 5 7564884\n 9 6 6940489\n10 7 6310134\n11 8 6072851\n12 9 6567396\n```\n:::\n:::\n\n\n## Solution 2\n\nCompute time for querying the 1.15 billion rows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.844 0.175 0.331 \n```\n:::\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month longest_trip\n \n 1 1 832.\n 2 2 702.\n 3 3 237.\n 4 4 831.\n 5 5 401.\n 6 6 45977.\n 7 7 312.\n 8 8 602.\n 9 9 604.\n10 10 308.\n11 11 701.\n12 12 19130.\n```\n:::\n\n```{.r .cell-code}\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.379 sec elapsed\n```\n:::\n:::\n\n:::\n:::\n", + "supporting": [ + "1_hello_arrow-exercises_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json b/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json index 19b882d..5982538 100644 --- a/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json +++ b/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "62602546d12ab790ca112e42f2d3549a", + "hash": "4e2ed176da5e01d7cca35ff8a9067c99", "result": { - "markdown": "---\ntitle: \"Data Manipulation Part 1 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(stringr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-compute-collect .callout-tip}\n# Using `compute()` and `collect()`\n\n::: panel-tabset\n## Problem\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n\n2. 
How many distinct pickup locations are in the dataset?\n\n## Solution 1\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-1_6f0b91138fe8ef9057e815121068628b'}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(total_amount > 100) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1529191\n```\n:::\n:::\n\n\n## Solution 2\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-2_b6ea5034a000a75cef933166dbea5e4e'}\n\n```{.r .cell-code}\nnyc_taxi %>%\n distinct(pickup_longitude, pickup_latitude) %>%\n compute() %>%\n nrow()\n```\n:::\n\n:::\n:::\n\n::: {#exercise-dplyr-api .callout-tip}\n# Using the dplyr API in arrow\n\n::: panel-tabset\n## Problem\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n\n3. Bonus question: see if you can find a different way of completing the task in question 2.\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(str_ends(vendor_name, \"S\"), year == 2020, month == 9) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 847,149 × 24\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 VTS 2020-09-03 14:27:50 2020-09-03 14:43:50 1\n 2 VTS 2020-09-03 14:53:22 2020-09-03 15:07:33 3\n 3 VTS 2020-09-03 14:32:22 2020-09-03 14:41:19 2\n 4 VTS 2020-09-03 14:48:33 2020-09-03 15:06:47 3\n 5 VTS 2020-09-03 14:54:54 2020-09-03 15:13:48 1\n 6 VTS 2020-09-03 14:23:52 2020-09-03 14:26:03 2\n 7 VTS 2020-09-03 14:31:24 2020-09-03 14:35:20 1\n 8 VTS 2020-09-03 14:20:13 2020-09-03 14:49:34 2\n 9 VTS 2020-09-03 14:06:08 2020-09-03 14:19:54 1\n10 VTS 2020-09-03 14:29:26 2020-09-03 14:32:45 1\n# ℹ 847,139 more rows\n# ℹ 20 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n:::\n:::\n\n\n## Solution 2 and 3\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\")) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: Expression stringr::str_replace_na(vendor_name, \"No vendor\") not supported in Arrow\nCall collect() first to pull data into R.\n```\n:::\n:::\n\n\nThis won't work as `stringr::str_replace_na()` hasn't been implemented in Arrow. 
You could try using `mutate()` and `ifelse()` here instead.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n mutate(vendor_name = ifelse(is.na(vendor_name), \"No vendor\", vendor_name)) %>%\n collect()\n```\n:::\n\n\nOr, if you only needed a subset of the data, you could apply the function after collecting it into R memory.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019, month == 10) %>% # smaller subset of the data\n collect() %>%\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 7,213,588 × 24\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 VTS 2019-10-01 21:41:22 2019-10-01 21:52:59 1\n 2 CMT 2019-10-01 21:53:46 2019-10-01 22:13:09 1\n 3 CMT 2019-10-01 21:05:22 2019-10-01 21:14:06 1\n 4 CMT 2019-10-01 21:19:59 2019-10-01 21:39:04 1\n 5 CMT 2019-10-01 21:45:45 2019-10-01 22:06:14 1\n 6 CMT 2019-10-01 21:03:44 2019-10-01 21:09:16 1\n 7 CMT 2019-10-01 21:15:40 2019-10-01 21:31:26 1\n 8 CMT 2019-10-01 21:34:57 2019-10-01 21:42:53 1\n 9 CMT 2019-10-01 21:57:55 2019-10-01 22:04:22 1\n10 CMT 2019-10-01 21:19:21 2019-10-01 21:29:08 1\n# ℹ 7,213,578 more rows\n# ℹ 20 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n:::\n:::\n\n:::\n:::\n", + "markdown": "---\ntitle: \"Data Manipulation Part 1 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(stringr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-compute-collect .callout-tip}\n# Using `compute()` and `collect()`\n\n::: panel-tabset\n## Problem\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n\n2. 
How many distinct pickup locations are in the dataset since 2016?\n\n## Solution 1\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-1_6f0b91138fe8ef9057e815121068628b'}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(total_amount > 100) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1529191\n```\n:::\n:::\n\n\n## Solution 2\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-2_31838425beb6cb58051570c1c799a7ff'}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year >= 2016) %>%\n distinct(pickup_longitude, pickup_latitude) %>%\n compute() %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 29105801\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-dplyr-api .callout-tip}\n# Using the dplyr API in arrow\n\n::: panel-tabset\n## Problem\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n\n3. Bonus question: see if you can find a different way of completing the task in question 2.\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(str_ends(vendor_name, \"S\"), year == 2020, month == 9) %>%\n collect()\n```\n:::\n\n\n## Solution 2 and 3\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\")) %>%\n head() %>%\n collect()\n```\n:::\n\n\nThis won't work as `stringr::str_replace_na()` hasn't been implemented in Arrow. You could try using `mutate()` and `ifelse()` here instead.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n mutate(vendor_name = ifelse(is.na(vendor_name), \"No vendor\", vendor_name)) %>%\n head() %>%\n collect()\n```\n:::\n\n\nOr, if you only needed a subset of the data, you could apply the function after collecting it into R memory.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019, month == 10) %>% # smaller subset of the data\n collect() %>%\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\"))\n```\n:::\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/materials/2_data_manipulation_1/execute-results/html.json b/_freeze/materials/2_data_manipulation_1/execute-results/html.json index 5c0effb..b5074b8 100644 --- a/_freeze/materials/2_data_manipulation_1/execute-results/html.json +++ b/_freeze/materials/2_data_manipulation_1/execute-results/html.json @@ -1,8 +1,10 @@ { - "hash": "7c655ef8fe9aa408db2abf9431246a3a", + "hash": "a1507fc5b1107b8c28cecd78709610f8", "result": { - "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Data Manipulation---Part 1 {#data-manip-1}\n\n\n::: {.cell}\n\n:::\n\n\n## dplyr API in arrow\n\n![](images/dplyr-backend.png)\n\n\n```{=html}\n\n```\n\n## The dataset\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: 
double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n## How many trips had more than 1 passenger?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\nshared_rides <- nyc_taxi |>\n filter(year %in% 2017:2021) |> \n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) \n\nclass(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## arrow dplyr queries\n\n\n::: {.cell}\n\n```{.r .cell-code}\nshared_rides\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset (query)\nyear: int32\nall_trips: int64\nshared_trips: uint64\npct_shared: double (multiply_checked(divide(cast(shared_trips, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), cast(all_trips, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false})), 100))\n\nSee $.data for the source Arrow object\n```\n:::\n:::\n\n\n## arrow dplyr queries\n\n- query has been constructed but not evaluated\n- nothing has been pulled into memory\n\n## To `collect()` or to `compute()`?\n\n- `compute()` evaluates the query, in-memory output stays in Arrow\n- `collect()` evaluates the query, in-memory output returns to R\n\n## compute\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncompute(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n5 rows x 4 columns\n$year \n$all_trips \n$shared_trips \n$pct_shared \n```\n:::\n:::\n\n\n## collect\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncollect(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 5 × 4\n year all_trips shared_trips pct_shared\n \n1 2017 113495512 32296166 28.5\n2 2018 102797401 28796633 28.0\n3 2019 84393604 23515989 27.9\n4 2020 24647055 5837960 23.7\n5 2021 30902618 7221844 23.4\n```\n:::\n:::\n\n\n## calling `nrow()` to see how much data\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year %in% 2017:2021) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 356236190\n```\n:::\n:::\n\n```{=html}\n\n```\n\n## calling `nrow()` doesn't work with intermediate step\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year %in% 2017:2021) %>%\n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA\n```\n:::\n:::\n\n\n## use `compute()` to execute intermediate steps\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"9\"}\nnyc_taxi %>%\n filter(year %in% 2017:2021) %>%\n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) %>%\n compute() %>%\n nrow()\n```\n\n::: {.cell-output 
.cell-output-stdout}\n```\n[1] 5\n```\n:::\n:::\n\n\n## Your Turn\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n2. How many distinct pickup locations are in the dataset?\n\n➡️ [Data Manipulation Part I Exercises Page](2_data_manipulation_1-exercises.html)\n\n## use `glimpse()` to preview datasets\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n glimpse()\n```\n:::\n\n\n## use `head()` then `collect()` to preview output for large queries\n\nHow much were fares in GBP (£)?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds <- nyc_taxi %>%\n filter(year %in% 2012:2015) %>%\n mutate(\n fare_amount_pounds = fare_amount * 0.79\n ) %>%\n select(fare_amount, fare_amount_pounds)\n```\n:::\n\n\nHow many rows?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 662951433\n```\n:::\n:::\n\n\n## use `head()` then `collect()` to preview output\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds %>%\n head() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6 × 2\n fare_amount fare_amount_pounds\n \n1 29.7 23.5 \n2 9.3 7.35\n3 4.1 3.24\n4 4.5 3.56\n5 4.5 3.56\n6 4.1 3.24\n```\n:::\n:::\n\n\n## use `across()` to transform data in multiple columns\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxis_gbp <- nyc_taxi %>%\n mutate(across(ends_with(\"amount\"), list(pounds = ~.x * 0.79)))\n\ntaxis_gbp\n```\n:::\n\n\n## use `across()` to transform data in multiple columns\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxis_gbp %>%\n select(contains(\"amount\")) %>%\n head() %>%\n collect()\n```\n:::\n\n\n# dplyr API in arrow - what is and isn't implemented?\n\n## example - `slice()`\n\nFirst three trips in the dataset in 2021 where distance \\> 100 miles\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 <- nyc_taxi %>%\n filter(year == 2021 & trip_distance > 100) %>%\n select(pickup_datetime, year, trip_distance)\n\nlong_rides_2021 %>%\n slice(1:3)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in UseMethod(\"slice\"): no applicable method for 'slice' applied to an object of class \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## head to the docs!\n\n\n::: {.cell}\n\n```{.r .cell-code}\n?arrow-dplyr\n```\n:::\n\n\n## A different function\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 %>%\n slice_max(n = 3, order_by = trip_distance, with_ties = FALSE) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n pickup_datetime year trip_distance\n \n1 2021-11-16 12:55:00 2021 351613.\n2 2021-10-27 17:46:00 2021 345124.\n3 2021-12-11 10:48:00 2021 335094.\n```\n:::\n:::\n\n\n## Or call `collect()` first\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 %>%\n collect() %>%\n slice(1:3)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n pickup_datetime year trip_distance\n \n1 2021-01-03 09:01:26 2021 216.\n2 2021-01-03 11:36:52 2021 268.\n3 2021-10-02 15:04:53 2021 188.\n```\n:::\n:::\n\n\n## tidyr functions - pivot\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidyr)\n\nnyc_taxi %>% \n group_by(vendor_name) %>%\n summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) %>%\n pivot_longer(!vendor_name, names_to = \"metric\") %>% \n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in UseMethod(\"pivot_longer\"): no applicable method for 'pivot_longer' applied to an object of class \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## duckdb\n\n![](images/dplyr-arrow-duckdb.png)\n\n## 
tidyr functions - pivot with duckdb!\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(duckdb)\n\nnyc_taxi %>% \n group_by(vendor_name) %>%\n summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) %>%\n to_duckdb() %>% # send data to duckdb\n pivot_longer(!vendor_name, names_to = \"metric\") %>% \n to_arrow() %>% # return data back to arrow\n collect()\n```\n:::\n\n\n# Using functions inside verbs\n\n## Using functions inside verbs\n\n- lots of the lubridate and stringr APIs supported!\n- base R and others too - always good to check the docs\n\n## Morning vs afternoon with namespacing\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"2\"}\nnyc_taxi %>%\n group_by(time_of_day = ifelse(lubridate::am(pickup_datetime), \"morning\", \"afternoon\")) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n# Groups: time_of_day [2]\n time_of_day n\n \n1 afternoon 740298348\n2 morning 415497564\n```\n:::\n:::\n\n\n## Morning vs afternoon - without namespacing\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"2\"}\nlibrary(lubridate)\n\nnyc_taxi %>%\n group_by(time_of_day = ifelse(am(pickup_datetime), \"morning\", \"afternoon\")) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n# Groups: time_of_day [2]\n time_of_day n\n \n1 afternoon 740298348\n2 morning 415497564\n```\n:::\n:::\n\n\n## Head to the docs again to see what's implemented!\n\n\n::: {.cell}\n\n```{.r .cell-code}\n?arrow-dplyr\n```\n:::\n\n\n## Your Turn\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n3. 
Bonus question: see if you can find a different way of completing the task in question 2.\n\n➡️ [Data Manipulation Part I Exercises Page](2_data_manipulation_1-exercises.html)\n", - "supporting": [], + "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Data Manipulation---Part 1 {#data-manip-1}\n\n\n::: {.cell}\n\n:::\n\n\n## dplyr API in arrow\n\n![](images/dplyr-backend.png)\n\n\n```{=html}\n\n```\n\n## The dataset\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n## How many trips had more than 1 passenger?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\nshared_rides <- nyc_taxi |>\n filter(year %in% 2017:2021) |> \n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) \n\nclass(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## arrow dplyr queries\n\n\n::: {.cell}\n\n```{.r .cell-code}\nshared_rides\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset (query)\nyear: int32\nall_trips: int64\nshared_trips: uint64\npct_shared: double (multiply_checked(divide(cast(shared_trips, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), cast(all_trips, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false})), 100))\n\nSee $.data for the source Arrow object\n```\n:::\n:::\n\n\n## arrow dplyr queries\n\n- query has been constructed but not evaluated\n- nothing has been pulled into memory\n\n## To `collect()` or to `compute()`?\n\n- `compute()` evaluates the query, in-memory output stays in Arrow\n- `collect()` evaluates the query, in-memory output returns to R\n\n## compute\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncompute(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n5 rows x 4 columns\n$year \n$all_trips \n$shared_trips \n$pct_shared \n```\n:::\n:::\n\n\n## collect\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncollect(shared_rides)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 5 × 4\n year all_trips shared_trips pct_shared\n \n1 2017 113495512 32296166 28.5\n2 2018 102797401 28796633 28.0\n3 2019 84393604 23515989 27.9\n4 2020 24647055 5837960 23.7\n5 2021 30902618 7221844 23.4\n```\n:::\n:::\n\n\n## calling `nrow()` to see how much data\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year %in% 
2017:2021) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 356236190\n```\n:::\n:::\n\n```{=html}\n\n```\n\n## calling `nrow()` doesn't work with intermediate step\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year %in% 2017:2021) %>%\n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA\n```\n:::\n:::\n\n\n## use `compute()` to execute intermediate steps\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"9\"}\nnyc_taxi %>%\n filter(year %in% 2017:2021) %>%\n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) %>%\n compute() %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 5\n```\n:::\n:::\n\n\n## Your Turn\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n2. How many distinct pickup locations are in the dataset?\n\n➡️ [Data Manipulation Part I Exercises Page](2_data_manipulation_1-exercises.html)\n\n## use `head()` then `collect()` to preview output for large queries\n\nHow much were fares in GBP (£)?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds <- nyc_taxi %>%\n filter(year %in% 2012:2015) %>%\n mutate(\n fare_amount_pounds = fare_amount * 0.79\n ) %>%\n select(fare_amount, fare_amount_pounds)\n```\n:::\n\n\nHow many rows?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds %>%\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 662951433\n```\n:::\n:::\n\n\n## use `head()` then `collect()` to preview output\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfares_pounds %>%\n head() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6 × 2\n fare_amount fare_amount_pounds\n \n1 29.7 23.5 \n2 9.3 7.35\n3 4.1 3.24\n4 4.5 3.56\n5 4.5 3.56\n6 4.1 3.24\n```\n:::\n:::\n\n\n## use `across()` to transform data in multiple columns\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxis_gbp <- nyc_taxi %>%\n mutate(across(ends_with(\"amount\"), list(pounds = ~.x * 0.79)))\n\ntaxis_gbp\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset (query)\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\nfare_amount_pounds: double (multiply_checked(fare_amount, 0.79))\ntip_amount_pounds: double (multiply_checked(tip_amount, 0.79))\ntolls_amount_pounds: double (multiply_checked(tolls_amount, 0.79))\ntotal_amount_pounds: double (multiply_checked(total_amount, 0.79))\n\nSee $.data for the source Arrow object\n```\n:::\n:::\n\n\n## use `across()` to transform data in multiple columns\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxis_gbp %>%\n select(contains(\"amount\")) %>%\n head() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6 × 8\n fare_amount tip_amount tolls_amount total_amount fare_amount_pounds\n \n1 29.7 6.04 0 36.2 23.5 \n2 
9.3 0 0 9.8 7.35\n3 4.1 1.38 0 5.98 3.24\n4 4.5 1 0 6 3.56\n5 4.5 0 0 5.5 3.56\n6 4.1 0 0 5.6 3.24\n# ℹ 3 more variables: tip_amount_pounds , tolls_amount_pounds ,\n# total_amount_pounds \n```\n:::\n:::\n\n\n# dplyr API in arrow - what is and isn't implemented?\n\n## example - `slice()`\n\nFirst three trips in the dataset in 2021 where distance \\> 100 miles\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 <- nyc_taxi %>%\n filter(year == 2021 & trip_distance > 100) %>%\n select(pickup_datetime, year, trip_distance)\n\nlong_rides_2021 %>%\n slice(1:3)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in UseMethod(\"slice\"): no applicable method for 'slice' applied to an object of class \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## head to the docs!\n\n\n::: {.cell}\n\n```{.r .cell-code}\n?`arrow-dplyr`\n```\n:::\n\n\n## A different function\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 %>%\n slice_max(n = 3, order_by = trip_distance, with_ties = FALSE) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n pickup_datetime year trip_distance\n \n1 2021-11-16 12:55:00 2021 351613.\n2 2021-10-27 17:46:00 2021 345124.\n3 2021-12-11 10:48:00 2021 335094.\n```\n:::\n:::\n\n\n## Or call `collect()` first\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlong_rides_2021 %>%\n collect() %>%\n slice(1:3)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n pickup_datetime year trip_distance\n \n1 2021-10-02 15:04:53 2021 188.\n2 2021-10-03 16:45:02 2021 134 \n3 2021-10-03 17:29:35 2021 218.\n```\n:::\n:::\n\n\n## tidyr functions - pivot\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidyr)\n\nnyc_taxi %>% \n group_by(vendor_name) %>%\n summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) %>%\n pivot_longer(!vendor_name, names_to = \"metric\") %>% \n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in UseMethod(\"pivot_longer\"): no applicable method for 'pivot_longer' applied to an object of class \"arrow_dplyr_query\"\n```\n:::\n:::\n\n\n## duckdb\n\n![](images/dplyr-arrow-duckdb.png)\n\n## tidyr functions - pivot with duckdb!\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(duckdb)\n\nnyc_taxi %>% \n group_by(vendor_name) %>%\n summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) %>%\n to_duckdb() %>% # send data to duckdb\n pivot_longer(!vendor_name, names_to = \"metric\") %>% \n to_arrow() %>% # return data back to arrow\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6 × 3\n vendor_name metric value\n \n1 CMT max_fare 998310.\n2 VTS max_fare 10000.\n3 max_fare 3555.\n4 CMT min_fare -652.\n5 VTS min_fare -1856 \n6 min_fare -150.\n```\n:::\n:::\n\n::: {.callout-caution collapse=\"true\"}\n## Requires arrow 13.0.0\nThis code requires arrow 13.0.0 or above to run, due to a bug which was fixed in this version\n:::\n\n# Using functions inside verbs\n\n## Using functions inside verbs\n\n- lots of the lubridate and stringr APIs supported!\n- base R and others too - always good to check the docs\n\n## Morning vs afternoon with namespacing\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"2\"}\nnyc_taxi %>%\n group_by(time_of_day = ifelse(lubridate::am(pickup_datetime), \"morning\", \"afternoon\")) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n# Groups: time_of_day [2]\n time_of_day n\n \n1 afternoon 740298348\n2 morning 415497564\n```\n:::\n:::\n\n\n## Morning vs afternoon - without 
namespacing\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"2\"}\nlibrary(lubridate)\n\nnyc_taxi %>%\n group_by(time_of_day = ifelse(am(pickup_datetime), \"morning\", \"afternoon\")) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n# Groups: time_of_day [2]\n time_of_day n\n \n1 afternoon 740298348\n2 morning 415497564\n```\n:::\n:::\n\n\n## Head to the docs again to see what's implemented!\n\n\n::: {.cell}\n\n```{.r .cell-code}\n?`arrow-dplyr`\n```\n:::\n\n\n## Your Turn\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n3. Bonus question: see if you can find a different way of completing the task in question 2.\n\n➡️ [Data Manipulation Part I Exercises Page](2_data_manipulation_1-exercises.html)\n", + "supporting": [ + "2_data_manipulation_1_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/3_data_engineering-exercises/execute-results/html.json b/_freeze/materials/3_data_engineering-exercises/execute-results/html.json index cb3a721..04a9c01 100644 --- a/_freeze/materials/3_data_engineering-exercises/execute-results/html.json +++ b/_freeze/materials/3_data_engineering-exercises/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "2ebffc6703abf9f5154314e1b611bf5c", + "hash": "13a3e49815222e14b5507dbadd84de3f", "result": { - "markdown": "---\ntitle: \"Data Engineering with Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n\n# Schemas\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\"\n)\n```\n:::\n\n\n::: {#exercise-schema .callout-tip}\n# Data Types & Controlling the Schema\n\n::: panel-tabset\n## Problem\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n## Solution 1\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-schema-1_e40b0cab0210225d521566e1f79413a4'}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(),\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\n```\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = utf8(),\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\n```\n:::\n\n\n## Solution 2\n\nThe number of `Checkouts` by `CheckoutYear` arranged by `CheckoutYear`:\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-dplyr-1_20e12befff20e836a34522f4bebbc29a'}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n:::\n:::\n\n\nor\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-dplyr-2_cc583a3db56f188202fdb17f830d19ae'}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear n\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n:::\n:::\n\n\nQuerying 42 million rows of data stored in a CSV on disk in \\~10 seconds, not too bad.\n:::\n:::\n\n# Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- here::here(\"data/seattle-library-checkouts-parquet\")\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n::: {#exercise-dataset .callout-tip}\n# Parquet\n\n::: panel-tabset\n## Problems\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. 
Did you notice a difference in compute time?\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n:::\n\n\nA *much* faster compute time for the query when the on-disk data is stored in the Parquet format.\n:::\n:::\n\n# Partitioning\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n::: {#exercise-dataset .callout-tip}\n# Partitioning\n\n::: panel-tabset\n## Problems\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n## Solution 1\n\nWriting the data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_checkouttype <- here::here(\"data/seattle-library-checkouts-type\")\n\nseattle_csv |>\n group_by(CheckoutType) |>\n write_dataset(path = seattle_checkouttype,\n format = \"parquet\")\n```\n:::\n\n\n## Solution 2\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutType`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts-type\")) |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n group_by(CheckoutYear) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.991 0.243 0.489 \n```\n:::\n:::\n\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutYear` and `CheckoutMonth`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts\")) |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n group_by(CheckoutYear) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.090 0.006 0.085 \n```\n:::\n:::\n\n\n\\~10x faster compute time because the `filter()` call is based on the partitions.\n:::\n:::\n", + "markdown": "---\ntitle: \"Data Engineering with Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n\n# Schemas\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\"\n)\n```\n:::\n\n\n::: {#exercise-schema .callout-tip}\n# Data Types & Controlling the Schema\n\n::: panel-tabset\n## Problem\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n## Solution 1\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-schema-1_e40b0cab0210225d521566e1f79413a4'}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(),\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\n```\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = utf8(),\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\n```\n:::\n\n\n## Solution 2\n\nThe number of `Checkouts` by `CheckoutYear` arranged by `CheckoutYear`:\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-dplyr-1_20e12befff20e836a34522f4bebbc29a'}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n:::\n:::\n\n\nor\n\n\n::: {.cell hash='3_data_engineering-exercises_cache/html/seattle-csv-dplyr-2_cc583a3db56f188202fdb17f830d19ae'}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear n\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n:::\n:::\n\n\nQuerying 42 million rows of data stored in a CSV on disk in \\~10 seconds, not too bad.\n:::\n:::\n\n# Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- here::here(\"data/seattle-library-checkouts-parquet\")\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n::: {#exercise-dataset .callout-tip}\n# Parquet\n\n::: panel-tabset\n## Problems\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. 
Did you notice a difference in compute time?\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n:::\n\n\nA *much* faster compute time for the query when the on-disk data is stored in the Parquet format.\n:::\n:::\n\n# Partitioning\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n::: {#exercise-dataset .callout-tip}\n# Partitioning\n\n::: panel-tabset\n## Problems\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n## Solution 1\n\nWriting the data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_checkouttype <- here::here(\"data/seattle-library-checkouts-type\")\n\nseattle_csv |>\n group_by(CheckoutType) |>\n write_dataset(path = seattle_checkouttype,\n format = \"parquet\")\n```\n:::\n\n\n## Solution 2\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutType`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts-type\")) |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n group_by(CheckoutYear) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 1.112 0.350 0.582 \n```\n:::\n:::\n\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutYear` and `CheckoutMonth`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts\")) |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n group_by(CheckoutYear) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.088 0.006 0.079 \n```\n:::\n:::\n\n\n\\~10x faster compute time because the `filter()` call is based on the partitions.\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/materials/3_data_engineering/execute-results/html.json b/_freeze/materials/3_data_engineering/execute-results/html.json index 97b5d43..4070b85 100644 --- a/_freeze/materials/3_data_engineering/execute-results/html.json +++ b/_freeze/materials/3_data_engineering/execute-results/html.json @@ -1,8 +1,10 @@ { - "hash": "dbfc1aac3a13a5887c1185e2bc916ae1", + "hash": "85aa8c2f414f47610e2b576a94026fcf", "result": { - "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n::: {.cell}\n\n:::\n\n\n# Data Engineering with Arrow {#data-eng-storage}\n\n## Data Engineering\n\n
\n\n![](images/data-engineering.png)\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## .NORM Files\n\n![](images/norm_normal_file_format_2x.png){.absolute top=\"0\" left=\"400\"}\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## Formats\n\n![](images/big-data-formats-luminousman.png){.absolute top=\"0\" left=\"250\"}\n\n::: {style=\"font-size: 60%; margin-top: 550px;\"}\n\n:::\n\n::: notes\nThere are lots of big data/columnar formats (not all supported by Arrow we are only covering Parquet and CSV --- CSV is still a big player in the file format world, so we will learn how to work with CSVs with Arrow\n:::\n\n## Arrow & File Formats\n\n![](images/arrow-read-write-updated.png)\n\n## Slido Poll: Formats\n\n
\n\nWhich file formats do you use most often?\n\n- CSV (.csv)\n- MS Excel (.xls and .xlsx)\n- Parquet (.parquet)\n- Something else\n\n## Seattle
Checkouts 
Big CSV\n\n![](images/seattle-checkouts.png){.absolute top=\"0\" left=\"300\"}\n\n::: notes\n\n:::\n\n## Download the 9GB CSV file\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncurl::multi_download(\n \"https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv\",\n here::here(\"data/seattle-library-checkouts.csv\"),\n resume = TRUE\n)\n```\n:::\n\n\n## arrow::open_dataset() with a CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"), \n format = \"csv\"\n)\n\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n## 👀 Glimpse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |> glimpse()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\n41,389,465 rows x 12 columns\n$ UsageClass \"Physical\", \"Physical\", \"Digital\", \"Physical\", \"Physi…\n$ CheckoutType \"Horizon\", \"Horizon\", \"OverDrive\", \"Horizon\", \"Horizo…\n$ MaterialType \"BOOK\", \"BOOK\", \"EBOOK\", \"BOOK\", \"SOUNDDISC\", \"BOOK\",…\n$ CheckoutYear 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,…\n$ CheckoutMonth 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…\n$ Checkouts 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 3, 2, 1, 3, 2, 3,…\n$ Title \"Super rich : a guide to having it all / Russell Simm…\n$ ISBN NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…\n$ Creator \"Simmons, Russell\", \"Barclay, James, 1965-\", \"Tim Par…\n$ Subjects \"Self realization, Conduct of life, Attitude Psycholo…\n$ Publisher \"Gotham Books,\", \"Pyr,\", \"Random House, Inc.\", \"Dial …\n$ PublicationYear \"c2011.\", \"2010.\", \"2015\", \"2005.\", \"c2004.\", \"c2005.…\n```\n:::\n:::\n\n\n## Parsing the Metadata\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n::: {.cell}\n\n:::\n\n\n## Parsing the Metadata\n\n
\n\nArrow scans 👀 a few thousand rows of the file(s) to impute or \"guess\" the data types\n\n::: {style=\"font-size: 80%; margin-top: 200px;\"}\n📚 arrow vs readr blog post: \n:::\n\n## Parsers Are Not Always Right\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n::: notes\nInternational Standard Book Number (ISBN) is a 13-digit number that uniquely identifies books and book-like products published internationally.\n\nData Dictionaries, metadata in data catalogues should provide this info.\n:::\n\n## Arrow Data Types\n\nArrow has a rich data type system, including direct analogs of many R data types\n\n- `` == ``\n- `` == `` or ``\n- `` == ``\n\n
\n\n\n\n## Arrow's schema()\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema$code() \n```\n\n::: {.cell-output .cell-output-stdout}\n```\nschema(UsageClass = utf8(), CheckoutType = utf8(), MaterialType = utf8(), \n CheckoutYear = int64(), CheckoutMonth = int64(), Checkouts = int64(), \n Title = utf8(), ISBN = null(), Creator = utf8(), Subjects = utf8(), \n Publisher = utf8(), PublicationYear = utf8())\n```\n:::\n:::\n\n::: {.cell}\n\n:::\n\n\n## Let's Control the Schema\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|13\"}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema = schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n## Your Turn\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` instead of the `` interpreted by Arrow.\n2. Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2016 9021051\n 2 2022 7001989\n 3 2017 9231648\n 4 2018 9149176\n 5 2019 9199083\n 6 2020 6053717\n 7 2021 7361031\n 8 2005 3798685\n 9 2006 6599318\n10 2007 7126627\n11 2008 8438486\n12 2009 9135167\n13 2010 8608966\n14 2011 8321732\n15 2012 8163046\n16 2013 9057096\n17 2014 9136081\n18 2015 9084179\n```\n:::\n:::\n\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"5\"}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 15.093 1.502 14.283 \n```\n:::\n:::\n\n\n42 million rows -- not bad, but could be faster....\n\n## File Format: Apache Parquet\n\n![](images/apache-parquet.png){.absolute top=\"100\" left=\"200\" width=\"700\"}\n\n::: {style=\"font-size: 60%; margin-top: 450px;\"}\n\n:::\n\n## Parquet\n\n- usually smaller than equivalent CSV file\n- rich type system & stores the data type along with the data\n- \"column-oriented\" == better performance over CSV's row-by-row\n- \"row-chunked\" == work on different parts of the file at the same time or skip some chunks all together\n\n::: notes\n- efficient encodings to keep file size down, and supports file compression, less data to move from disk to memory\n- CSV has no info about data types, inferred by each parser\n:::\n\n## Parquet Files: \"row-chunked\"\n\n![](images/parquet-chunking.png)\n\n## Parquet Files: \"row-chunked 
& column-oriented\"\n\n![](images/parquet-columnar.png)\n\n## Writing to Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- here::here(\"data/seattle-library-checkouts-parquet\")\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n## Storage: Parquet vs CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile <- list.files(seattle_parquet)\nfile.size(file.path(seattle_parquet, file)) / 10**9\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 4.423348\n```\n:::\n:::\n\n\n
\n\nParquet about half the size of the CSV file on-disk 💾\n\n## Your Turn\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. Did you notice a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 4.5GB Parquet file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.290 0.403 0.726 \n```\n:::\n:::\n\n\n42 million rows -- much better! But could be *even* faster....\n\n## File Storage:
Partitioning\n\n
\n\n::: columns\n::: {.column width=\"50%\"}\nDividing data into smaller pieces, making it more easily accessible and manageable\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n::: notes\nalso called multi-files or sometimes shards\n:::\n\n## Slido Poll: Partitioning?\n\nHave you partitioned your data or used partitioned data before today?\n\n- Yes\n- No\n- Not sure, the data engineers sort that out!\n\n## Art & Science of Partitioning\n\n
\n\n- avoid files \\< 20MB and \\> 2GB\n- avoid \\> 10,000 files (🤯)\n- partition on variables used in `filter()`\n\n::: notes\n- guidelines not rules, results vary\n- experiment\n- arrow suggests avoid files smaller than 20MB and larger than 2GB\n- avoid partitions that produce more than 10,000 files\n- partition by variables that you filter by, allows arrow to only read relevant files\n:::\n\n## Rewriting the Data Again\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n## What Did We \"Engineer\"?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nsizes <- tibble(\n files = list.files(seattle_parquet_part, recursive = TRUE),\n size_GB = file.size(file.path(seattle_parquet_part, files)) / 10**9\n)\n\nsizes\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n files size_GB\n \n 1 CheckoutYear=2005/part-0.parquet 0.114\n 2 CheckoutYear=2006/part-0.parquet 0.172\n 3 CheckoutYear=2007/part-0.parquet 0.186\n 4 CheckoutYear=2008/part-0.parquet 0.204\n 5 CheckoutYear=2009/part-0.parquet 0.224\n 6 CheckoutYear=2010/part-0.parquet 0.233\n 7 CheckoutYear=2011/part-0.parquet 0.250\n 8 CheckoutYear=2012/part-0.parquet 0.261\n 9 CheckoutYear=2013/part-0.parquet 0.282\n10 CheckoutYear=2014/part-0.parquet 0.296\n11 CheckoutYear=2015/part-0.parquet 0.308\n12 CheckoutYear=2016/part-0.parquet 0.315\n13 CheckoutYear=2017/part-0.parquet 0.319\n14 CheckoutYear=2018/part-0.parquet 0.306\n15 CheckoutYear=2019/part-0.parquet 0.302\n16 CheckoutYear=2020/part-0.parquet 0.158\n17 CheckoutYear=2021/part-0.parquet 0.240\n18 CheckoutYear=2022/part-0.parquet 0.252\n```\n:::\n:::\n\n\n## 4.5GB partitioned Parquet files + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nopen_dataset(seattle_parquet_part,\n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.187 0.267 0.352 \n```\n:::\n:::\n\n\n
\n\n42 million rows -- not too shabby!\n\n## Your Turn\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## Partition Design\n\n::: columns\n::: {.column width=\"50%\"}\n- Partitioning on variables commonly used in `filter()` often faster\n- Number of partitions also important (Arrow reads the metadata of each file)\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n## Performance Review: Single CSV\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"), \n format = \"csv\"\n) |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 16.259 2.423 15.192 \n```\n:::\n:::\n\n\n## Performance Review: Partitioned Parquet\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts\"),\n format = \"parquet\") |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.353 0.046 0.107 \n```\n:::\n:::\n\n\n## Engineering Data Tips for Improved Storage & Performance\n\n
\n\n- consider \"column-oriented\" file formats like Parquet\n- consider partitioning, experiment to get an appropriate partition design 🗂️\n- watch your schemas 👀\n\n## R for Data Science (2e)\n\n::: columns\n::: {.column width=\"50%\"}\n![](images/r4ds-cover.jpg){.absolute top=\"100\" width=\"400\"}\n:::\n\n::: {.column width=\"50%\"}\n
\n\n[Chapter 23: Arrow](https://r4ds.hadley.nz/arrow.html)\n\n
\n\n\n:::\n:::\n", - "supporting": [], + "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n::: {.cell}\n\n:::\n\n\n# Data Engineering with Arrow {#data-eng-storage}\n\n## Data Engineering\n\n
\n\n![](images/data-engineering.png)\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## .NORM Files\n\n![](images/norm_normal_file_format_2x.png){.absolute top=\"0\" left=\"400\"}\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## Formats\n\n![](images/big-data-formats-luminousman.png){.absolute top=\"0\" left=\"250\"}\n\n::: {style=\"font-size: 60%; margin-top: 550px;\"}\n\n:::\n\n::: notes\nThere are lots of big data/columnar formats (not all supported by Arrow we are only covering Parquet and CSV --- CSV is still a big player in the file format world, so we will learn how to work with CSVs with Arrow\n:::\n\n## Arrow & File Formats\n\n![](images/arrow-read-write-updated.png)\n\n## Poll: Formats\n\n
\n\nWhich file formats do you use most often?\n\n- CSV (.csv)\n- MS Excel (.xls and .xlsx)\n- Parquet (.parquet)\n- Something else\n\n## Seattle
Checkouts
Big CSV\n\n![](images/seattle-checkouts.png){.absolute top=\"0\" left=\"300\"}\n\n::: notes\n\n:::\n\n## arrow::open_dataset() with a CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"), \n format = \"csv\"\n)\n\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n## 👀 Glimpse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |> glimpse()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\n41,389,465 rows x 12 columns\n$ UsageClass \"Physical\", \"Physical\", \"Digital\", \"Physical\", \"Physi…\n$ CheckoutType \"Horizon\", \"Horizon\", \"OverDrive\", \"Horizon\", \"Horizo…\n$ MaterialType \"BOOK\", \"BOOK\", \"EBOOK\", \"BOOK\", \"SOUNDDISC\", \"BOOK\",…\n$ CheckoutYear 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,…\n$ CheckoutMonth 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…\n$ Checkouts 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 3, 2, 1, 3, 2, 3,…\n$ Title \"Super rich : a guide to having it all / Russell Simm…\n$ ISBN NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…\n$ Creator \"Simmons, Russell\", \"Barclay, James, 1965-\", \"Tim Par…\n$ Subjects \"Self realization, Conduct of life, Attitude Psycholo…\n$ Publisher \"Gotham Books,\", \"Pyr,\", \"Random House, Inc.\", \"Dial …\n$ PublicationYear \"c2011.\", \"2010.\", \"2015\", \"2005.\", \"c2004.\", \"c2005.…\n```\n:::\n:::\n\n\n## Parsing the Metadata\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n## Parsing the Metadata\n\n
\n\nArrow scans 👀 a few thousand rows of the file(s) to impute or \"guess\" the data types\n\n::: {style=\"font-size: 80%; margin-top: 200px;\"}\n📚 arrow vs readr blog post: \n:::\n\n## Parsers Are Not Always Right\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n::: notes\nInternational Standard Book Number (ISBN) is a 13-digit number that uniquely identifies books and book-like products published internationally.\n\nData Dictionaries, metadata in data catalogues should provide this info.\n:::\n\n## Arrow Data Types\n\nArrow has a rich data type system, including direct analogs of many R data types\n\n- `` == ``\n- `` == `` or ``\n- `` == ``\n\n
\n\n\n\n## Arrow's schema()\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema$code() \n```\n\n::: {.cell-output .cell-output-stdout}\n```\nschema(UsageClass = utf8(), CheckoutType = utf8(), MaterialType = utf8(), \n CheckoutYear = int64(), CheckoutMonth = int64(), Checkouts = int64(), \n Title = utf8(), ISBN = null(), Creator = utf8(), Subjects = utf8(), \n Publisher = utf8(), PublicationYear = utf8())\n```\n:::\n:::\n\n\n## Let's Control the Schema\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|13\"}\nseattle_csv <- open_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"),\n format = \"csv\",\n skip = 1,\n schema = schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 1 csv file\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n:::\n:::\n\n\n## Your Turn\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` instead of the `` interpreted by Arrow.\n2. Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2016 9021051\n 2 2022 7001989\n 3 2017 9231648\n 4 2018 9149176\n 5 2019 9199083\n 6 2020 6053717\n 7 2021 7361031\n 8 2005 3798685\n 9 2006 6599318\n10 2007 7126627\n11 2008 8438486\n12 2009 9135167\n13 2010 8608966\n14 2011 8321732\n15 2012 8163046\n16 2013 9057096\n17 2014 9136081\n18 2015 9084179\n```\n:::\n:::\n\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"5\"}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 12.758 1.238 12.129 \n```\n:::\n:::\n\n\n42 million rows -- not bad, but could be faster....\n\n## File Format: Apache Parquet\n\n![](images/apache-parquet.png){.absolute top=\"100\" left=\"200\" width=\"700\"}\n\n::: {style=\"font-size: 60%; margin-top: 450px;\"}\n\n:::\n\n## Parquet\n\n- usually smaller than equivalent CSV file\n- rich type system & stores the data type along with the data\n- \"column-oriented\" == better performance over CSV's row-by-row\n- \"row-chunked\" == work on different parts of the file at the same time or skip some chunks all together\n\n::: notes\n- efficient encodings to keep file size down, and supports file compression, less data to move from disk to memory\n- CSV has no info about data types, inferred by each parser\n:::\n\n## Parquet Files: \"row-chunked\"\n\n![](images/parquet-chunking.png)\n\n## Parquet Files: \"row-chunked & 
column-oriented\"\n\n![](images/parquet-columnar.png)\n\n## Writing to Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- here::here(\"data/seattle-library-checkouts-parquet\")\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n## Storage: Parquet vs CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile <- list.files(seattle_parquet)\nfile.size(file.path(seattle_parquet, file)) / 10**9\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 4.423348\n```\n:::\n:::\n\n\n
\n\nParquet about half the size of the CSV file on-disk 💾\n\n## Your Turn\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. Did you notice a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 4.5GB Parquet file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.155 0.342 0.640 \n```\n:::\n:::\n\n\n42 million rows -- much better! But could be *even* faster....\n\n## File Storage:
Partitioning\n\n
\n\n::: columns\n::: {.column width=\"50%\"}\nDividing data into smaller pieces, making it more easily accessible and manageable\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n::: notes\nalso called multi-files or sometimes shards\n:::\n\n## Poll: Partitioning?\n\nHave you partitioned your data or used partitioned data before today?\n\n- Yes\n- No\n- Not sure, the data engineers sort that out!\n\n## Art & Science of Partitioning\n\n
\n\n- avoid files \\< 20MB and \\> 2GB\n- avoid \\> 10,000 files (🤯)\n- partition on variables used in `filter()`\n\n::: notes\n- guidelines not rules, results vary\n- experiment\n- arrow suggests avoid files smaller than 20MB and larger than 2GB\n- avoid partitions that produce more than 10,000 files\n- partition by variables that you filter by, allows arrow to only read relevant files\n:::\n\n## Rewriting the Data Again\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n## What Did We \"Engineer\"?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nsizes <- tibble(\n files = list.files(seattle_parquet_part, recursive = TRUE),\n size_GB = file.size(file.path(seattle_parquet_part, files)) / 10**9\n)\n\nsizes\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 18 × 2\n files size_GB\n \n 1 CheckoutYear=2005/part-0.parquet 0.114\n 2 CheckoutYear=2006/part-0.parquet 0.172\n 3 CheckoutYear=2007/part-0.parquet 0.186\n 4 CheckoutYear=2008/part-0.parquet 0.204\n 5 CheckoutYear=2009/part-0.parquet 0.224\n 6 CheckoutYear=2010/part-0.parquet 0.233\n 7 CheckoutYear=2011/part-0.parquet 0.250\n 8 CheckoutYear=2012/part-0.parquet 0.261\n 9 CheckoutYear=2013/part-0.parquet 0.282\n10 CheckoutYear=2014/part-0.parquet 0.296\n11 CheckoutYear=2015/part-0.parquet 0.308\n12 CheckoutYear=2016/part-0.parquet 0.315\n13 CheckoutYear=2017/part-0.parquet 0.319\n14 CheckoutYear=2018/part-0.parquet 0.306\n15 CheckoutYear=2019/part-0.parquet 0.302\n16 CheckoutYear=2020/part-0.parquet 0.158\n17 CheckoutYear=2021/part-0.parquet 0.240\n18 CheckoutYear=2022/part-0.parquet 0.252\n```\n:::\n:::\n\n\n## 4.5GB partitioned Parquet files + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- here::here(\"data/seattle-library-checkouts\")\n\nopen_dataset(seattle_parquet_part,\n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.055 0.193 0.310 \n```\n:::\n:::\n\n\n
\n\n42 million rows -- not too shabby!\n\n## Your Turn\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## Partition Design\n\n::: columns\n::: {.column width=\"50%\"}\n- Partitioning on variables commonly used in `filter()` often faster\n- Number of partitions also important (Arrow reads the metadata of each file)\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n## Performance Review: Single CSV\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(\n sources = here::here(\"data/seattle-library-checkouts.csv\"), \n format = \"csv\"\n) |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 14.047 1.922 13.150 \n```\n:::\n:::\n\n\n## Performance Review: Partitioned Parquet\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(here::here(\"data/seattle-library-checkouts\"),\n format = \"parquet\") |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.340 0.047 0.100 \n```\n:::\n:::\n\n\n## Engineering Data Tips for Improved Storage & Performance\n\n
\n\n- consider \"column-oriented\" file formats like Parquet\n- consider partitioning, experiment to get an appropriate partition design 🗂️\n- watch your schemas 👀\n\n## R for Data Science (2e)\n\n::: columns\n::: {.column width=\"50%\"}\n![](images/r4ds-cover.jpg){.absolute top=\"100\" width=\"400\"}\n:::\n\n::: {.column width=\"50%\"}\n
\n\n[Chapter 23: Arrow](https://r4ds.hadley.nz/arrow.html)\n\n
\n\n\n:::\n:::\n", + "supporting": [ + "3_data_engineering_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json b/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json index 25bfae3..cfeb9dd 100644 --- a/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json +++ b/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json @@ -1,8 +1,10 @@ { - "hash": "64a9363f44f058cbff0cf64ccb5542f3", + "hash": "082a16c2424da763fe5f0117350b69f2", "result": { - "markdown": "---\ntitle: \"Data Manipulation Part 2 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-joins .callout-tip}\n# Joins\n\n::: panel-tabset\n## Problem\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup_location <- read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\"))\n\npickup_location <- pickup_location %>%\n select(\n pickup_location_id = LocationID,\n borough = Borough,\n pickup_zone = Zone\n ) %>%\n arrow_table(schema = schema(\n pickup_location_id = int64(),\n borough = utf8(),\n pickup_zone = utf8()\n ))\n\nnyc_taxi |>\n filter(year == 2019) |>\n left_join(pickup_location) |>\n filter(str_detect(pickup_zone, \"Airport\")) |>\n count(pickup_zone) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 2\n pickup_zone n\n \n1 JFK Airport 2729336\n2 LaGuardia Airport 2159224\n3 Newark Airport 8643\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-window .callout-tip}\n# Window functions\n\n::: panel-tabset\n## Problem\n\n1. 
How many trips in 2019 had a longer than average distance for that year?\n\n## Solution 1\n\n### Option 1 - via DuckDB\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019) %>%\n to_duckdb() %>%\n mutate(mean_distance = mean(trip_distance)) %>%\n to_arrow() %>%\n filter(trip_distance < mean_distance) %>%\n count() %>%\n collect()\n```\n:::\n\n\n### Option 2 - via a join\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019) %>%\n left_join(\n nyc_taxi %>%\n group_by(year) %>%\n summarise(mean_distance = mean(trip_distance))\n ) %>%\n filter(trip_distance < mean_distance) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 63051184\n```\n:::\n:::\n\n:::\n:::\n", - "supporting": [], + "markdown": "---\ntitle: \"Data Manipulation Part 2 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-joins .callout-tip}\n# Joins\n\n::: panel-tabset\n## Problem\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup_location <- read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\"))\n\npickup_location <- pickup_location %>%\n select(\n pickup_location_id = LocationID,\n borough = Borough,\n pickup_zone = Zone\n ) %>%\n arrow_table(schema = schema(\n pickup_location_id = int64(),\n borough = utf8(),\n pickup_zone = utf8()\n ))\n\nnyc_taxi |>\n filter(year == 2019) |>\n left_join(pickup_location) |>\n filter(str_detect(pickup_zone, \"Airport\")) |>\n count(pickup_zone) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 2\n pickup_zone n\n \n1 LaGuardia Airport 2159224\n2 JFK Airport 2729336\n3 Newark Airport 8643\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-window .callout-tip}\n# Window functions\n\n::: panel-tabset\n## Problem\n\n1. 
How many trips in September 2019 had a longer than average distance for that month?\n\n## Solution 1\n\n### Option 1 - via DuckDB\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019, month == 9) %>%\n to_duckdb() %>%\n mutate(mean_distance = mean(trip_distance)) %>%\n to_arrow() %>%\n filter(trip_distance < mean_distance) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n\n### Option 2 - via a join\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi %>%\n filter(year == 2019, month == 9) %>%\n left_join(\n nyc_taxi %>%\n filter(year == 2019, month == 9) %>%\n group_by(year) %>%\n summarise(mean_distance = mean(trip_distance))\n ) %>%\n filter(trip_distance < mean_distance) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n:::\n:::\n", + "supporting": [ + "4_data_manipulation_2-exercises_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/4_data_manipulation_2/execute-results/html.json b/_freeze/materials/4_data_manipulation_2/execute-results/html.json index 34c5cbf..ff8460f 100644 --- a/_freeze/materials/4_data_manipulation_2/execute-results/html.json +++ b/_freeze/materials/4_data_manipulation_2/execute-results/html.json @@ -1,8 +1,10 @@ { - "hash": "4c40ca3848fd3250b315eb99c787d387", + "hash": "0f906adf6bd6d529f3e717f657aa775b", "result": { - "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Data Manipulation---Part 2 {#data-manip-2}\n\n\n::: {.cell}\n\n:::\n\n\n# Joins\n\n## Joining a reference table\n\n\n::: {.cell}\n\n```{.r .cell-code}\nvendors <- tibble::tibble(\n code = c(\"VTS\", \"CMT\", \"DDS\"),\n full_name = c(\n \"Verifone Transportation Systems\",\n \"Creative Mobile Technologies\",\n \"Digital Dispatch Systems\"\n )\n)\n\nnyc_taxi %>%\n left_join(vendors, by = c(\"vendor_name\" = \"code\")) %>%\n select(vendor_name, full_name, pickup_datetime) %>%\n head(3) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n vendor_name full_name pickup_datetime \n \n1 CMT Creative Mobile Technologies 2012-11-01 01:18:26\n2 CMT Creative Mobile Technologies 2012-11-01 01:18:27\n3 CMT Creative Mobile Technologies 2012-11-01 01:18:45\n```\n:::\n:::\n\n\n## Traps for the unwary\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones <-\n read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\")) %>%\n select(location_id = LocationID,\n borough = Borough)\n\nnyc_taxi_zones\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 265 × 2\n location_id borough \n \n 1 1 EWR \n 2 2 Queens \n 3 3 Bronx \n 4 4 Manhattan \n 5 5 Staten Island\n 6 6 Staten Island\n 7 7 Queens \n 8 8 Queens \n 9 9 Queens \n10 10 Queens \n# ℹ 255 more rows\n```\n:::\n:::\n\n\n## Why didn't this work?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n left_join(nyc_taxi_zones, by = c(\"pickup_location_id\" = \"location_id\")) |>\n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `compute.arrow_dplyr_query()`:\n! 
Invalid: Incompatible data types for corresponding join field keys: FieldRef.Name(pickup_location_id) of type int64 and FieldRef.Name(location_id) of type int32\n```\n:::\n:::\n\n\n## Schema for the `nyc_taxi` table\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n## Schema for the `nyc_taxi_zones` table\n\n\n::: {.cell}\n\n```{.r .cell-code}\narrow_table(nyc_taxi_zones)$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nlocation_id: int32\nborough: string\n```\n:::\n:::\n\n\n- `pickup_location_id` is int64 in the `nyc_taxi` table\n- `location_id` is int32 in the `nyc_taxi_zones` table\n\n## Take control of the schema\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones_arrow <- arrow_table(\n nyc_taxi_zones, \n schema = schema(location_id = int64(), borough = utf8())\n)\n```\n:::\n\n\n- `schema()` takes variable name / types as input\n- {arrow} has various \"type\" functions: `int64()`, `utf8()`, `boolean()`, `date32()` etc\n\n## Take control of the schema\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones_arrow <- arrow_table(\n nyc_taxi_zones, \n schema = schema(location_id = int64(), borough = utf8())\n)\nnyc_taxi_zones_arrow$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nlocation_id: int64\nborough: string\n```\n:::\n:::\n\n\n## Prepare the auxiliary tables\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup <- nyc_taxi_zones_arrow |>\n select(pickup_location_id = location_id,\n pickup_borough = borough)\n\ndropoff <- nyc_taxi_zones_arrow |>\n select(dropoff_location_id = location_id,\n dropoff_borough = borough)\n```\n:::\n\n\n- Join separately for the pickup and dropoff zones\n\n\n```{=html}\n\n```\n\n## Join and cross-tabulate\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nborough_counts <- nyc_taxi |> \n left_join(pickup) |>\n left_join(dropoff) |>\n count(pickup_borough, dropoff_borough) |>\n arrange(desc(n)) |>\n collect()\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n154.868 sec elapsed\n```\n:::\n:::\n\n\n
\n\n2-3 minutes to join twice and cross-tabulate on non-partition variables, with 1.15 billion rows of data 🙂\n\n## The results\n\n\n::: {.cell}\n\n```{.r .cell-code}\nborough_counts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 50 × 3\n pickup_borough dropoff_borough n\n \n 1 732357953\n 2 Manhattan Manhattan 355850092\n 3 Queens Manhattan 14648891\n 4 Manhattan Queens 13186443\n 5 Manhattan Brooklyn 11294128\n 6 Queens Queens 7537042\n 7 Unknown Unknown 4519763\n 8 Queens Brooklyn 3727686\n 9 Brooklyn Brooklyn 3566671\n10 Manhattan Bronx 2091627\n# ℹ 40 more rows\n```\n:::\n:::\n\n\n## Your Turn\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?\n\n➡️ [Data Manipulation Part I Exercises Page](4_data_manipulation_2-exercises.html)\n\n# Window functions\n\n## What are window functions?\n\n- calculations within groups\n\n## Grouped summaries\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year <- nyc_taxi %>%\n filter(year %in% 2021:2022) %>%\n select(year, fare_amount)\n\nfare_by_year %>%\n group_by(year) %>%\n summarise(mean_fare = mean(fare_amount)) %>% \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n year mean_fare\n \n1 2021 13.5\n2 2022 13.0\n```\n:::\n:::\n\n\n## Window functions\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n group_by(year) %>%\n mutate(mean_fare = mean(fare_amount)) %>% \n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: window functions not currently supported in Arrow\nCall collect() first to pull data into R.\n```\n:::\n:::\n\n\n## Window functions - via joins\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n left_join(\n nyc_taxi %>%\n group_by(year) %>%\n summarise(mean_fare = mean(fare_amount))\n ) %>% \n arrange(desc(fare_amount)) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 36,345,864 × 3\n year fare_amount mean_fare\n \n 1 2021 818283. 13.5\n 2 2022 401092. 13.0\n 3 2021 398466. 13.5\n 4 2021 395854. 13.5\n 5 2021 6965 13.5\n 6 2021 6960. 13.5\n 7 2021 6010 13.5\n 8 2021 5954 13.5\n 9 2021 4969 13.5\n10 2021 3555. 13.5\n# ℹ 36,345,854 more rows\n```\n:::\n:::\n\n\n## Window functions - via duckdb\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n group_by(year) %>%\n to_duckdb() %>%\n mutate(mean_fare = mean(fare_amount)) %>% \n to_arrow() %>%\n arrange(desc(fare_amount)) %>%\n collect()\n```\n:::\n\n\n## Your Turn\n\n1. How many trips in 2019 had a longer than average distance for that year?\n\n➡️ [Data Manipulation Part I Exercises Page](4_data_manipulation_2-exercises.html)\n\n## Custom functions\n\n- Not officially supported\n- Works for simple operations but not with bindings\n\n## Custom functions - supported\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmillions <- function(x) x / 10^6\n\nnyc_taxi |>\n group_by(vendor_name) %>%\n summarise(trips = n()) %>%\n mutate(\n trips_mil = millions(trips)\n ) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n vendor_name trips trips_mil\n \n1 CMT 531806845 532. \n2 VTS 621279093 621. 
\n3 2709974 2.71\n```\n:::\n:::\n\n\n## Custom functions - not supported\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmorning <- function(x) ifelse(lubridate::am(x), \"morning\", \"afternoon\")\nnyc_taxi |>\n group_by(morning(pickup_datetime)) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: Expression morning(pickup_datetime) not supported in Arrow\nCall collect() first to pull data into R.\n```\n:::\n:::\n\n\n- recommendation: write code as dplyr expressions instead of functions, or look up docs on user-defined functions for datasets (see `?register_scalar_function`)\n", - "supporting": [], + "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Data Manipulation---Part 2 {#data-manip-2}\n\n\n::: {.cell}\n\n:::\n\n\n# Joins\n\n## Joining a reference table\n\n\n::: {.cell}\n\n```{.r .cell-code}\nvendors <- tibble::tibble(\n code = c(\"VTS\", \"CMT\", \"DDS\"),\n full_name = c(\n \"Verifone Transportation Systems\",\n \"Creative Mobile Technologies\",\n \"Digital Dispatch Systems\"\n )\n)\n\nnyc_taxi %>%\n left_join(vendors, by = c(\"vendor_name\" = \"code\")) %>%\n select(vendor_name, full_name, pickup_datetime) %>%\n head(3) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n vendor_name full_name pickup_datetime \n \n1 CMT Creative Mobile Technologies 2012-11-01 01:18:26\n2 CMT Creative Mobile Technologies 2012-11-01 01:18:27\n3 CMT Creative Mobile Technologies 2012-11-01 01:18:45\n```\n:::\n:::\n\n\n## Traps for the unwary\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones <-\n read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\")) %>%\n select(location_id = LocationID,\n borough = Borough)\n\nnyc_taxi_zones\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 265 × 2\n location_id borough \n \n 1 1 EWR \n 2 2 Queens \n 3 3 Bronx \n 4 4 Manhattan \n 5 5 Staten Island\n 6 6 Staten Island\n 7 7 Queens \n 8 8 Queens \n 9 9 Queens \n10 10 Queens \n# ℹ 255 more rows\n```\n:::\n:::\n\n\n## Why didn't this work?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n left_join(nyc_taxi_zones, by = c(\"pickup_location_id\" = \"location_id\")) |>\n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `compute.arrow_dplyr_query()`:\n! 
Invalid: Incompatible data types for corresponding join field keys: FieldRef.Name(pickup_location_id) of type int64 and FieldRef.Name(location_id) of type int32\n```\n:::\n:::\n\n\n## Schema for the `nyc_taxi` table\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n## Schema for the `nyc_taxi_zones` table\n\n\n::: {.cell}\n\n```{.r .cell-code}\narrow_table(nyc_taxi_zones)$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nlocation_id: int32\nborough: string\n```\n:::\n:::\n\n\n- `pickup_location_id` is int64 in the `nyc_taxi` table\n- `location_id` is int32 in the `nyc_taxi_zones` table\n\n## Take control of the schema\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones_arrow <- arrow_table(\n nyc_taxi_zones, \n schema = schema(location_id = int64(), borough = utf8())\n)\n```\n:::\n\n\n- `schema()` takes variable name / types as input\n- {arrow} has various \"type\" functions: `int64()`, `utf8()`, `boolean()`, `date32()` etc\n\n## Take control of the schema\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi_zones_arrow <- arrow_table(\n nyc_taxi_zones, \n schema = schema(location_id = int64(), borough = utf8())\n)\nnyc_taxi_zones_arrow$schema\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSchema\nlocation_id: int64\nborough: string\n```\n:::\n:::\n\n\n## Prepare the auxiliary tables\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup <- nyc_taxi_zones_arrow |>\n select(pickup_location_id = location_id,\n pickup_borough = borough)\n\ndropoff <- nyc_taxi_zones_arrow |>\n select(dropoff_location_id = location_id,\n dropoff_borough = borough)\n```\n:::\n\n\n- Join separately for the pickup and dropoff zones\n\n\n```{=html}\n\n```\n\n## Join and cross-tabulate\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nborough_counts <- nyc_taxi |> \n left_join(pickup) |>\n left_join(dropoff) |>\n count(pickup_borough, dropoff_borough) |>\n arrange(desc(n)) |>\n collect()\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n138.677 sec elapsed\n```\n:::\n:::\n\n\n
\n\n2-3 minutes to join twice and cross-tabulate on non-partition variables, with 1.15 billion rows of data 🙂\n\n## The results\n\n\n::: {.cell}\n\n```{.r .cell-code}\nborough_counts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 50 × 3\n pickup_borough dropoff_borough n\n \n 1 732357953\n 2 Manhattan Manhattan 355850092\n 3 Queens Manhattan 14648891\n 4 Manhattan Queens 13186443\n 5 Manhattan Brooklyn 11294128\n 6 Queens Queens 7537042\n 7 Unknown Unknown 4519763\n 8 Queens Brooklyn 3727686\n 9 Brooklyn Brooklyn 3566671\n10 Manhattan Bronx 2091627\n# ℹ 40 more rows\n```\n:::\n:::\n\n\n## Your Turn\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?\n\n➡️ [Data Manipulation Part I Exercises Page](4_data_manipulation_2-exercises.html)\n\n# Window functions\n\n## What are window functions?\n\n- calculations within groups\n\n## Grouped summaries\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year <- nyc_taxi %>%\n filter(year %in% 2021:2022) %>%\n select(year, fare_amount)\n\nfare_by_year %>%\n group_by(year) %>%\n summarise(mean_fare = mean(fare_amount)) %>% \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 2\n year mean_fare\n \n1 2021 13.5\n2 2022 13.0\n```\n:::\n:::\n\n\n## Window functions\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n group_by(year) %>%\n mutate(mean_fare = mean(fare_amount)) %>% \n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: window functions not currently supported in Arrow\nCall collect() first to pull data into R.\n```\n:::\n:::\n\n\n## Window functions - via joins\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n left_join(\n nyc_taxi %>%\n filter(year %in% 2021:2022) %>%\n group_by(year) %>%\n summarise(mean_fare = mean(fare_amount))\n ) %>% \n arrange(desc(fare_amount)) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 36,345,864 × 3\n year fare_amount mean_fare\n \n 1 2021 818283. 13.5\n 2 2022 401092. 13.0\n 3 2021 398466. 13.5\n 4 2021 395854. 13.5\n 5 2021 6965 13.5\n 6 2021 6960. 13.5\n 7 2021 6010 13.5\n 8 2021 5954 13.5\n 9 2021 4969 13.5\n10 2021 3555. 13.5\n# ℹ 36,345,854 more rows\n```\n:::\n:::\n\n\n## Window functions - via duckdb\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfare_by_year %>%\n group_by(year) %>%\n to_duckdb() %>%\n mutate(mean_fare = mean(fare_amount)) %>% \n to_arrow() %>%\n arrange(desc(fare_amount)) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 36,345,864 × 3\n year fare_amount mean_fare\n \n 1 2021 818283. 13.5\n 2 2022 401092. 13.0\n 3 2021 398466. 13.5\n 4 2021 395854. 13.5\n 5 2021 6965 13.5\n 6 2021 6960. 13.5\n 7 2021 6010 13.5\n 8 2021 5954 13.5\n 9 2021 4969 13.5\n10 2021 3555. 13.5\n# ℹ 36,345,854 more rows\n```\n:::\n:::\n\n\n## Your Turn\n\n1. How many trips in 2019 had a longer than average distance for that year?\n\n➡️ [Data Manipulation Part I Exercises Page](4_data_manipulation_2-exercises.html)\n\n## Custom functions\n\n- Not officially supported\n- Works for simple operations but not with bindings\n\n## Custom functions - supported\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmillions <- function(x) x / 10^6\n\nnyc_taxi |>\n group_by(vendor_name) %>%\n summarise(trips = n()) %>%\n mutate(\n trips_mil = millions(trips)\n ) %>%\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 3\n vendor_name trips trips_mil\n \n1 CMT 531806845 532. 
\n2 VTS 621279093 621. \n3 2709974 2.71\n```\n:::\n:::\n\n\n## Custom functions - not supported\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmorning <- function(x) ifelse(lubridate::am(x), \"morning\", \"afternoon\")\nnyc_taxi |>\n group_by(morning(pickup_datetime)) %>%\n count() %>%\n collect()\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: Expression morning(pickup_datetime) not supported in Arrow\nCall collect() first to pull data into R.\n```\n:::\n:::\n\n\n- recommendation: write code as dplyr expressions instead of functions, or look up docs on user-defined functions for datasets (see `?register_scalar_function`)\n", + "supporting": [ + "4_data_manipulation_2_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/5_arrow_single_file/execute-results/html.json b/_freeze/materials/5_arrow_single_file/execute-results/html.json index 2d7363d..982b257 100644 --- a/_freeze/materials/5_arrow_single_file/execute-results/html.json +++ b/_freeze/materials/5_arrow_single_file/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "d1d75f3a4805a848aa8770f3f8dc1acf", + "hash": "2113a423a790087d30e1f144f55d37d9", "result": { - "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Arrow in R: In-Memory Workflows {#single-file-api}\n\n\n::: {.cell}\n\n:::\n\n\n## arrow 📦\n\n![](images/arrow-read-write-updated.png)\n\n## Arrow & Single Files\n\n
\n\n`library(arrow)`\n\n- `read_parquet()`\n- `read_csv_arrow()`\n- `read_feather()`\n- `read_json_arrow()`\n\n**Value**: `tibble` (the default), or an Arrow Table if `as_data_frame = FALSE` --- both *in-memory*\n\n## Your Turn\n\n1. Read in a single NYC Taxi parquet file using `read_parquet()` as an Arrow Table\n2. Convert your Arrow Table object to a `data.frame` or a `tibble`\n\n## Read a Parquet File (`tibble`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nparquet_file <- here::here(\"data/nyc-taxi/year=2019/month=9/part-0.parquet\")\n\ntaxi_df <- read_parquet(parquet_file)\ntaxi_df\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6,567,396 × 22\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 VTS 2019-09-01 06:14:09 2019-09-01 06:31:52 2\n 2 VTS 2019-09-01 06:36:17 2019-09-01 07:12:44 1\n 3 VTS 2019-09-01 06:29:19 2019-09-01 06:54:13 1\n 4 CMT 2019-09-01 06:33:09 2019-09-01 06:52:14 2\n 5 VTS 2019-09-01 06:57:43 2019-09-01 07:26:21 1\n 6 CMT 2019-09-01 06:59:16 2019-09-01 07:28:12 1\n 7 CMT 2019-09-01 06:20:06 2019-09-01 06:52:19 1\n 8 CMT 2019-09-01 06:27:54 2019-09-01 06:32:56 0\n 9 CMT 2019-09-01 06:35:08 2019-09-01 06:55:51 0\n10 CMT 2019-09-01 06:19:37 2019-09-01 06:30:52 1\n# ℹ 6,567,386 more rows\n# ℹ 18 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n:::\n:::\n\n\n## Read a Parquet File (`Table`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxi_table <- read_parquet(parquet_file, as_data_frame = FALSE)\ntaxi_table\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n6567396 rows x 22 columns\n$vendor_name \n$pickup_datetime \n$dropoff_datetime \n$passenger_count \n$trip_distance \n$pickup_longitude \n$pickup_latitude \n$rate_code \n$store_and_fwd \n$dropoff_longitude \n$dropoff_latitude \n$payment_type \n$fare_amount \n$extra \n$mta_tax \n$tip_amount \n$tolls_amount \n$total_amount \n$improvement_surcharge \n$congestion_surcharge \n$pickup_location_id \n$dropoff_location_id \n```\n:::\n:::\n\n\n## `tibble` \\<-\\> `Table` \\<-\\> `data.frame`\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\n#change a df to a table\narrow_table(taxi_df)\n\n#change a table to a df\ntaxi_table |> collect()\n\nas.data.frame(taxi_table)\n\nas_tibble(taxi_table)\n```\n:::\n\n\n
\n\n- `data.frame` & `tibble` are in R memory\n- `Table` is in Arrow memory\n\n## Data frames\n\n![](images/tabular-structures-r.png)\n\n## Arrow Tables\n\n![](images/tabular-structures-arrow-1.png)\n\n::: notes\nArrow Tables are collections of chunked arrays\n:::\n\n## Table \\| Dataset: A `dplyr` pipeline\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 4\n vendor_name all_trips shared_trips pct_shared\n \n1 VTS 4238808 1339478 31.6\n2 CMT 2294473 470344 20.5\n3 34115 0 0 \n```\n:::\n:::\n\n\n
\n\nFunctions available in Arrow dplyr queries: \n\n::: notes\nAll the same capabilities as you practiced with Arrow `Dataset`\n:::\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 6567396\n```\n:::\n:::\n\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet() |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.222 0.631 0.761 \n```\n:::\n:::\n\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 6567396\n```\n:::\n:::\n\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.136 0.567 0.419 \n```\n:::\n:::\n\n\n## Read a Parquet File Selectively\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n6567396 rows x 2 columns\n$vendor_name \n$passenger_count \n```\n:::\n:::\n\n\n## Selective Reads Are Faster\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,3,11\"}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n ) |> \n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.405 0.074 0.209 \n```\n:::\n:::\n\n\n## Arrow Table or Dataset?\n\n![](images/2022-09-decision-map.png){.absolute left=\"200\" height=\"550\"}\n\n::: {style=\"font-size: 60%; margin-top: 575px; margin-left: 250px;\"}\n\n:::\n\n## Arrow for Improving Those Sluggish Worklows\n\n- a \"drop-in\" for many `dplyr` workflows (`Table` or `Dataset`)\n- works when your tabular data get too big for your RAM (`Dataset`)\n- provides tools for re-engineering data storage for better performance (`arrow::write_dataset()`)\n\n::: notes\nLot's of ways to speed up sluggish workflows e.g. [writing more performant tidyverse code](https://www.tidyverse.org/blog/2023/04/performant-packages/), use other data frame libraries like data.table or polars, use duckDB or other databases, Spark + splarklyr ... However, Arrow offers some attractive features for tackling this challenge, especially for dplyr users.\n:::\n", + "markdown": "---\nfooter: \"[🔗 posit.io/arrow](https://posit-conf-2023.github.io/arrow)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\n---\n\n\n# Arrow in R: In-Memory Workflows {#single-file-api}\n\n\n::: {.cell}\n\n:::\n\n\n## arrow 📦\n\n![](images/arrow-read-write-updated.png)\n\n## Arrow & Single Files\n\n
\n\n`library(arrow)`\n\n- `read_parquet()`\n- `read_csv_arrow()`\n- `read_feather()`\n- `read_json_arrow()`\n\n**Value**: `tibble` (the default), or an Arrow Table if `as_data_frame = FALSE` --- both *in-memory*\n\n## Your Turn\n\n1. Read in a single NYC Taxi parquet file using `read_parquet()` as an Arrow Table\n2. Convert your Arrow Table object to a `data.frame` or a `tibble`\n\n## Read a Parquet File (`tibble`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nparquet_file <- here::here(\"data/nyc-taxi/year=2019/month=9/part-0.parquet\")\n\ntaxi_df <- read_parquet(parquet_file)\ntaxi_df\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6,567,396 × 22\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 VTS 2019-09-01 06:14:09 2019-09-01 06:31:52 2\n 2 VTS 2019-09-01 06:36:17 2019-09-01 07:12:44 1\n 3 VTS 2019-09-01 06:29:19 2019-09-01 06:54:13 1\n 4 CMT 2019-09-01 06:33:09 2019-09-01 06:52:14 2\n 5 VTS 2019-09-01 06:57:43 2019-09-01 07:26:21 1\n 6 CMT 2019-09-01 06:59:16 2019-09-01 07:28:12 1\n 7 CMT 2019-09-01 06:20:06 2019-09-01 06:52:19 1\n 8 CMT 2019-09-01 06:27:54 2019-09-01 06:32:56 0\n 9 CMT 2019-09-01 06:35:08 2019-09-01 06:55:51 0\n10 CMT 2019-09-01 06:19:37 2019-09-01 06:30:52 1\n# ℹ 6,567,386 more rows\n# ℹ 18 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n:::\n:::\n\n\n## Read a Parquet File (`Table`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxi_table <- read_parquet(parquet_file, as_data_frame = FALSE)\ntaxi_table\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n6567396 rows x 22 columns\n$vendor_name \n$pickup_datetime \n$dropoff_datetime \n$passenger_count \n$trip_distance \n$pickup_longitude \n$pickup_latitude \n$rate_code \n$store_and_fwd \n$dropoff_longitude \n$dropoff_latitude \n$payment_type \n$fare_amount \n$extra \n$mta_tax \n$tip_amount \n$tolls_amount \n$total_amount \n$improvement_surcharge \n$congestion_surcharge \n$pickup_location_id \n$dropoff_location_id \n```\n:::\n:::\n\n\n## `tibble` \\<-\\> `Table` \\<-\\> `data.frame`\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\n#change a df to a table\narrow_table(taxi_df)\n\n#change a table to a df\ntaxi_table |> collect()\n\nas.data.frame(taxi_table)\n\nas_tibble(taxi_table)\n```\n:::\n\n\n
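A quick way to confirm that each conversion did what you expected is to check the class of the result; this is a minimal sketch, assuming `taxi_df` and `taxi_table` from the cells above are still in your session:

```r
class(taxi_df)                    # tibble / data.frame, held in R memory
class(arrow_table(taxi_df))       # Arrow Table, held in Arrow memory
class(taxi_table |> collect())    # collect() brings it back as a tibble
class(as.data.frame(taxi_table))  # plain data.frame
```
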
\n\n- `data.frame` & `tibble` are in R memory\n- `Table` is in Arrow memory\n\n## Data frames\n\n![](images/tabular-structures-r.png)\n\n## Arrow Tables\n\n![](images/tabular-structures-arrow-1.png)\n\n::: notes\nArrow Tables are collections of chunked arrays\n:::\n\n## Table \\| Dataset: A `dplyr` pipeline\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 4\n vendor_name all_trips shared_trips pct_shared\n \n1 CMT 2294473 470344 20.5\n2 VTS 4238808 1339478 31.6\n3 34115 0 0 \n```\n:::\n:::\n\n\n
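One thing worth noticing about the pipeline above: until `collect()` runs, the dplyr verbs only build up a lazy query. A small sketch, assuming `parquet_file` as defined earlier, that stops just short of collecting:

```r
# Build the query without executing it
query <- parquet_file |>
  read_parquet(as_data_frame = FALSE) |>
  group_by(vendor_name) |>
  summarise(all_trips = n(),
            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
  mutate(pct_shared = shared_trips / all_trips * 100)

class(query)     # an arrow_dplyr_query: nothing has been computed yet
collect(query)   # evaluation happens here
```
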
\n\nFunctions available in Arrow dplyr queries: \n\n::: notes\nAll the same capabilities as you practiced with Arrow `Dataset`\n:::\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 6567396\n```\n:::\n:::\n\n\n
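If you are ever unsure whether a function is supported inside an Arrow dplyr query, one way to check (a sketch to run interactively, not an executed slide cell) is the arrow package's dplyr help page and the list of compute kernels exposed by the underlying C++ library:

```r
# dplyr verbs and R functions that arrow can translate
?`arrow-dplyr`

# Arrow compute functions, filtered with a regular expression,
# e.g. everything whose name starts with "str"
arrow::list_compute_functions(pattern = "^str")
```
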
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet() |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 1.915 0.631 0.681 \n```\n:::\n:::\n\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 6567396\n```\n:::\n:::\n\n\n
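Before timing the grouped query on the Table, it can be instructive to time the two read paths themselves. A rough sketch using `{tictoc}` (installed as part of the workshop packages); the numbers will vary from machine to machine:

```r
library(tictoc)

tic("read_parquet() into a tibble")
taxi_df <- read_parquet(parquet_file)
toc()

tic("read_parquet() into an Arrow Table")
taxi_table <- read_parquet(parquet_file, as_data_frame = FALSE)
toc()
```
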
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 1.846 0.313 0.317 \n```\n:::\n:::\n\n\n## Read a Parquet File Selectively\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTable\n6567396 rows x 2 columns\n$vendor_name \n$passenger_count \n```\n:::\n:::\n\n\n## Selective Reads Are Faster\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,3,11\"}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n ) |> \n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.480 0.069 0.224 \n```\n:::\n:::\n\n\n## Arrow Table or Dataset?\n\n![](images/2022-09-decision-map.png){.absolute left=\"200\" height=\"550\"}\n\n::: {style=\"font-size: 60%; margin-top: 575px; margin-left: 250px;\"}\n\n:::\n\n## Arrow for Improving Those Sluggish Worklows\n\n- a \"drop-in\" for many `dplyr` workflows (`Table` or `Dataset`)\n- works when your tabular data get too big for your RAM (`Dataset`)\n- provides tools for re-engineering data storage for better performance (`arrow::write_dataset()`)\n\n::: notes\nLot's of ways to speed up sluggish workflows e.g. [writing more performant tidyverse code](https://www.tidyverse.org/blog/2023/04/performant-packages/), use other data frame libraries like data.table or polars, use duckDB or other databases, Spark + splarklyr ... However, Arrow offers some attractive features for tackling this challenge, especially for dplyr users.\n:::\n\n\n", "supporting": [ "5_arrow_single_file_files" ], diff --git a/_freeze/setup/execute-results/html.json b/_freeze/setup/execute-results/html.json index a32b18a..7259fe9 100644 --- a/_freeze/setup/execute-results/html.json +++ b/_freeze/setup/execute-results/html.json @@ -1,10 +1,8 @@ { - "hash": "3c820f6479e794dce6ec58d527b0ebf3", + "hash": "7008aac1d07d30b251677f7e2d2c1c7c", "result": { - "markdown": "---\ntitle: \"Packages & Data\"\nexecute:\n eval: false\n---\n\n\nWelcome to the `Big Data in R with Arrow` workshop! On this page you will find information about the software, packages and data we will be using during the workshop. Please try your very best to arrive on the day ready---software & packages installed and data downloaded on to your laptop.\n\n# Software\n\nYou will need a laptop with [R](https://cloud.r-project.org/) and the [RStudio Desktop IDE](https://posit.co/download/rstudio-desktop/) installed and with sufficient disk storage space for the workshop data sets and exercises---we recommend about \\~80GB to work with the \"larger-than-memory\" data option or \\~1-2GB to work with the smaller example---or \"tiny\"---data option. 
The workshop will also have a dedicated Posit Cloud work space ready with the packages and the tiny sample data sets loaded for those who need or prefer to participate through a browser.\n\n# Packages\n\nTo install the required core packages for the day, run the following:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ninstall.packages(c(\n \"here\", \"arrow\", \"dplyr\", \"duckdb\", \"dbplyr\", \"stringr\", \"lubridate\", \"tictoc\"\n))\n```\n:::\n\n\nAnd to load them:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(here)\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\nlibrary(dbplyr)\nlibrary(stringr)\nlibrary(lubridate)\nlibrary(tictoc)\n```\n:::\n\n\nPlease come with the *latest CRAN versions* of the core packages installed on your laptops.\n\nWhile the core workshop doesn't focus on spatial data, the \"wrapping up\" section near the end of day results in a ggplot map. If you want to run the \"wrapping up\" workflow code you will need to install the following spatial and plotting packages:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ninstall.packages(c(\"ggplot2\", \"ggrepel\", \"sf\", \"scales\", \"janitor\"))\n```\n:::\n\n\nAnd to load them:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggplot2)\nlibrary(ggrepel)\nlibrary(sf)\nlibrary(scales)\nlibrary(janitor)\n```\n:::\n\n\n# Data\n\nDuring the 1-day workshop, you will need the following data sets:\n\n1. *NYC Yellow Taxi Trip Record Data*: Partitioned parquet files released as open data from the [NYC Taxi & Limousine Commission (TLC)](https://www.nyc.gov/site/tlc/about/raw-data.page) with a pre-tidied subset (\\~40GB) downloaded with either `arrow` or via https from an AWS S3 bucket\n2. *Seattle Public Library Checkouts by Title*: A single CSV file (9GB) from the [Seattle Open Data portal](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6) downloaded via https from an AWS S3 bucket\n3. *Taxi Zone Lookup CSV Table & Taxi Zone Shapefile*: two NYC Taxi trip ancillary data files from the [TLC open platform](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) downloaded via https directly from this repository\n\n## Larger-Than-Memory Data Option\n\n### 1. NYC Taxi Data\n\nThis is the main data set we will need for the day. It's pretty hefty---*40* GB in total---and there are a couple of options for how to acquire it, depending on your internet connection speed.\n\n#### Option 1---the simplest option---for those with a good internet connection and happy to let things run\n\nIf you have a solid internet connection, and especially if you're in the US/Canada, this option is the simplest. You can use `arrow` itself to download the data. Note that there are no progress bars displayed during download, and so your session will appear to hang, but you can check progress by inspecting the contents of the download directory. 
When we tested this with Steph's laptop and a fast internet connection, it took 67 minutes, though results will likely vary widely.\n\nAfter installing arrow, run the following code:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\ndata_path <- here::here(\"data/nyc-taxi\") # Or set your own preferred path\n\nopen_dataset(\"s3://voltrondata-labs-datasets/nyc-taxi\") |>\n\tfilter(year %in% 2012:2021) |> \n\twrite_dataset(data_path, partitioning = c(\"year\", \"month\"))\n```\n:::\n\n\nOnce this has completed, you can check everything has downloaded correctly by calling:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(data_path) |>\n\tnrow()\n```\n:::\n\n\nIt might take a moment to run (the data has over a billion rows!), but you should expect to see:\n\n``` \n[1] 1150352666\n```\n\nIf you get an error message, your download may have been interrupted at some point. The error message will name the file which could not be read. Manually delete this file and run the `nrow()` code snippet again until you successfully load the remaining data. You can then download any missing files individually using option 2.\n\n#### Option 2---one file at a time via https\n\nIf you have a slower internet connection or are further away from the data S3 bucket location, it's probably going to be simpler to download the data file-by-file. Or, if you had any interruptions to your download process in the previous step, you can either try instead with this method, or delete the files which weren't downloaded properly, and use this method to just download the files you need.\n\nWe've created a script for you which downloads the data one file at a time via https. The script also checks for previously downloaded data, so if you encounter problems downloading any files, just delete the partially downloaded file and run again---the script will only download files which are missing.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload_via_https <- function(data_dir, years = 2012:2021){\n\n # Set this option as we'll be downloading large files and R has a default\n # timeout of 60 seconds, so we've updated this to 30 mins\n options(timeout = 1800)\n \n # The S3 bucket where the data is stored\n bucket <- \"https://voltrondata-labs-datasets.s3.us-east-2.amazonaws.com\"\n \n # Collect any errors raised during the download process\n problems <- c()\n \n # Download the data from S3 - loops through the data files, downloading 1 file at a time\n for (year in years) {\n \n # We only have 2 months for 2022 data\n if(year ==2022){\n months = 1:2\n } else {\n months = 1:12\n }\n \n for (month in months) {\n \n # Work out where we're going to be saving the data\n partition_dir <- paste0(\"year=\", year, \"/month=\", month)\n dest_dir <- file.path(data_dir, partition_dir)\n dest_file_path <- file.path(dest_dir, \"part-0.parquet\")\n \n # If the file doesn't exist\n if (!file.exists(dest_file_path)) {\n \n # Create the partition to store the data in\n if(!dir.exists(dest_dir)){\n dir.create(dest_dir, recursive = TRUE)\n }\n \n # Work out where we are going to be retrieving the data from\n source_path <- file.path(bucket, \"nyc-taxi\", partition_dir, \"part-0.parquet\")\n \n # Download the data - save any error messages that occur\n tryCatch(\n download.file(source_path, dest_file_path, mode = \"wb\"),\n error = function(e){\n problems <- c(problems, e$message)\n }\n )\n }\n }\n }\n \n print(\"Downloads complete\")\n \n if(length(problems) > 0){\n warning(call. 
= FALSE, \"The following errors occurred during download:\\n\", paste(problems, collapse = \"\\n\"))\n }\n}\n\n\ndata_path <- here::here(\"data/nyc-taxi\") # Or set your own preferred path\n\ndownload_via_https(data_path)\n```\n:::\n\n\nOnce this has completed, you can check everything has downloaded correctly by calling:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(data_path) |>\n\tnrow()\n```\n:::\n\n\nIt might take a moment to run (the data has over a billion rows), but you should expect to see:\n\n``` \n[1] 1150352666\n```\n\nIf you get an error message, your download may have been interrupted at some point. The error message will name the file which could not be read. Manually delete this file and run the `nrow()` code snippet again until you successfully load the data. You can then download any missing files by re-running `download_via_https(data_path)`.\n\n### 2. Seattle Checkouts by Title Data\n\nThis is the data we will use to explore some data storage and engineering options. It's a good sized, single CSV file---*9GB* on-disk in total, which can be downloaded from the an AWS S3 bucket via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv\",\n destfile = here::here(\"data/seattle-library-checkouts.csv\")\n)\n```\n:::\n\n\n## Tiny Data Option\n\nIf you don't have time or disk space to download the larger-than-memory data sets (and still have disk space do the exercises), you can run the code and exercises in the course with \"tiny\" versions of these data. Although the focus in this course is working with larger-than-memory data, you can still learn about the concepts and workflows with smaller data---although note you may not see the same performance improvements that you would get when working with larger data.\n\n### 1. Tiny NYC Taxi Data\n\nWe've created a \"tiny\" NYC Taxi data set which contains only 1 in 1000 rows from the original data set. So instead of working with 1.15 billion rows of data and about 40GB of files, the tiny taxi data set is 1.15 million rows and about 50MB of files. You can download the tiny NYC Taxi data directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/nyc-taxi-tiny.zip\",\n destfile = here::here(\"data/nyc-taxi-tiny.zip\")\n)\n\n# Extract the partitioned parquet files from the zip folder:\nunzip(\n zipfile = here::here(\"data/nyc-taxi-tiny.zip\"), \n exdir = here::here(\"data/\")\n)\n```\n:::\n\n\n### 2. Tiny Seattle Checkouts by Title Data\n\nWe've created a \"tiny\" Seattle Checkouts by Title data set which contains only 1 in 100 rows from the original data set. So instead of working with \\~41 million rows of data in a 9GB file, the tiny Seattle checkouts data set is \\~410 thousand rows and in an 90MB file. You can download the tiny Seattle Checkouts by Title data directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/seattle-library-checkouts-tiny.csv\",\n destfile = here::here(\"data/seattle-library-checkouts-tiny.csv\")\n)\n```\n:::\n\n\n## Both Data Options / Everyone\n\n### 3. 
Taxi Zone Lookup CSV Table & Taxi Zone Shapefile\n\nYou can download the two NYC Taxi trip ancillary data files directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/taxi_zone_lookup.csv\",\n destfile = here::here(\"data/taxi_zone_lookup.csv\")\n)\n\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/taxi_zones.zip\",\n destfile = here::here(\"data/taxi_zones.zip\")\n)\n\n# Extract the spatial files from the zip folder:\nunzip(\n zipfile = here::here(\"data/taxi_zones.zip\"), \n exdir = here::here(\"data/taxi_zones\")\n)\n```\n:::\n\n\n## Data on The Day Of\n\nWhile we ask that everyone do their best to arrive on the day ready with your software & packages installed and the data downloaded on to your laptops---we recognize that life happens. We will have 5 USB flash drives (and a few USB-C to USB adapters) on-hand the morning of the workshop with copies of ***all the larger-than-memory and tiny versions of the data sets***. So if you have trouble downloading any of the data beforehand, we should be able to get you sorted before the day starts or as soon as possible as the day begins. And if disk space or laptop permissions are blockers, the workshop will have a dedicated Posit Cloud work space ready for participants.\n", - "supporting": [ - "setup_files" - ], + "markdown": "---\ntitle: \"Packages & Data\"\nexecute:\n eval: false\n---\n\n\nWelcome to the `Big Data in R with Arrow` workshop! On this page you will find information about the software, packages and data we will be using during the workshop. Please try your very best to arrive on the day ready---software & packages installed and data downloaded on to your laptop.\n\n# Software\n\nYou will need a laptop with [R](https://cloud.r-project.org/) and the [RStudio Desktop IDE](https://posit.co/download/rstudio-desktop/) installed and with sufficient disk storage space for the workshop data sets and exercises---we recommend about \\~80GB to work with the \"larger-than-memory\" data option or \\~1-2GB to work with the smaller example---or \"tiny\"---data option. The workshop will also have a dedicated Posit Cloud work space ready with the packages and the tiny sample data sets loaded for those who need or prefer to participate through a browser.\n\n# Packages\n\nTo install the required core packages for the day, run the following:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ninstall.packages(c(\n \"here\", \"arrow\", \"dplyr\", \"duckdb\", \"dbplyr\", \"stringr\", \"lubridate\", \"tictoc\"\n))\n```\n:::\n\n\nAnd to load them:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(here)\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\nlibrary(dbplyr)\nlibrary(stringr)\nlibrary(lubridate)\nlibrary(tictoc)\n```\n:::\n\n\nPlease come with the *latest CRAN versions* of the core packages installed on your laptops. **You will need the latest version of the arrow package installed, which is 13.0.0**. A small amount of the code in the workshop relies on functions introduced in arrow 13.0.0; let us know if for any reason you are unable to install this version, and we'll flag up any modifications you'll need to make to the code to run with earlier versions of arrow.\n\nWhile the core workshop doesn't focus on spatial data, the \"wrapping up\" section near the end of day results in a ggplot map. 
If you want to run the \"wrapping up\" workflow code you will need to install the following spatial and plotting packages:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ninstall.packages(c(\"ggplot2\", \"ggrepel\", \"sf\", \"scales\", \"janitor\"))\n```\n:::\n\n\nAnd to load them:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggplot2)\nlibrary(ggrepel)\nlibrary(sf)\nlibrary(scales)\nlibrary(janitor)\n```\n:::\n\n\n# Data\n\nDuring the 1-day workshop, you will need the following data sets:\n\n1. *NYC Yellow Taxi Trip Record Data*: Partitioned parquet files released as open data from the [NYC Taxi & Limousine Commission (TLC)](https://www.nyc.gov/site/tlc/about/raw-data.page) with a pre-tidied subset (\\~40GB) downloaded with either `arrow` or via https from an AWS S3 bucket\n2. *Seattle Public Library Checkouts by Title*: A single CSV file (9GB) from the [Seattle Open Data portal](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6) downloaded via https from an AWS S3 bucket\n3. *Taxi Zone Lookup CSV Table & Taxi Zone Shapefile*: two NYC Taxi trip ancillary data files from the [TLC open platform](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) downloaded via https directly from this repository\n\n## Larger-Than-Memory Data Option\n\n### 1. NYC Taxi Data\n\nThis is the main data set we will need for the day. It's pretty hefty---*40* GB in total---and there are a couple of options for how to acquire it, depending on your internet connection speed.\n\n#### Option 1---the simplest option---for those with a good internet connection and happy to let things run\n\nIf you have a solid internet connection, and especially if you're in the US/Canada, this option is the simplest. You can use `arrow` itself to download the data. Note that there are no progress bars displayed during download, and so your session will appear to hang, but you can check progress by inspecting the contents of the download directory. When we tested this with Steph's laptop and a fast internet connection, it took 67 minutes, though results will likely vary widely.\n\nAfter installing arrow, run the following code:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\ndata_path <- here::here(\"data/nyc-taxi\") # Or set your own preferred path\n\nopen_dataset(\"s3://voltrondata-labs-datasets/nyc-taxi\") |>\n\tfilter(year %in% 2012:2021) |> \n\twrite_dataset(data_path, partitioning = c(\"year\", \"month\"))\n```\n:::\n\n\nOnce this has completed, you can check everything has downloaded correctly by calling:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(data_path) |>\n\tnrow()\n```\n:::\n\n\nIt might take a moment to run (the data has over a billion rows!), but you should expect to see:\n\n``` \n[1] 1150352666\n```\n\nIf you get an error message, your download may have been interrupted at some point. The error message will name the file which could not be read. Manually delete this file and run the `nrow()` code snippet again until you successfully load the remaining data. You can then download any missing files individually using option 2.\n\n#### Option 2---one file at a time via https\n\nIf you have a slower internet connection or are further away from the data S3 bucket location, it's probably going to be simpler to download the data file-by-file. 
Or, if you had any interruptions to your download process in the previous step, you can either try instead with this method, or delete the files which weren't downloaded properly, and use this method to just download the files you need.\n\nWe've created a script for you which downloads the data one file at a time via https. The script also checks for previously downloaded data, so if you encounter problems downloading any files, just delete the partially downloaded file and run again---the script will only download files which are missing.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload_via_https <- function(data_dir, years = 2012:2021){\n\n # Set this option as we'll be downloading large files and R has a default\n # timeout of 60 seconds, so we've updated this to 30 mins\n options(timeout = 1800)\n \n # The S3 bucket where the data is stored\n bucket <- \"https://voltrondata-labs-datasets.s3.us-east-2.amazonaws.com\"\n \n # Collect any errors raised during the download process\n problems <- c()\n \n # Download the data from S3 - loops through the data files, downloading 1 file at a time\n for (year in years) {\n \n # We only have 2 months for 2022 data\n if(year ==2022){\n months = 1:2\n } else {\n months = 1:12\n }\n \n for (month in months) {\n \n # Work out where we're going to be saving the data\n partition_dir <- paste0(\"year=\", year, \"/month=\", month)\n dest_dir <- file.path(data_dir, partition_dir)\n dest_file_path <- file.path(dest_dir, \"part-0.parquet\")\n \n # If the file doesn't exist\n if (!file.exists(dest_file_path)) {\n \n # Create the partition to store the data in\n if(!dir.exists(dest_dir)){\n dir.create(dest_dir, recursive = TRUE)\n }\n \n # Work out where we are going to be retrieving the data from\n source_path <- file.path(bucket, \"nyc-taxi\", partition_dir, \"part-0.parquet\")\n \n # Download the data - save any error messages that occur\n tryCatch(\n download.file(source_path, dest_file_path, mode = \"wb\"),\n error = function(e){\n problems <- c(problems, e$message)\n }\n )\n }\n }\n }\n \n print(\"Downloads complete\")\n \n if(length(problems) > 0){\n warning(call. = FALSE, \"The following errors occurred during download:\\n\", paste(problems, collapse = \"\\n\"))\n }\n}\n\n\ndata_path <- here::here(\"data/nyc-taxi\") # Or set your own preferred path\n\ndownload_via_https(data_path)\n```\n:::\n\n\nOnce this has completed, you can check everything has downloaded correctly by calling:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(data_path) |>\n\tnrow()\n```\n:::\n\n\nIt might take a moment to run (the data has over a billion rows), but you should expect to see:\n\n``` \n[1] 1150352666\n```\n\nIf you get an error message, your download may have been interrupted at some point. The error message will name the file which could not be read. Manually delete this file and run the `nrow()` code snippet again until you successfully load the data. You can then download any missing files by re-running `download_via_https(data_path)`.\n\n### 2. Seattle Checkouts by Title Data\n\nThis is the data we will use to explore some data storage and engineering options. 
It's a good sized, single CSV file---*9GB* on-disk in total, which can be downloaded from the an AWS S3 bucket via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv\",\n destfile = here::here(\"data/seattle-library-checkouts.csv\")\n)\n```\n:::\n\n\n## Tiny Data Option\n\nIf you don't have time or disk space to download the larger-than-memory data sets (and still have disk space do the exercises), you can run the code and exercises in the course with \"tiny\" versions of these data. Although the focus in this course is working with larger-than-memory data, you can still learn about the concepts and workflows with smaller data---although note you may not see the same performance improvements that you would get when working with larger data.\n\n### 1. Tiny NYC Taxi Data\n\nWe've created a \"tiny\" NYC Taxi data set which contains only 1 in 1000 rows from the original data set. So instead of working with 1.15 billion rows of data and about 40GB of files, the tiny taxi data set is 1.15 million rows and about 50MB of files. You can download the tiny NYC Taxi data directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/nyc-taxi-tiny.zip\",\n destfile = here::here(\"data/nyc-taxi-tiny.zip\")\n)\n\n# Extract the partitioned parquet files from the zip folder:\nunzip(\n zipfile = here::here(\"data/nyc-taxi-tiny.zip\"), \n exdir = here::here(\"data/\")\n)\n```\n:::\n\n\n### 2. Tiny Seattle Checkouts by Title Data\n\nWe've created a \"tiny\" Seattle Checkouts by Title data set which contains only 1 in 100 rows from the original data set. So instead of working with \\~41 million rows of data in a 9GB file, the tiny Seattle checkouts data set is \\~410 thousand rows and in an 90MB file. You can download the tiny Seattle Checkouts by Title data directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/seattle-library-checkouts-tiny.csv\",\n destfile = here::here(\"data/seattle-library-checkouts-tiny.csv\")\n)\n```\n:::\n\n\n## Both Data Options / Everyone\n\n### 3. Taxi Zone Lookup CSV Table & Taxi Zone Shapefile\n\nYou can download the two NYC Taxi trip ancillary data files directly from this repo via https:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/taxi_zone_lookup.csv\",\n destfile = here::here(\"data/taxi_zone_lookup.csv\")\n)\n\ndownload.file(\n url = \"https://github.com/posit-conf-2023/arrow/releases/download/v0.1.0/taxi_zones.zip\",\n destfile = here::here(\"data/taxi_zones.zip\")\n)\n\n# Extract the spatial files from the zip folder:\nunzip(\n zipfile = here::here(\"data/taxi_zones.zip\"), \n exdir = here::here(\"data/taxi_zones\")\n)\n```\n:::\n\n\n## Data on The Day Of\n\nWhile we ask that everyone do their best to arrive on the day ready with your software & packages installed and the data downloaded on to your laptops---we recognize that life happens. We will have 5 USB flash drives (and a few USB-C to USB adapters) on-hand the morning of the workshop with copies of ***all the larger-than-memory and tiny versions of the data sets***. So if you have trouble downloading any of the data beforehand, we should be able to get you sorted before the day starts or as soon as possible as the day begins. 
And if disk space or laptop permissions are blockers, the workshop will have a dedicated Posit Cloud work space ready for participants.\n", + "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/materials/1_hello_arrow-exercises.qmd b/materials/1_hello_arrow-exercises.qmd index bc7f1ca..88e6e9c 100644 --- a/materials/1_hello_arrow-exercises.qmd +++ b/materials/1_hello_arrow-exercises.qmd @@ -23,7 +23,6 @@ taxi_size <- tibble( taxi_size taxi_size |> summarise(total_GB = sum(size_GB)) -## or demo/show data size interactively using Finder ``` ```{r} diff --git a/materials/2_data_manipulation_1-exercises.qmd b/materials/2_data_manipulation_1-exercises.qmd index f645a2c..2e6bb7b 100644 --- a/materials/2_data_manipulation_1-exercises.qmd +++ b/materials/2_data_manipulation_1-exercises.qmd @@ -27,7 +27,7 @@ nyc_taxi 1. How many taxi fares in the dataset had a total amount greater than \$100? -2. How many distinct pickup locations are in the dataset? +2. How many distinct pickup locations are in the dataset since 2016? ## Solution 1 @@ -44,8 +44,8 @@ nyc_taxi %>% ```{r} #| label: compute-collect-2 #| cache: true -#| eval: false nyc_taxi %>% + filter(year >= 2016) %>% distinct(pickup_longitude, pickup_latitude) %>% compute() %>% nrow() @@ -69,6 +69,7 @@ nyc_taxi %>% ```{r} #| label: compute-collect-sol1 +#| eval: false nyc_taxi %>% filter(str_ends(vendor_name, "S"), year == 2020, month == 9) %>% collect() @@ -79,8 +80,10 @@ nyc_taxi %>% ```{r} #| label: compute-collect-sol2 #| error: true +#| eval: false nyc_taxi %>% mutate(vendor_name = stringr::str_replace_na(vendor_name, "No vendor")) %>% + head() %>% collect() ``` @@ -91,6 +94,7 @@ This won't work as `stringr::str_replace_na()` hasn't been implemented in Arrow. #| eval: false nyc_taxi %>% mutate(vendor_name = ifelse(is.na(vendor_name), "No vendor", vendor_name)) %>% + head() %>% collect() ``` @@ -98,6 +102,7 @@ Or, if you only needed a subset of the data, you could apply the function after ```{r} #| label: compute-collect-sol4 +#| eval: false nyc_taxi %>% filter(year == 2019, month == 10) %>% # smaller subset of the data collect() %>% diff --git a/materials/2_data_manipulation_1.qmd b/materials/2_data_manipulation_1.qmd index e7439ef..0f3ae86 100644 --- a/materials/2_data_manipulation_1.qmd +++ b/materials/2_data_manipulation_1.qmd @@ -151,16 +151,6 @@ nyc_taxi %>% ➡️ [Data Manipulation Part I Exercises Page](2_data_manipulation_1-exercises.html) -## use `glimpse()` to preview datasets - -```{r} -#| label: glimpse -#| eval: false - -nyc_taxi %>% - glimpse() -``` - ## use `head()` then `collect()` to preview output for large queries How much were fares in GBP (£)? 
@@ -199,7 +189,6 @@ fares_pounds %>% ```{r} #| label: across -#| eval: false taxis_gbp <- nyc_taxi %>% mutate(across(ends_with("amount"), list(pounds = ~.x * 0.79))) @@ -211,7 +200,6 @@ taxis_gbp ```{r} #| label: across-2 -#| eval: false taxis_gbp %>% select(contains("amount")) %>% @@ -241,9 +229,8 @@ long_rides_2021 %>% ```{r} #| label: get-help -#| eval: false -?arrow-dplyr +?`arrow-dplyr` ``` ## A different function @@ -289,7 +276,6 @@ nyc_taxi %>% ```{r} #| label: pivot-duckdb -#| eval: false library(duckdb) @@ -301,6 +287,10 @@ nyc_taxi %>% to_arrow() %>% # return data back to arrow collect() ``` +::: {.callout-caution collapse="true"} +## Requires arrow 13.0.0 +This code requires arrow 13.0.0 or above to run, due to a bug which was fixed in this version +::: # Using functions inside verbs @@ -337,9 +327,8 @@ nyc_taxi %>% ```{r} #| label: get-help-funcs -#| eval: false -?arrow-dplyr +?`arrow-dplyr` ``` ## Your Turn diff --git a/materials/3_data_engineering-exercises.qmd b/materials/3_data_engineering-exercises.qmd index d5ce5eb..96dc3e6 100644 --- a/materials/3_data_engineering-exercises.qmd +++ b/materials/3_data_engineering-exercises.qmd @@ -66,7 +66,7 @@ seattle_csv <- open_dataset( sources = here::here("data/seattle-library-checkouts.csv"), format = "csv", skip = 1, - schema( + schema( UsageClass = utf8(), CheckoutType = utf8(), MaterialType = utf8(), diff --git a/materials/3_data_engineering.qmd b/materials/3_data_engineering.qmd index abd4944..70f1864 100644 --- a/materials/3_data_engineering.qmd +++ b/materials/3_data_engineering.qmd @@ -107,13 +107,6 @@ seattle_csv |> glimpse() seattle_csv$schema ``` -```{r} -#| label: seattle-schema-new -#| echo: false -#| eval: false -schema(seattle_csv) -``` - ## Parsing the Metadata
@@ -158,13 +151,6 @@ Arrow has a rich data type system, including direct analogs of many R data types seattle_csv$schema$code() ``` -```{r} -#| label: seattle-schema-code-new -#| echo: false -#| eval: false -schema(seattle_csv) -``` - ## Let's Control the Schema ```{r} diff --git a/materials/4_data_manipulation_2-exercises.qmd b/materials/4_data_manipulation_2-exercises.qmd index 420bfd0..02df53c 100644 --- a/materials/4_data_manipulation_2-exercises.qmd +++ b/materials/4_data_manipulation_2-exercises.qmd @@ -61,7 +61,7 @@ nyc_taxi |> ::: panel-tabset ## Problem -1. How many trips in 2019 had a longer than average distance for that year? +1. How many trips in September 2019 had a longer than average distance for that month? ## Solution 1 @@ -69,9 +69,8 @@ nyc_taxi |> ```{r} #| label: window-duckdb -#| eval: false nyc_taxi %>% - filter(year == 2019) %>% + filter(year == 2019, month == 9) %>% to_duckdb() %>% mutate(mean_distance = mean(trip_distance)) %>% to_arrow() %>% @@ -85,9 +84,10 @@ nyc_taxi %>% ```{r} #| label: window-join nyc_taxi %>% - filter(year == 2019) %>% + filter(year == 2019, month == 9) %>% left_join( nyc_taxi %>% + filter(year == 2019, month == 9) %>% group_by(year) %>% summarise(mean_distance = mean(trip_distance)) ) %>% diff --git a/materials/4_data_manipulation_2.qmd b/materials/4_data_manipulation_2.qmd index 172eff8..c3f0993 100644 --- a/materials/4_data_manipulation_2.qmd +++ b/materials/4_data_manipulation_2.qmd @@ -218,6 +218,7 @@ fare_by_year %>% fare_by_year %>% left_join( nyc_taxi %>% + filter(year %in% 2021:2022) %>% group_by(year) %>% summarise(mean_fare = mean(fare_amount)) ) %>% @@ -229,7 +230,6 @@ fare_by_year %>% ```{r} #| label: window-duckdb -#| eval: false fare_by_year %>% group_by(year) %>% diff --git a/setup.qmd b/setup.qmd index 4f79b58..97014a6 100644 --- a/setup.qmd +++ b/setup.qmd @@ -38,7 +38,7 @@ library(lubridate) library(tictoc) ``` -Please come with the *latest CRAN versions* of the core packages installed on your laptops. +Please come with the *latest CRAN versions* of the core packages installed on your laptops. **You will need the latest version of the arrow package installed, which is 13.0.0**. A small amount of the code in the workshop relies on functions introduced in arrow 13.0.0; let us know if for any reason you are unable to install this version, and we'll flag up any modifications you'll need to make to the code to run with earlier versions of arrow. While the core workshop doesn't focus on spatial data, the "wrapping up" section near the end of day results in a ggplot map. If you want to run the "wrapping up" workflow code you will need to install the following spatial and plotting packages: