Skip to content

Commit

Permalink
left join using duckdb_left_join
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Oct 20, 2024
1 parent 14a9f8a commit 88c46f8
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 20 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: censobr
Title: Download Data from Brazil's Population Census
Version: 0.4.0
Version: 0.4.0999
Authors@R:
c(person(given="Rafael H. M.", family="Pereira",
email="[email protected]",
Expand Down Expand Up @@ -38,7 +38,6 @@ Imports:
DBI,
dplyr,
duckdb,
duckplyr,
fs,
tools
Suggests:
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# censobr v0.4.0999 dev

* Bug fixes
* Passing parameter `merge_households = TRUE` now returns the expected result.

# censobr v0.4.0

* Major changes
Expand Down
40 changes: 40 additions & 0 deletions R/duckdb_left_join.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#' Perform a left join between two tables with duckdb
#'
#' Builds a `CREATE TEMPORARY TABLE <output_tb> AS SELECT * FROM <x>
#' LEFT JOIN <y> ON ...` statement and executes it on `con`. The `ON` clause
#' is an equality match on every column listed in `key_cols`, combined with
#' `AND`. Table and column names are quoted with `DBI::dbQuoteIdentifier()`
#' before being inserted into the SQL text.
#'
#' @param con A db connection
#' @param x String. Name of the left table, present in `con`
#' @param y String. Name of the right table, present in `con`
#' @param output_tb String. Name of the new temporary table to be written in
#'   `con`
#' @param key_cols Character vector. Names of the columns, present in both `x`
#'   and `y`, used to perform the left join. Must contain at least one name.
#'
#' @return The value returned by `DBI::dbExecute()`. The function is called
#'   for its side effect: writing the result of the left join as a new
#'   temporary table in `con`.
#'
#' @keywords internal
duckdb_left_join <- function(con, x, y, output_tb, key_cols) {

  # Validate inputs up front: an empty or malformed name would otherwise be
  # pasted into the SQL text and only fail later with an opaque parser error.
  is_single_name <- function(value) {
    is.character(value) && length(value) == 1L && !is.na(value) &&
      nzchar(value)
  }
  if (!is_single_name(x)) {
    stop("`x` must be a single non-empty string.", call. = FALSE)
  }
  if (!is_single_name(y)) {
    stop("`y` must be a single non-empty string.", call. = FALSE)
  }
  if (!is_single_name(output_tb)) {
    stop("`output_tb` must be a single non-empty string.", call. = FALSE)
  }
  if (!is.character(key_cols) || length(key_cols) == 0L ||
      anyNA(key_cols) || !all(nzchar(key_cols))) {
    stop(
      "`key_cols` must be a non-empty character vector of column names.",
      call. = FALSE
    )
  }

  # Quote identifiers so that names containing spaces, reserved words or
  # special characters cannot break (or alter the meaning of) the statement.
  quote_id <- function(id) {
    as.character(DBI::dbQuoteIdentifier(con, id))
  }
  x_id <- quote_id(x)
  y_id <- quote_id(y)
  output_id <- quote_id(output_tb)
  key_ids <- quote_id(key_cols)

  # Create dynamic ON condition for matching key columns between `x` and `y`
  match_conditions <- paste(
    paste0(x_id, ".", key_ids, " = ", y_id, ".", key_ids),
    collapse = " AND "
  )

  # Construct the SQL match query
  query_match_case <- sprintf(
    "
    CREATE TEMPORARY TABLE %s AS
    SELECT *
    FROM %s
    LEFT JOIN %s
    ON %s;",
    output_id,        # Name of output table
    x_id,             # Left table
    y_id,             # Right table
    match_conditions  # Dynamic matching conditions based on key columns
  )

  DBI::dbExecute(con, query_match_case)
}
48 changes: 30 additions & 18 deletions R/merge_household.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,10 @@ merge_household_var <- function(df,
df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |>
dplyr::compute()

#### https://github.com/duckdb/duckdb-r/issues/72

# convert to duckdb
df <- arrow::to_duckdb(df)
df_household <- arrow::to_duckdb(df_household)

# register db connection
con <- DBI::dbConnect(
duckdb::duckdb(), ":memory:", read_only = FALSE,
con <- duckdb::dbConnect(
duckdb::duckdb(), ":memory:",
read_only = FALSE,
config=list("temp_directory" = fs::path_temp())
)

Expand All @@ -95,29 +90,46 @@ merge_household_var <- function(df,
# DBI::dbExecute(con, "PRAGMA threads=1; PRAGMA memory_limit='1GB';")
# dbExecute(conn = conn, paste0("PRAGMA memory_limit='12GB'"))
# appears to work.

# https://github.com/duckdb/duckdb-r/issues/83
# https://github.com/duckdb/duckdb-r/issues/72

## duckdb strategy

# register data to db
duckdb::duckdb_register(con, 'df', df)
duckdb::duckdb_register(con, 'df_household', df_household)
duckdb::duckdb_register_arrow(con, 'df', df)
duckdb::duckdb_register_arrow(con, 'df_household', df_household)

# merge
df_geo <- duckplyr::left_join(dplyr::tbl(con, "df"),
dplyr::tbl(con, "df_household"),
by = key_vars)
duckdb_left_join(con = con,
x = 'df',
y = 'df_household',
output_tb = 'geo_db',
key_cols = key_vars)

df_geo <- dplyr::tbl(con, "geo_db")
df_geo <- arrow::to_arrow(df_geo)
df_geo <- dplyr::compute(df_geo)

# back to arrow
df_geo <- arrow::to_arrow(df_geo)


# # DBPLYR strategy
# df <- arrow::to_duckdb(df)
# df_household <- arrow::to_duckdb(df_household)
# dplyr::copy_to(con, df)
# dplyr::copy_to(con, df_household)
#
# # merge
# df_geo <- dplyr::left_join(dplyr::tbl(con, "df"),
# dplyr::tbl(con, "df_household"),
# by = key_vars)
#
# df_geo <- arrow::to_arrow(df_geo)
# df_geo <- dplyr::compute(df_geo)

# remove duckdb instance
duckdb::duckdb_unregister_arrow(con, 'df')
duckdb::duckdb_unregister_arrow(con, 'df_household')
DBI::dbDisconnect(con, shutdown = TRUE)
rm(con)
gc()

return(df_geo)
}
26 changes: 26 additions & 0 deletions man/duckdb_left_join.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 88c46f8

Please sign in to comment.