Skip to content

Commit

Permalink
remove attach db and drop db
Browse files Browse the repository at this point in the history
  • Loading branch information
Tmonster committed Jul 3, 2024
1 parent 6adb62c commit a1e3845
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 182 deletions.
141 changes: 50 additions & 91 deletions duckdb-latest/join-duckdb-latest.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,6 @@ src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names
# Fail fast unless exactly three join-partner file paths were built in src_jn_y.
stopifnot(length(src_jn_y)==3L)
# Progress message listing the main dataset name followed by the join-partner dataset names.
cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", ")))

# Prepare a scratch namespace called `db` on connection `con`.
#
# Arguments:
#   con      connection handed unchanged to dbExecute().
#   db_file  path of the database file to attach when running on disk.
#   db       name that later statements use as a prefix (e.g. "q1" in "q1.ans").
#
# Behaviour depends on the flag `on_disk`, which is read from the enclosing
# environment (it is not an argument):
#   TRUE   remove any leftover `db_file`, then ATTACH it under the alias `db`;
#   FALSE  CREATE SCHEMA `db` in the current database.
#
# Returns the value of the dbExecute() call that was issued.
attach_and_use <- function(con, db_file, db) {
  if (on_disk) {
    # A previous solution may have failed during query execution and left the
    # database file (and its write-ahead log, "<db_file>.wal") behind.
    # unlink() silently ignores paths that do not exist, so no file.exists()
    # guard is needed.
    unlink(c(db_file, paste0(db_file, ".wal")))
    # Name the alias explicitly instead of relying on the one DuckDB derives
    # from the file name, so the catalog is always reachable as `db` (the name
    # detach_and_drop() and the queries use). Single quotes in the path are
    # doubled so they cannot terminate the SQL string literal early.
    quoted_path <- gsub("'", "''", db_file, fixed = TRUE)
    dbExecute(con, sprintf("ATTACH '%s' AS %s", quoted_path, db))
  } else {
    dbExecute(con, sprintf("CREATE SCHEMA %s", db))
  }
}

# Tear down the scratch namespace `db` on connection `con`.
#
# Arguments:
#   con      connection handed unchanged to dbExecute().
#   db_file  path of the attached database file (only touched when on disk).
#   db       name of the attached database / schema to remove.
#
# The flag `on_disk` is read from the enclosing environment:
#   FALSE  drop schema `db` and everything in it; returns dbExecute()'s value.
#   TRUE   detach `db`, then delete `db_file`; returns unlink()'s status.
detach_and_drop <- function(con, db_file, db) {
  # In-memory case first: nothing on the file system to clean up.
  if (!on_disk) {
    return(dbExecute(con, sprintf("DROP SCHEMA %s CASCADE", db)))
  }

  # On-disk case: release the database before removing its file.
  detach_sql <- sprintf("DETACH %s", db)
  dbExecute(con, detach_sql)
  unlink(db_file)
}

duckdb_join_db = sprintf('%s_%s_%s.db', gsub("-","_",solution), task, data_name)
if (file.exists(duckdb_join_db)) {
unlink(duckdb_join_db)
Expand Down Expand Up @@ -142,133 +121,113 @@ question = "small inner on int" # q1
# q1: inner join of x with small on id1. `fun` is a label passed to write.log().
# NOTE(review): every statement below appears twice, once against the
# schema-qualified table q1.ans (with attach_and_use/detach_and_drop calls) and
# once against plain `ans` — looks like both sides of a change were kept;
# confirm which set is intended before running.
fun = "inner_join"


attach_and_use(con, 'q1.db', 'q1')
# Run 1: the timed section covers building the result table and fetching its
# row count (nr) and column count (nc).
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q1.ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q1.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q1.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this span; its value is logged as mem_gb.
m = memory_usage()
# The checksum query (sums of v1 and v2) is timed separately from the join.
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q1.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result so that run 2 has to rebuild it.
invisible(dbExecute(con, "DROP TABLE IF EXISTS q1.ans"))
detach_and_drop(con, 'q1.db', 'q1')
attach_and_use(con, 'q1.db', 'q1')
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: the same statements again, logged with run=2L.
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q1.ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q1.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q1.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q1.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last rows of the result, then clean up.
print(dbGetQuery(con, "SELECT * FROM q1.ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM q1.ans WHERE ROWID > (SELECT count(*) FROM q1.ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS q1.ans"))
detach_and_drop(con, 'q1.db', 'q1')
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

# q2: inner join of x with medium on id2. `question` and `fun` are labels
# passed to write.log().
# NOTE(review): every statement below appears twice, once against the
# schema-qualified table q2.ans (with attach_and_use/detach_and_drop calls) and
# once against plain `ans` — looks like both sides of a change were kept;
# confirm which set is intended before running.
question = "medium inner on int" # q2
fun = "inner_join"


attach_and_use(con, 'q2.db', 'q2')
# Run 1: the timed section covers building the result table and fetching its
# row count (nr) and column count (nc).
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q2.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q2.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q2.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this span; its value is logged as mem_gb.
m = memory_usage()
# The checksum query (sums of v1 and v2) is timed separately from the join.
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q2.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result so that run 2 has to rebuild it.
invisible(dbExecute(con, "DROP TABLE IF EXISTS q2.ans"))
detach_and_drop(con, 'q2.db', 'q2')
attach_and_use(con, 'q2.db', 'q2')
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: the same statements again, logged with run=2L.
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q2.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q2.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q2.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q2.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last rows of the result, then clean up.
print(dbGetQuery(con, "SELECT * FROM q2.ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM q2.ans WHERE ROWID > (SELECT count(*) FROM q2.ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS q2.ans"))
detach_and_drop(con, 'q2.db', 'q2')
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

# q3: LEFT JOIN of x with medium on id2. `question` and `fun` are labels
# passed to write.log().
# NOTE(review): every statement below appears twice, once against the
# schema-qualified table q3.ans (with attach_and_use/detach_and_drop calls) and
# once against plain `ans` — looks like both sides of a change were kept;
# confirm which set is intended before running.
question = "medium outer on int" # q3
fun = "left_join"

attach_and_use(con, 'q3.db', 'q3')
# Run 1: the timed section covers building the result table and fetching its
# row count (nr) and column count (nc).
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q3.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q3.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q3.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this span; its value is logged as mem_gb.
m = memory_usage()
# The checksum query (sums of v1 and v2) is timed separately from the join.
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q3.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result so that run 2 has to rebuild it.
invisible(dbExecute(con, "DROP TABLE IF EXISTS q3.ans"))
detach_and_drop(con, 'q3.db', 'q3')
attach_and_use(con, 'q3.db', 'q3')
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: the same statements again, logged with run=2L.
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q3.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q3.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q3.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q3.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last rows of the result, then clean up.
print(dbGetQuery(con, "SELECT * FROM q3.ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM q3.ans WHERE ROWID > (SELECT count(*) FROM q3.ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS q3.ans"))
detach_and_drop(con, 'q3.db', 'q3')
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

# q4: inner join of x with medium on id5. `question` and `fun` are labels
# passed to write.log().
# NOTE(review): every statement below appears twice, once against the
# schema-qualified table q4.ans (with attach_and_use/detach_and_drop calls) and
# once against plain `ans` — looks like both sides of a change were kept;
# confirm which set is intended before running.
question = "medium inner on factor" # q4
fun = "inner_join"

attach_and_use(con, 'q4.db', 'q4')
# Run 1: the timed section covers building the result table and fetching its
# row count (nr) and column count (nc).
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q4.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q4.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q4.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this span; its value is logged as mem_gb.
m = memory_usage()
# The checksum query (sums of v1 and v2) is timed separately from the join.
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q4.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result so that run 2 has to rebuild it.
invisible(dbExecute(con, "DROP TABLE IF EXISTS q4.ans"))
detach_and_drop(con, 'q4.db', 'q4')
attach_and_use(con, 'q4.db', 'q4')
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: the same statements again, logged with run=2L.
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q4.ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q4.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q4.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q4.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last rows of the result, then clean up.
print(dbGetQuery(con, "SELECT * FROM q4.ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM q4.ans WHERE ROWID > (SELECT count(*) FROM q4.ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS q4.ans"))
detach_and_drop(con, 'q4.db', 'q4')
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

# q5: inner join of x with big on id3. `question` and `fun` are labels
# passed to write.log().
# NOTE(review): every statement below appears twice, once against the
# schema-qualified table q5.ans (with attach_and_use/detach_and_drop calls) and
# once against plain `ans` — looks like both sides of a change were kept;
# confirm which set is intended before running.
question = "big inner on int" # q5
fun = "inner_join"

attach_and_use(con, 'q5.db', 'q5')
# Run 1: the timed section covers building the result table and fetching its
# row count (nr) and column count (nc).
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q5.ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q5.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q5.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this span; its value is logged as mem_gb.
m = memory_usage()
# The checksum query (sums of v1 and v2) is timed separately from the join.
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q5.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result so that run 2 has to rebuild it.
invisible(dbExecute(con, "DROP TABLE IF EXISTS q5.ans"))
detach_and_drop(con, 'q5.db', 'q5')
attach_and_use(con, 'q5.db', 'q5')
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: the same statements again, logged with run=2L.
t = system.time({
dbExecute(con, "CREATE TEMP TABLE q5.ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM q5.ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM q5.ans LIMIT 0"))))
dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)")
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM q5.ans"))[["elapsed"]]
chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last rows of the result, then clean up.
print(dbGetQuery(con, "SELECT * FROM q5.ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM q5.ans WHERE ROWID > (SELECT count(*) FROM q5.ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS q5.ans"))
detach_and_drop(con, 'q5.db', 'q5')
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

# All questions done: close the connection, passing shutdown=TRUE to dbDisconnect().
dbDisconnect(con, shutdown=TRUE)

Expand Down
Loading

0 comments on commit a1e3845

Please sign in to comment.