chore: Restore #131, #137, #138 and #141
This reverts commit bff2007.
RasmusSkytte committed Sep 26, 2024
1 parent 8d8b404 commit 891cbd9
Showing 19 changed files with 1,220 additions and 184 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -14,3 +14,4 @@ man-roxygen/*
^Meta$
^README.Rmd$
^revdep$
^data-raw$
225 changes: 225 additions & 0 deletions .github/workflows/benchmark.yaml
@@ -0,0 +1,225 @@
on:
  workflow_dispatch


name: "⏱️ Benchmark"
jobs:
  benchmark:
    runs-on: ubuntu-latest

    services:
      postgres:
        image: postgres:latest
        env:
          POSTGRES_DB: test
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        ports:
          - 5432:5432
        options: --health-cmd "pg_isready -U postgres" --health-interval 10s --health-timeout 5s --health-retries 5

    env:
      PGHOST: localhost
      PGPORT: 5432
      PGDATABASE: test
      PGUSER: postgres
      PGPASSWORD: postgres

    steps:
      - name: Install a SQL Server suite of tools
        uses: potatoqualitee/mssqlsuite@v1.7
        with:
          install: sqlengine, sqlpackage, sqlclient
          show-log: true

      - name: Configure SQL server
        run: |
          set -o xtrace
          sqlcmd -V 10 -S localhost -U SA -P dbatools.I0 -Q "ALTER LOGIN SA WITH DEFAULT_DATABASE = master;"

      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Configure git
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git switch ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::devtools


      - name: Delete previous benchmark files
        if: always()
        run: rm -rf inst/extdata/benchmark-*.rds


      - name: Get SQLite version
        run: |
          version=$(Rscript -e "cat(DBI::dbGetQuery(DBI::dbConnect(RSQLite::SQLite()), 'SELECT sqlite_version();')[[1]])")
          echo "SQLITE_VERSION=SQLite v$version" >> $GITHUB_ENV

      - name: Get DuckDB version
        run: |
          version=$(Rscript -e "cat(DBI::dbGetQuery(DBI::dbConnect(duckdb::duckdb()), 'SELECT version();')[[1]])")
          echo "DUCKDB_VERSION=DuckDB $version" >> $GITHUB_ENV

      - name: Get PostgreSQL version
        run: |
          version=$(psql --version | awk '{print $3}')
          echo "POSTGRES_VERSION=PostgreSQL v$version" >> $GITHUB_ENV

      - name: Get SQL Server version
        run: |
          version=$(sqlcmd -S localhost -U SA -P dbatools.I0 -Q "SET NOCOUNT ON; SELECT SERVERPROPERTY('productversion') AS version" -h -1 -W -b)
          echo "SQL_SERVER_VERSION=SQL Server v$version" >> $GITHUB_ENV

      - name: Install libraries to benchmark
        if: always()
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}



      - name: Run benchmark (${{ env.SQLITE_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.SQLITE_VERSION }}
          BACKEND_DRV: RSQLite::SQLite
          BACKEND_ARGS: 'list(dbname = file.path(tempdir(), "SQLite.SQLite"))'
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.DUCKDB_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.DUCKDB_VERSION }}
          BACKEND_DRV: duckdb::duckdb
          BACKEND_ARGS: 'list(dbdir = file.path(tempdir(), "DuckDB.duckdb"))'
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.POSTGRES_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.POSTGRES_VERSION }}
          BACKEND_DRV: RPostgres::Postgres
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.SQL_SERVER_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.SQL_SERVER_VERSION }}
          BACKEND_DRV: odbc::odbc
          CONN_ARGS_JSON: >
            {
              "${{ env.SQL_SERVER_VERSION }}": {
                "driver": "ODBC Driver 17 for SQL Server",
                "server": "localhost",
                "database": "master",
                "UID": "SA",
                "PWD": "dbatools.I0"
              }
            }
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}



      - name: Display structure of benchmark files
        if: always()
        run: ls -R data

      - name: Combine benchmark results
        if: always()
        run: |
          benchmark_files <- list.files(
            "data",
            pattern = "^benchmark-",
            full.names = TRUE,
            recursive = TRUE
          )

          benchmarks <- benchmark_files |>
            purrr::map(readRDS) |>
            purrr::map(tibble::as_tibble) |>
            purrr::reduce(rbind)

          benchmarks <- benchmarks |>
            dplyr::mutate(
              "version" = factor(
                .data$version,
                levels = c("CRAN", "main", setdiff(unique(benchmarks$version), c("CRAN", "main")))
              )
            )

          # Save the combined benchmark results and delete the individual files
          dir.create(file.path("inst", "extdata"), recursive = TRUE, showWarnings = FALSE)
          saveRDS(benchmarks, file.path("inst", "extdata", "benchmarks.rds"))
          file.remove(benchmark_files)

          # Add a note for slow backends
          slow_backends <- benchmarks |>
            dplyr::distinct(.data$database, .data$n) |>
            dplyr::filter(.data$n < max(.data$n)) |>
            dplyr::pull("database")

          benchmarks <- benchmarks |>
            dplyr::mutate("database" = paste0(database, ifelse(database %in% slow_backends, "*", "")))

          # Mean and standard deviation (see ggplot2::mean_se())
          mean_sd <- function(x) {
            mu <- mean(x)
            sd <- sd(x)
            data.frame(y = mu, ymin = mu - sd, ymax = mu + sd)
          }

          g <- ggplot2::ggplot(
            benchmarks,
            ggplot2::aes(x = version, y = time / 1e9)
          ) +
            ggplot2::stat_summary(fun.data = mean_sd, geom = "pointrange", size = 0.5, linewidth = 1) +
            ggplot2::facet_grid(rows = ggplot2::vars(benchmark_function), cols = ggplot2::vars(database)) +
            ggplot2::labs(x = "Codebase version", y = "Time (s)")

          if (length(slow_backends) > 0) {
            g <- g + ggplot2::labs(caption = "* IMPORTANT: Benchmark data halved for this backend!")
          }

          ggplot2::ggsave("benchmarks.pdf")
        shell: Rscript {0}

      - name: Upload benchmark summary
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-summary
          path: benchmarks.pdf

      - name: Commit and push changes
        run: |
          git remote set-url origin https://$GITHUB_ACTOR:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY.git
          git stash --include-untracked
          git pull --rebase origin ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
          git stash list | grep stash@{0} && git stash pop || echo "No stash to pop"
          git add inst/extdata/\*
          git commit -m "chore: Update benchmark data" || echo "No changes to commit"
          git push origin ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
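Note: the workflow drives `data-raw/benchmark.R` entirely through environment variables (`BACKEND`, `BACKEND_DRV`, `BACKEND_ARGS` and `CONN_ARGS_JSON`). That script is not part of this diff, so the following is only a sketch of how such variables could be consumed to open the right connection; the defaults and the parsing logic are assumptions, not the package's actual code.

    # Hypothetical sketch (not from this commit): translate the workflow's
    # environment variables into a DBI connection.
    backend <- Sys.getenv("BACKEND", unset = "SQLite")

    # BACKEND_DRV names a driver constructor, e.g. "RSQLite::SQLite"
    drv <- eval(parse(text = Sys.getenv("BACKEND_DRV", unset = "RSQLite::SQLite")))()

    # BACKEND_ARGS holds R code building a list of extra driver arguments
    backend_args <- Sys.getenv("BACKEND_ARGS")
    args <- if (nzchar(backend_args)) eval(parse(text = backend_args)) else list()

    # CONN_ARGS_JSON maps a backend label to DBI::dbConnect() arguments
    conn_args <- jsonlite::fromJSON(Sys.getenv("CONN_ARGS_JSON", unset = "{}"))[[backend]]

    conn <- do.call(DBI::dbConnect, c(list(drv), args, conn_args))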
6 changes: 6 additions & 0 deletions DESCRIPTION
@@ -21,6 +21,8 @@ License: GPL-3
Encoding: UTF-8
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE, r6 = TRUE)
Depends:
R (>= 3.5.0)
Imports:
checkmate,
DBI,
@@ -42,10 +44,14 @@ Suggests:
callr,
conflicted,
duckdb,
ggplot2,
here,
jsonlite,
knitr,
lintr,
microbenchmark,
odbc,
pak,
rmarkdown,
roxygen2,
pkgdown,
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -1,6 +1,9 @@
# Generated by roxygen2: do not edit by hand

S3method(as.character,Id)
S3method(create_index,DBIConnection)
S3method(create_index,PqConnection)
S3method(create_index,SQLiteConnection)
S3method(db_timestamp,"NULL")
S3method(db_timestamp,SQLiteConnection)
S3method(db_timestamp,default)
@@ -54,6 +57,7 @@ S3method(tidyr::unite,tbl_dbi)
export(Logger)
export(LoggerNull)
export(close_connection)
export(create_index)
export(create_logs_if_missing)
export(create_table)
export(db_timestamp)
14 changes: 14 additions & 0 deletions NEWS.md
@@ -1,5 +1,18 @@
# SCDB (development version)

## New features

* Added function `create_index()` to allow easy creation of an index on a table (#137).

## Improvements and Fixes

* `update_snapshot()` has been optimized and now runs faster on all the supported backends (#137).

## Documentation

* A vignette with benchmarks of `update_snapshot()` across the supported backends has been added (#138).


# SCDB 0.4.1

## Improvements and Fixes
@@ -16,6 +29,7 @@

* Improved tests for `get_tables()` (#145).


# SCDB 0.4.0

## BREAKING CHANGES:
8 changes: 8 additions & 0 deletions R/connection.R
@@ -155,6 +155,14 @@ get_connection.OdbcDriver <- function(
  checkmate::assert_choice(timezone_out, OlsonNames(), null.ok = TRUE, add = coll)
  checkmate::reportAssertions(coll)

  # Recommend batch processing for ODBC connections
  if (is.null(getOption("odbc.batch_rows"))) {
    message(
      "Transfer of large data sets may be slow. ",
      "Consider using options(\"odbc.batch_rows\" = 1000) to speed up transfer."
    )
  }

  # Check if connection can be established given these settings
  status <- do.call(DBI::dbCanConnect, args = args)
  if (!status) stop(attr(status, "reason"))
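The new hint only fires when `odbc.batch_rows` is unset, so setting the option before connecting both silences it and enables batched writes. A minimal sketch; the DSN name is a placeholder, and the exact arguments `get_connection()` forwards to the ODBC driver depend on your setup:

    # Opt in to batched transfers before opening the connection
    # (1000 rows per batch is the value suggested by the message, not a tuned one)
    options(odbc.batch_rows = 1000)

    # "my_dsn" is hypothetical; substitute your own DSN or connection arguments
    conn <- get_connection(drv = odbc::odbc(), dsn = "my_dsn")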
76 changes: 76 additions & 0 deletions R/create_index.R
@@ -0,0 +1,76 @@
#' Create a unique index on a table
#' @param conn (`DBIConnection`)\cr
#'   A connection to a database.
#' @template db_table
#' @param columns (`character()`)\cr
#'   The columns that should form the unique index.
#' @return
#'   NULL (called for side effects)
#' @examplesIf requireNamespace("RSQLite", quietly = TRUE)
#' conn <- get_connection()
#'
#' mt <- dplyr::copy_to(conn, dplyr::distinct(mtcars, .data$mpg, .data$cyl), name = "mtcars")
#' create_index(conn, mt, c("mpg", "cyl"))
#'
#' close_connection(conn)
#' @export
create_index <- function(conn, db_table, columns) {
  checkmate::assert_class(conn, "DBIConnection")
  assert_id_like(db_table)
  checkmate::assert_character(columns)
  checkmate::assert_true(table_exists(conn, db_table))

  UseMethod("create_index")
}

#' @export
create_index.PqConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  DBI::dbExecute(
    conn,
    glue::glue(
      "CREATE UNIQUE INDEX ON {as.character(db_table, explicit = TRUE)} ({toString(columns)})"
    )
  )
}

#' @export
create_index.SQLiteConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  schema <- purrr::pluck(db_table, "name", "schema")
  table <- purrr::pluck(db_table, "name", "table")

  if (schema %in% c("main", "temp")) schema <- NULL

  # Generate index name
  index <- paste(
    c(
      shQuote(schema),
      shQuote(paste0(c(table, "scdb_index", columns), collapse = "_"))
    ),
    collapse = "."
  )

  DBI::dbExecute(
    conn,
    glue::glue(
      "CREATE UNIQUE INDEX {index} ON {shQuote(table)} ({toString(columns)})"
    )
  )
}

#' @export
create_index.DBIConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  index <- glue::glue("{db_table}_scdb_index_{paste(columns, collapse = '_')}") |>
    stringr::str_replace_all(stringr::fixed("."), "_")

  query <- glue::glue(
    "CREATE UNIQUE INDEX {index} ON {as.character(db_table, explicit = TRUE)} ({toString(columns)})"
  )

  DBI::dbExecute(conn, query)
}
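The three methods differ mainly in how the index is named and quoted. For `columns = c("mpg", "cyl")` on a table `mtcars` in schema `main`, the generated statements take roughly these shapes (illustrative only; the exact quoting depends on the package's `as.character()` method for `Id` objects):

    # PqConnection:     CREATE UNIQUE INDEX ON "main"."mtcars" (mpg, cyl)
    # SQLiteConnection: CREATE UNIQUE INDEX 'mtcars_scdb_index_mpg_cyl' ON 'mtcars' (mpg, cyl)
    # DBIConnection:    CREATE UNIQUE INDEX main_mtcars_scdb_index_mpg_cyl ON "main"."mtcars" (mpg, cyl)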
2 changes: 2 additions & 0 deletions R/create_table.R
@@ -86,5 +86,7 @@ create_table <- function(.data, conn = NULL, db_table, ...) {
    ...
  )

  create_index(conn, db_table_id, columns = c("checksum", "from_ts"))

  return(invisible(dplyr::tbl(conn, db_table_id)))
}
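With this change, every table created through `create_table()` is backed by a unique index on `(checksum, from_ts)`, presumably part of the `update_snapshot()` speed-up tracked in #137. A minimal usage sketch (SQLite via the default `get_connection()`, as in the package examples):

    conn <- get_connection()
    snapshot <- create_table(mtcars, conn, db_table = "mtcars")  # index is created implicitly
    close_connection(conn)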