chore: Restore #131, #137, #138 and #141
This reverts commit bff2007.
RasmusSkytte committed Sep 26, 2024
1 parent 8d8b404 commit 891cbd9
Showing 19 changed files with 1,220 additions and 184 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -14,3 +14,4 @@ man-roxygen/*
^Meta$
^README.Rmd$
^revdep$
^data-raw$
225 changes: 225 additions & 0 deletions .github/workflows/benchmark.yaml
@@ -0,0 +1,225 @@
on:
  workflow_dispatch


name: "⏱️ Benchmark"
jobs:
  benchmark:
    runs-on: ubuntu-latest

    services:
      postgres:
        image: postgres:latest
        env:
          POSTGRES_DB: test
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        ports:
          - 5432:5432
        options: --health-cmd "pg_isready -U postgres" --health-interval 10s --health-timeout 5s --health-retries 5

    env:
      PGHOST: localhost
      PGPORT: 5432
      PGDATABASE: test
      PGUSER: postgres
      PGPASSWORD: postgres

    steps:
      - name: Install a SQL Server suite of tools
        uses: potatoqualitee/mssqlsuite@v1.7
        with:
          install: sqlengine, sqlpackage, sqlclient
          show-log: true

      - name: Configure SQL server
        run: |
          set -o xtrace
          sqlcmd -V 10 -S localhost -U SA -P dbatools.I0 -Q "ALTER LOGIN SA WITH DEFAULT_DATABASE = master;"

      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Configure git
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git switch ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::devtools


      - name: Delete previous benchmark files
        if: always()
        run: rm -rf inst/extdata/benchmark-*.rds


      - name: Get SQLite version
        run: |
          version=$(Rscript -e "cat(DBI::dbGetQuery(DBI::dbConnect(RSQLite::SQLite()), 'SELECT sqlite_version();')[[1]])")
          echo "SQLITE_VERSION=SQLite v$version" >> $GITHUB_ENV

      - name: Get DuckDB version
        run: |
          version=$(Rscript -e "cat(DBI::dbGetQuery(DBI::dbConnect(duckdb::duckdb()), 'SELECT version();')[[1]])")
          echo "DUCKDB_VERSION=DuckDB $version" >> $GITHUB_ENV

      - name: Get PostgreSQL version
        run: |
          version=$(psql --version | awk '{print $3}')
          echo "POSTGRES_VERSION=PostgreSQL v$version" >> $GITHUB_ENV

      - name: Get SQL Server version
        run: |
          version=$(sqlcmd -S localhost -U SA -P dbatools.I0 -Q "SET NOCOUNT ON; SELECT SERVERPROPERTY('productversion') AS version" -h -1 -W -b)
          echo "SQL_SERVER_VERSION=SQL Server v$version" >> $GITHUB_ENV

      - name: Install libraries to benchmark
        if: always()
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}



      - name: Run benchmark (${{ env.SQLITE_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.SQLITE_VERSION }}
          BACKEND_DRV: RSQLite::SQLite
          BACKEND_ARGS: 'list(dbname = file.path(tempdir(), "SQLite.SQLite"))'
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.DUCKDB_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.DUCKDB_VERSION }}
          BACKEND_DRV: duckdb::duckdb
          BACKEND_ARGS: 'list(dbdir = file.path(tempdir(), "DuckDB.duckdb"))'
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.POSTGRES_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.POSTGRES_VERSION }}
          BACKEND_DRV: RPostgres::Postgres
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}

      - name: Run benchmark (${{ env.SQL_SERVER_VERSION }})
        if: always()
        env:
          BACKEND: ${{ env.SQL_SERVER_VERSION }}
          BACKEND_DRV: odbc::odbc
          CONN_ARGS_JSON: >
            {
              "${{ env.SQL_SERVER_VERSION }}": {
                "driver": "ODBC Driver 17 for SQL Server",
                "server": "localhost",
                "database": "master",
                "UID": "SA",
                "PWD": "dbatools.I0"
              }
            }
        run: source("./data-raw/benchmark.R", echo=TRUE)
        shell: Rscript {0}



      - name: Display structure of benchmark files
        if: always()
        run: ls -R data

      - name: Combine benchmark results
        if: always()
        run: |
          benchmark_files <- list.files(
            "data",
            pattern = "^benchmark-",
            full.names = TRUE,
            recursive = TRUE
          )

          benchmarks <- benchmark_files |>
            purrr::map(readRDS) |>
            purrr::map(tibble::as_tibble) |>
            purrr::reduce(rbind)

          benchmarks <- benchmarks |>
            dplyr::mutate(
              "version" = factor(
                .data$version,
                levels = c("CRAN", "main", setdiff(unique(benchmarks$version), c("CRAN", "main")))
              )
            )

          # Save the combined benchmark results and delete the individual files
          dir.create(file.path("inst", "extdata"), recursive = TRUE, showWarnings = FALSE)
          saveRDS(benchmarks, file.path("inst", "extdata", "benchmarks.rds"))
          file.remove(benchmark_files)

          # Add a note for slow backends
          slow_backends <- benchmarks |>
            dplyr::distinct(.data$database, .data$n) |>
            dplyr::filter(.data$n < max(.data$n)) |>
            dplyr::pull("database")

          benchmarks <- benchmarks |>
            dplyr::mutate("database" = paste0(database, ifelse(database %in% slow_backends, "*", "")))

          # Mean and standard deviation (see ggplot2::mean_se())
          mean_sd <- function(x) {
            mu <- mean(x)
            sd <- sd(x)
            data.frame(y = mu, ymin = mu - sd, ymax = mu + sd)
          }

          g <- ggplot2::ggplot(
            benchmarks,
            ggplot2::aes(x = version, y = time / 1e9)
          ) +
            ggplot2::stat_summary(fun.data = mean_sd, geom = "pointrange", size = 0.5, linewidth = 1) +
            ggplot2::facet_grid(rows = ggplot2::vars(benchmark_function), cols = ggplot2::vars(database)) +
            ggplot2::labs(x = "Codebase version", y = "Time (s)")

          if (length(slow_backends) > 0) {
            g <- g + ggplot2::labs(caption = "* IMPORTANT: Benchmark data halved for this backend!")
          }

          ggplot2::ggsave("benchmarks.pdf")
        shell: Rscript {0}

      - name: Upload benchmark summary
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-summary
          path: benchmarks.pdf

      - name: Commit and push changes
        run: |
          git remote set-url origin https://$GITHUB_ACTOR:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY.git
          git stash --include-untracked
          git pull --rebase origin ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
          git stash list | grep stash@{0} && git stash pop || echo "No stash to pop"
          git add inst/extdata/\*
          git commit -m "chore: Update benchmark data" || echo "No changes to commit"
          git push origin ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
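Note: the workflow drives `data-raw/benchmark.R` entirely through environment variables (`BACKEND`, `BACKEND_DRV`, `BACKEND_ARGS` and `CONN_ARGS_JSON`). That script is not part of this diff, so the following is only a sketch of how such variables could be consumed to open the right connection; the defaults and the parsing logic are assumptions, not the package's actual code.

    # Hypothetical sketch (not from this commit): translate the workflow's
    # environment variables into a DBI connection.
    backend <- Sys.getenv("BACKEND", unset = "SQLite")

    # BACKEND_DRV names a driver constructor, e.g. "RSQLite::SQLite"
    drv <- eval(parse(text = Sys.getenv("BACKEND_DRV", unset = "RSQLite::SQLite")))()

    # BACKEND_ARGS holds R code building a list of extra driver arguments
    backend_args <- Sys.getenv("BACKEND_ARGS")
    args <- if (nzchar(backend_args)) eval(parse(text = backend_args)) else list()

    # CONN_ARGS_JSON maps a backend label to DBI::dbConnect() arguments
    conn_args <- jsonlite::fromJSON(Sys.getenv("CONN_ARGS_JSON", unset = "{}"))[[backend]]

    conn <- do.call(DBI::dbConnect, c(list(drv), args, conn_args))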
6 changes: 6 additions & 0 deletions DESCRIPTION
@@ -21,6 +21,8 @@ License: GPL-3
Encoding: UTF-8
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE, r6 = TRUE)
Depends:
R (>= 3.5.0)
Imports:
checkmate,
DBI,
@@ -42,10 +44,14 @@ Suggests:
callr,
conflicted,
duckdb,
ggplot2,
here,
jsonlite,
knitr,
lintr,
microbenchmark,
odbc,
pak,
rmarkdown,
roxygen2,
pkgdown,
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -1,6 +1,9 @@
# Generated by roxygen2: do not edit by hand

S3method(as.character,Id)
S3method(create_index,DBIConnection)
S3method(create_index,PqConnection)
S3method(create_index,SQLiteConnection)
S3method(db_timestamp,"NULL")
S3method(db_timestamp,SQLiteConnection)
S3method(db_timestamp,default)
@@ -54,6 +57,7 @@ S3method(tidyr::unite,tbl_dbi)
export(Logger)
export(LoggerNull)
export(close_connection)
export(create_index)
export(create_logs_if_missing)
export(create_table)
export(db_timestamp)
14 changes: 14 additions & 0 deletions NEWS.md
@@ -1,5 +1,18 @@
# SCDB (development version)

## New features

* Added function `create_index()` to allow easy creation of an index on a table (#137).

## Improvements and Fixes

* `update_snapshot()` has been optimized and now runs faster on all the supported backends (#137).

## Documentation

* A vignette with benchmarks of `update_snapshot()` across the supported backends has been added (#138).


# SCDB 0.4.1

## Improvements and Fixes
@@ -16,6 +29,7 @@

* Improved tests for `get_tables()` (#145).


# SCDB 0.4.0

## BREAKING CHANGES:
8 changes: 8 additions & 0 deletions R/connection.R
@@ -155,6 +155,14 @@ get_connection.OdbcDriver <- function(
  checkmate::assert_choice(timezone_out, OlsonNames(), null.ok = TRUE, add = coll)
  checkmate::reportAssertions(coll)

  # Recommend batch processing for ODBC connections
  if (is.null(getOption("odbc.batch_rows"))) {
    message(
      "Transfer of large data sets may be slow. ",
      "Consider using options(\"odbc.batch_rows\" = 1000) to speed up transfer."
    )
  }

  # Check if connection can be established given these settings
  status <- do.call(DBI::dbCanConnect, args = args)
  if (!status) stop(attr(status, "reason"))
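The new hint only fires when `odbc.batch_rows` is unset, so setting the option before connecting both silences it and enables batched writes. A minimal sketch; the DSN name is a placeholder, and the exact arguments `get_connection()` forwards to the ODBC driver depend on your setup:

    # Opt in to batched transfers before opening the connection
    # (1000 rows per batch is the value suggested by the message, not a tuned one)
    options(odbc.batch_rows = 1000)

    # "my_dsn" is hypothetical; substitute your own DSN or connection arguments
    conn <- get_connection(drv = odbc::odbc(), dsn = "my_dsn")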
76 changes: 76 additions & 0 deletions R/create_index.R
@@ -0,0 +1,76 @@
#' Create a unique index on a table
#' @param conn (`DBIConnection`)\cr
#'   A connection to a database.
#' @template db_table
#' @param columns (`character()`)\cr
#'   The columns that should form the unique index.
#' @return
#'   NULL (called for side effects)
#' @examplesIf requireNamespace("RSQLite", quietly = TRUE)
#' conn <- get_connection()
#'
#' mt <- dplyr::copy_to(conn, dplyr::distinct(mtcars, .data$mpg, .data$cyl), name = "mtcars")
#' create_index(conn, mt, c("mpg", "cyl"))
#'
#' close_connection(conn)
#' @export
create_index <- function(conn, db_table, columns) {
  checkmate::assert_class(conn, "DBIConnection")
  assert_id_like(db_table)
  checkmate::assert_character(columns)
  checkmate::assert_true(table_exists(conn, db_table))

  UseMethod("create_index")
}

#' @export
create_index.PqConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  DBI::dbExecute(
    conn,
    glue::glue(
      "CREATE UNIQUE INDEX ON {as.character(db_table, explicit = TRUE)} ({toString(columns)})"
    )
  )
}

#' @export
create_index.SQLiteConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  schema <- purrr::pluck(db_table, "name", "schema")
  table <- purrr::pluck(db_table, "name", "table")

  if (schema %in% c("main", "temp")) schema <- NULL

  # Generate index name
  index <- paste(
    c(
      shQuote(schema),
      shQuote(paste0(c(table, "scdb_index", columns), collapse = "_"))
    ),
    collapse = "."
  )

  DBI::dbExecute(
    conn,
    glue::glue(
      "CREATE UNIQUE INDEX {index} ON {shQuote(table)} ({toString(columns)})"
    )
  )
}

#' @export
create_index.DBIConnection <- function(conn, db_table, columns) {
  db_table <- id(db_table, conn)

  index <- glue::glue("{db_table}_scdb_index_{paste(columns, collapse = '_')}") |>
    stringr::str_replace_all(stringr::fixed("."), "_")

  query <- glue::glue(
    "CREATE UNIQUE INDEX {index} ON {as.character(db_table, explicit = TRUE)} ({toString(columns)})"
  )

  DBI::dbExecute(conn, query)
}
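The three methods differ mainly in how the index is named and quoted. For `columns = c("mpg", "cyl")` on a table `mtcars` in schema `main`, the generated statements take roughly these shapes (illustrative only; the exact quoting depends on the package's `as.character()` method for `Id` objects):

    # PqConnection:     CREATE UNIQUE INDEX ON "main"."mtcars" (mpg, cyl)
    # SQLiteConnection: CREATE UNIQUE INDEX 'mtcars_scdb_index_mpg_cyl' ON 'mtcars' (mpg, cyl)
    # DBIConnection:    CREATE UNIQUE INDEX main_mtcars_scdb_index_mpg_cyl ON "main"."mtcars" (mpg, cyl)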
2 changes: 2 additions & 0 deletions R/create_table.R
@@ -86,5 +86,7 @@ create_table <- function(.data, conn = NULL, db_table, ...) {
    ...
  )

  create_index(conn, db_table_id, columns = c("checksum", "from_ts"))

  return(invisible(dplyr::tbl(conn, db_table_id)))
}
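With this change, every table created through `create_table()` is backed by a unique index on `(checksum, from_ts)`, presumably part of the `update_snapshot()` speed-up tracked in #137. A minimal usage sketch (SQLite via the default `get_connection()`, as in the package examples):

    conn <- get_connection()
    snapshot <- create_table(mtcars, conn, db_table = "mtcars")  # index is created implicitly
    close_connection(conn)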