From e91025c88002d4cf53c672e1ded2d6f33137b96c Mon Sep 17 00:00:00 2001
From: giulietk <134016032+giulietk@users.noreply.github.com>
Date: Fri, 25 Oct 2024 11:51:12 -0700
Subject: [PATCH] Lab 9.html

https://github.com/USCbiostats/PM566/issues/85#issue-2614845245
---
 Lab 9.html | 3759 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 Lab 9.qmd  |  175 +++
 2 files changed, 3934 insertions(+)
 create mode 100644 Lab 9.html
 create mode 100644 Lab 9.qmd

diff --git a/Lab 9.html b/Lab 9.html
new file mode 100644
index 0000000..807dbf7
--- /dev/null
+++ b/Lab 9.html
@@ -0,0 +1,3759 @@
Lab 9

Author
Giuliet Kibler
Learning goals

In this lab, you are expected to learn/put in practice the following skills:

  • Evaluate whether a problem can be parallelized or not.
  • Practice with the parallel package.
Problem 1: Vectorization

The following functions can be written to be more efficient without using parallel. Write a faster version of each function and show that (1) the outputs are the same as the slow version, and (2) your version is faster.

  1. This function generates an n x k dataset with all its entries drawn from a Poisson distribution with mean lambda.
fun1 <- function(n = 100, k = 4, lambda = 4) {
  x <- NULL
  
  for (i in 1:n){
    # Grow the matrix one row at a time (slow: rbind reallocates on every iteration)
    x <- rbind(x, rpois(k, lambda))
  }
  
  return(x)
}

fun1alt <- function(n = 100, k = 4, lambda = 4) {
  # Draw all n * k values in one vectorized call and reshape into an n x k matrix
  x <- matrix(rpois(n * k, lambda), nrow = n, ncol = k)
  return(x)
}
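When a fully vectorized call is not available, preallocating the output and filling it in place is the usual middle ground between the two versions above. A minimal sketch of that variant (fun1pre is a hypothetical name, not part of the lab):

fun1pre <- function(n = 100, k = 4, lambda = 4) {
  x <- matrix(0L, nrow = n, ncol = k)  # allocate the full matrix once
  for (i in 1:n) {
    x[i, ] <- rpois(k, lambda)         # fill row i without copying the matrix
  }
  return(x)
}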

Show that fun1alt generates a matrix with the same dimensions as fun1 and that the values inside the two matrices follow similar distributions. Then check the speed of the two functions with the following code:
library(microbenchmark)
# Set parameters
n <- 100
k <- 4
lambda <- 4

# Generate output from both functions with the same seed
set.seed(42)
output_fun1 <- fun1(n, k, lambda)
set.seed(42)
output_alt <- fun1alt(n, k, lambda)
hist(output_fun1)

[plot: histogram of output_fun1]

hist(output_alt)

[plot: histogram of output_alt]
# The outputs are similar according to these histograms

dim(output_fun1)

[1] 100   4

dim(output_alt)

[1] 100   4

# Same dimensions
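Because the two functions consume the same seeded RNG stream, they should produce exactly the same n * k draws, only arranged differently (fun1 fills row by row via rbind, while matrix() fills column by column). A quick sanity check along those lines (a sketch; the exact match assumes both versions draw the same number of Poisson values from the same stream):

all(sort(output_fun1) == sort(output_alt))  # expect TRUE: same values, different layout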
# Benchmarking
microbenchmark::microbenchmark(
  fun1(),
  fun1alt()
)

Warning in microbenchmark::microbenchmark(fun1(), fun1alt()): less accurate
nanosecond times to avoid potential integer overflows
Unit: microseconds
      expr     min      lq      mean   median       uq      max neval
    fun1() 121.483 125.132 128.77854 127.3255 131.4460  149.855   100
 fun1alt()  13.325  13.735  24.27118  14.1860  14.7395 1014.381   100

# The mean (and median) runtime is much lower for the alternative function
  2. This function finds the maximum value of each column of a matrix (hint: check out the max.col() function).
library(matrixStats)
# Data Generating Process (10 x 1,000 matrix: 1e4 draws, 10 rows)
set.seed(1234)
x <- matrix(rnorm(1e4), nrow=10)

# Find each column's max value
fun2 <- function(x) {
  apply(x, 2, max)
}

fun2alt <- function(x) {
  colMaxs(x)  # C-level column maxima from matrixStats
}
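The hint points at base R's max.col(), which returns the index of the maximum within each row of a matrix. Applied to t(x), it gives the row position of each column's maximum in x, and matrix indexing then recovers the values. A base-R sketch of that alternative (fun2base is a hypothetical name; note max.col() breaks ties at random by default, which is harmless here since rnorm draws are continuous):

fun2base <- function(x) {
  idx <- max.col(t(x))               # row position of each column's maximum
  x[cbind(idx, seq_len(ncol(x)))]    # extract values via (row, column) pairs
}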

Show that both functions return the same output for a given input matrix, x. Then check the speed of the two functions.
# Run functions
output_fun2 <- fun2(x)
output_alt2 <- fun2alt(x)

identical_output <- all(output_fun2 == output_alt2)
print(paste("Outputs are identical:", identical_output))

[1] "Outputs are identical: TRUE"

# The functions produce the same output
microbenchmark::microbenchmark(
  fun2(x),
  fun2alt(x),
  times = 1000
)

Unit: microseconds
       expr     min      lq      mean   median       uq      max neval
    fun2(x) 514.591 527.137 563.06809 532.6925 540.9745 1901.908  1000
 fun2alt(x)  29.520  30.258  31.63384  30.6270  31.1190  797.163  1000

# The alternative function is much faster: colMaxs() runs its loop in compiled code, while apply() calls max() from R once per column

Problem 3: Parallelization

We will now turn our attention to the statistical concept of bootstrapping. Among its many uses, non-parametric bootstrapping allows us to obtain confidence intervals for parameter estimates without relying on parametric assumptions. Don’t worry if these concepts are unfamiliar; we only care about the computational methods in this lab, not the statistics.

The main assumption is that we can approximate the results of many repeated experiments by resampling observations from our original dataset, which reflects the population.
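In code, a single bootstrap replicate is just sampling row indices with replacement and recomputing the statistic on the resampled rows. A minimal sketch of one replicate (dat and stat stand in for the arguments used by my_boot below):

i <- sample.int(nrow(dat), nrow(dat), replace = TRUE)  # resample rows with replacement
stat(dat[i, , drop = FALSE])                           # statistic on the resampled data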
  1. This function implements a serial version of the bootstrap. Edit this function to parallelize the lapply loop, using whichever method you prefer. Rather than specifying the number of cores to use, use the number given by the ncpus argument, so that we can test it with different numbers of cores later.
library(parallel)
my_boot <- function(dat, stat, R, ncpus = 1L) {
  
  # Getting the random indices
  n <- nrow(dat)
  idx <- matrix(sample.int(n, n*R, TRUE), nrow=n, ncol=R)
  
  # Set up parallel processing; on.exit() stops the cluster even on error
  cl <- makeCluster(ncpus)
  on.exit(stopCluster(cl))
  
  # Make my_stat (defined below) available on the worker processes
  clusterExport(cl=cl, "my_stat")

  # Parallelize the lapply loop: each worker computes the statistic on one resample
  ans <- parLapply(cl, seq_len(R), function(i) {
    stat(dat[idx[,i], , drop=FALSE])
  })
  
  # Converting the list into a matrix (one row per bootstrap replicate)
  ans <- do.call(rbind, ans)

  return(ans)
}
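On Unix-alikes (macOS and Linux), forking is a common alternative to a PSOCK cluster: parallel::mclapply() gives each worker a copy-on-write view of the parent’s memory, so no makeCluster()/clusterExport() setup is needed. A sketch of that variant (my_boot_fork is a hypothetical name; on Windows mclapply() falls back to serial execution):

my_boot_fork <- function(dat, stat, R, ncpus = 1L) {
  n <- nrow(dat)
  idx <- matrix(sample.int(n, n * R, TRUE), nrow = n, ncol = R)
  # Forked workers inherit dat, idx, and stat from the parent process
  ans <- parallel::mclapply(seq_len(R), function(i) {
    stat(dat[idx[, i], , drop = FALSE])
  }, mc.cores = ncpus)
  do.call(rbind, ans)
}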
  2. Once you have a version of the my_boot() function that runs on multiple cores, check that it provides accurate results by comparing it to a parametric model:
# Bootstrap of an OLS
my_stat <- function(d) coef(lm(y ~ x, data=d))

# DATA SIM
set.seed(1)
n <- 500; R <- 1e4

x <- cbind(rnorm(n)); y <- x*5 + rnorm(n)

# Checking if we get something similar to lm
ans0 <- confint(lm(y~x))
ans1 <- my_boot(dat = data.frame(x, y), my_stat, R = R, ncpus = 2L)

# You should get something like this
t(apply(ans1, 2, quantile, c(.025,.975)))

                  2.5%      97.5%
(Intercept) -0.1386903 0.04856752
x            4.8685162 5.04351239

## Expected values from the lab template:
##                   2.5%      97.5%
## (Intercept) -0.1372435 0.05074397
## x            4.8680977 5.04539763

ans0

                 2.5 %     97.5 %
(Intercept) -0.1379033 0.04797344
x            4.8650100 5.04883353

## Expected values from the lab template:
##                  2.5 %     97.5 %
## (Intercept) -0.1379033 0.04797344
## x            4.8650100 5.04883353
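The bootstrap percentile intervals closely match the parametric confint() intervals, as we would hope. If you instead wanted normal-approximation intervals from the bootstrap replicates, a short sketch using the R x 2 matrix ans1 (se_boot and est are hypothetical names):

se_boot <- apply(ans1, 2, sd)                      # bootstrap SE of each coefficient
est <- coef(lm(y ~ x))                             # OLS point estimates
cbind(est - 1.96 * se_boot, est + 1.96 * se_boot)  # ~95% normal-approximation CI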
  3. Check whether your version actually goes faster when it’s run on multiple cores (since this might take a little while to run, we’ll use system.time and just run each version once, rather than microbenchmark, which would run each version 100 times by default):
system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 1L))

   user  system elapsed 
  0.028   0.005   1.294 

system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 2L))

   user  system elapsed 
  0.032   0.008   0.758 

# my_boot with 2 CPUs is faster according to the elapsed time
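To scale with the machine at hand rather than hard-coding ncpus, parallel::detectCores() reports the number of available cores; leaving one core free for the operating system is a common convention. A sketch (ncores is a hypothetical name):

ncores <- max(1L, parallel::detectCores() - 1L)  # keep one core free for the OS
system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = ncores))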
\ No newline at end of file

diff --git a/Lab 9.qmd b/Lab 9.qmd
new file mode 100644
index 0000000..6e9472b
--- /dev/null
+++ b/Lab 9.qmd
@@ -0,0 +1,175 @@
+---
+title: "Lab 9"
+author: "Giuliet Kibler"
+format:
+  html:
+    embed-resources: true
+editor: visual
+---
+
+# Learning goals
+
+In this lab, you are expected to learn/put in practice the following skills:
+
+- Evaluate whether a problem can be parallelized or not.
+- Practice with the parallel package.
+
+## Problem 1: Vectorization
+
+The following functions can be written to be more efficient without using parallel. Write a faster version of each function and show that (1) the outputs are the same as the slow version, and (2) your version is faster.
+
+1. This function generates an `n x k` dataset with all its entries drawn from a Poisson distribution with mean `lambda`.
+
+```{r p2-fun1}
+fun1 <- function(n = 100, k = 4, lambda = 4) {
+  x <- NULL
+  
+  for (i in 1:n){
+    x <- rbind(x, rpois(k, lambda))
+  }
+  
+  return(x)
+}
+
+fun1alt <- function(n = 100, k = 4, lambda = 4) {
+  x <- matrix(rpois(n * k, lambda), nrow = n, ncol = k)
+  return(x)
+}
+```
+
+Show that `fun1alt` generates a matrix with the same dimensions as `fun1` and that the values inside the two matrices follow similar distributions. Then check the speed of the two functions with the following code:
+
+```{r check}
+library(microbenchmark)
+# Set parameters
+n <- 100
+k <- 4
+lambda <- 4
+
+# Generate output from both functions with the same seed
+set.seed(42)
+output_fun1 <- fun1(n, k, lambda)
+set.seed(42)
+output_alt <- fun1alt(n, k, lambda)
+
+hist(output_fun1)
+hist(output_alt)
+# The outputs are similar according to these histograms
+
+dim(output_fun1)
+dim(output_alt)
+# Same dimensions
+
+# Benchmarking
+microbenchmark::microbenchmark(
+  fun1(),
+  fun1alt()
+)
+# The mean (and median) runtime is much lower for the alternative function
+```
+
+2. This function finds the maximum value of each column of a matrix (hint: check out the `max.col()` function).
+
+```{r p2-fun2}
+library(matrixStats)
+# Data Generating Process (10 x 1,000 matrix)
+set.seed(1234)
+x <- matrix(rnorm(1e4), nrow=10)
+
+# Find each column's max value
+fun2 <- function(x) {
+  apply(x, 2, max)
+}
+
+fun2alt <- function(x) {
+  colMaxs(x)
+}
+```
+
+Show that both functions return the same output for a given input matrix, `x`. Then check the speed of the two functions.
+
+```{r}
+# Run functions
+output_fun2 <- fun2(x)
+output_alt2 <- fun2alt(x)
+
+identical_output <- all(output_fun2 == output_alt2)
+print(paste("Outputs are identical:", identical_output))
+# The functions produce the same output
+
+microbenchmark::microbenchmark(
+  fun2(x),
+  fun2alt(x),
+  times = 1000
+)
+# The alternative function is much faster
+```
+
+## Problem 3: Parallelization
+
+We will now turn our attention to the statistical concept of [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)). Among its many uses, non-parametric bootstrapping allows us to obtain confidence intervals for parameter estimates without relying on parametric assumptions. Don't worry if these concepts are unfamiliar; we only care about the computational methods in this lab, not the statistics.
+
+The main assumption is that we can approximate the results of many repeated experiments by resampling observations from our original dataset, which reflects the population.
+
+1. This function implements a serial version of the bootstrap. Edit this function to parallelize the `lapply` loop, using whichever method you prefer. Rather than specifying the number of cores to use, use the number given by the `ncpus` argument, so that we can test it with different numbers of cores later.
+
+```{r p3-boot-fun}
+library(parallel)
+my_boot <- function(dat, stat, R, ncpus = 1L) {
+  
+  # Getting the random indices
+  n <- nrow(dat)
+  idx <- matrix(sample.int(n, n*R, TRUE), nrow=n, ncol=R)
+  
+  # Set up parallel processing
+  cl <- makeCluster(ncpus)
+  on.exit(stopCluster(cl))
+  
+  clusterExport(cl=cl, "my_stat")
+
+  # Parallelize the lapply loop
+  ans <- parLapply(cl, seq_len(R), function(i) {
+    stat(dat[idx[,i], , drop=FALSE])
+  })
+  
+  # Converting the list into a matrix
+  ans <- do.call(rbind, ans)
+
+  return(ans)
+}
+```
+
+2. Once you have a version of the `my_boot()` function that runs on multiple cores, check that it provides accurate results by comparing it to a parametric model:
+
+```{r p3-test-boot}
+# Bootstrap of an OLS
+my_stat <- function(d) coef(lm(y ~ x, data=d))
+
+# DATA SIM
+set.seed(1)
+n <- 500; R <- 1e4
+
+x <- cbind(rnorm(n)); y <- x*5 + rnorm(n)
+
+# Checking if we get something similar to lm
+ans0 <- confint(lm(y~x))
+ans1 <- my_boot(dat = data.frame(x, y), my_stat, R = R, ncpus = 2L)
+
+# You should get something like this
+t(apply(ans1, 2, quantile, c(.025,.975)))
+##                   2.5%      97.5%
+## (Intercept) -0.1372435 0.05074397
+## x            4.8680977 5.04539763
+ans0
+##                  2.5 %     97.5 %
+## (Intercept) -0.1379033 0.04797344
+## x            4.8650100 5.04883353
+```
+
+3. Check whether your version actually goes faster when it's run on multiple cores (since this might take a little while to run, we'll use `system.time` and just run each version once, rather than `microbenchmark`, which would run each version 100 times by default):
+
+```{r benchmark-problem3}
+system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 1L))
+system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 2L))
+# my_boot with 2 CPUs is faster according to the elapsed time
+```