<- function(n = 100, k = 4, lambda = 4) {
+ fun1 <- NULL
+ x
+ for (i in 1:n){
+ <- rbind(x, rpois(k, lambda))
+ x
+ }
+ return(x)
+
+ }
+<- function(n = 100, k = 4, lambda = 4) {
+ fun1alt <- matrix(rpois(n * k, lambda), nrow = n, ncol = k)
+ x return(x)
+ }
Lab 9
+Learning goals
+In this lab, you are expected to learn/put in practice the following skills:
+-
+
- Evaluate whether a problem can be parallelized or not. +
- Practice with the parallel package. +
Problem 1: Vectorization
+The following functions can be written to be more efficient without using parallel. Write a faster version of each function and show that (1) the outputs are the same as the slow version, and (2) your version is faster.
+-
+
- This function generates an
n x k
dataset with all its entries drawn from a Poission distribution with meanlambda
.
+
Show that fun1alt
generates a matrix with the same dimensions as fun1
and that the values inside the two matrices follow similar distributions. Then check the speed of the two functions with the following code:
library(microbenchmark)
+# Set parameters
+<- 100
+ n <- 4
+ k <- 4
+ lambda
+# Test both functions for equality
+set.seed(42)
+<- fun1(n, k, lambda)
+ output_fun1 set.seed(42)
+<- fun1alt(n, k, lambda)
+ output_alt
+hist(output_fun1)
hist(output_alt)
# The outputs are similar according to this histograms
+
+dim(output_fun1)
[1] 100 4
+dim(output_alt)
[1] 100 4
+# Same dimensions
+
+# Benchmarking
+::microbenchmark(
+ microbenchmarkfun1(),
+ fun1alt()
+ )
Warning in microbenchmark::microbenchmark(fun1(), fun1alt()): less accurate
+nanosecond times to avoid potential integer overflows
+Unit: microseconds
+ expr min lq mean median uq max neval
+ fun1() 121.483 125.132 128.77854 127.3255 131.4460 149.855 100
+ fun1alt() 13.325 13.735 24.27118 14.1860 14.7395 1014.381 100
+# The mean is much faster for the alternate function
-
+
- This function finds the maximum value of each column of a matrix (hint: check out the
max.col()
function).
+
library(matrixStats)
+# Data Generating Process (10 x 10,000 matrix)
+set.seed(1234)
+<- matrix(rnorm(1e4), nrow=10)
+ x
+# Find each column's max value
+<- function(x) {
+ fun2 apply(x, 2, max)
+
+ }
+<- function(x) {
+ fun2alt <- colMaxs(x)
+ max_values }
Show that both functions return the same output for a given input matrix, x
. Then check the speed of the two functions.
# Run functions
+<- fun2(x)
+ output_fun2 <- fun2alt(x)
+ output_alt2
+<- all(output_fun2 == output_alt2)
+ identical_output print(paste("Outputs are identical:", identical_output))
[1] "Outputs are identical: TRUE"
+# Functions create the same output
+
+::microbenchmark(
+ microbenchmarkfun2(x),
+ fun2alt(x),
+ times = 1000
+ )
Unit: microseconds
+ expr min lq mean median uq max neval
+ fun2(x) 514.591 527.137 563.06809 532.6925 540.9745 1901.908 1000
+ fun2alt(x) 29.520 30.258 31.63384 30.6270 31.1190 797.163 1000
+# The alternate function is much faster
Problem 3: Parallelization
+We will now turn our attention to the statistical concept of bootstrapping. Among its many uses, non-parametric bootstrapping allows us to obtain confidence intervals for parameter estimates without relying on parametric assumptions. Don’t worry if these concepts are unfamiliar, we only care about the computation methods in this lab, not the statistics.
+The main assumption is that we can approximate the results of many repeated experiments by resampling observations from our original dataset, which reflects the population.
+-
+
- This function implements a serial version of the bootstrap. Edit this function to parallelize the
lapply
loop, using whichever method you prefer. Rather than specifying the number of cores to use, use the number given by thencpus
argument, so that we can test it with different numbers of cores later.
+
library(parallel)
+<- function(dat, stat, R, ncpus = 1L) {
+ my_boot
+ # Getting the random indices
+ <- nrow(dat)
+ n <- matrix(sample.int(n, n*R, TRUE), nrow=n, ncol=R)
+ idx
+ # Set up parallel processing
+ <- makeCluster(ncpus)
+ cl on.exit(stopCluster(cl))
+
+ clusterExport(cl=cl, "my_stat")
+
+# Parallelize the lapply loop
+ <- parLapply(cl, seq_len(R), function(i) {
+ ans stat(dat[idx[,i], , drop=FALSE])
+
+ })
+ # Converting the list into a matrix
+ <- do.call(rbind, ans)
+ ans
+return(ans)
+ }
-
+
- Once you have a version of the
my_boot()
function that runs on multiple cores, check that it provides accurate results by comparing it to a parametric model:
+
# Bootstrap of an OLS
+<- function(d) coef(lm(y ~ x, data=d))
+ my_stat
+# DATA SIM
+set.seed(1)
+<- 500; R <- 1e4
+ n
+<- cbind(rnorm(n)); y <- x*5 + rnorm(n)
+ x
+# Checking if we get something similar as lm
+<- confint(lm(y~x))
+ ans0 <- my_boot(dat = data.frame(x, y), my_stat, R = R, ncpus = 2L)
+ ans1
+# You should get something like this
+t(apply(ans1, 2, quantile, c(.025,.975)))
2.5% 97.5%
+(Intercept) -0.1386903 0.04856752
+x 4.8685162 5.04351239
+## 2.5% 97.5%
+## (Intercept) -0.1372435 0.05074397
+## x 4.8680977 5.04539763
+ ans0
2.5 % 97.5 %
+(Intercept) -0.1379033 0.04797344
+x 4.8650100 5.04883353
+## 2.5 % 97.5 %
+## (Intercept) -0.1379033 0.04797344
+## x 4.8650100 5.04883353
-
+
- Check whether your version actually goes faster when it’s run on multiple cores (since this might take a little while to run, we’ll use
system.time
and just run each version once, rather thanmicrobenchmark
, which would run each version 100 times, by default):
+
system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 1L))
user system elapsed
+ 0.028 0.005 1.294
+system.time(my_boot(dat = data.frame(x, y), my_stat, R = 4000, ncpus = 2L))
user system elapsed
+ 0.032 0.008 0.758
+# My boot with 2 cpus is faster according to time elasped