diff --git a/CHANGELOG.md b/CHANGELOG.md
index 92e63fbfa..36813f070 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
 ==================
 ### Changed
 * Switch CI to Github Actions. (Adds Windows and macOS builds.)
+* Speed up `putImageData` for RGBA32 canvases.
 ### Added
 * Export `rsvgVersion`.
 ### Fixed
diff --git a/benchmarks/run.js b/benchmarks/run.js
index 4914ea97b..58ca2e066 100644
--- a/benchmarks/run.js
+++ b/benchmarks/run.js
@@ -4,7 +4,7 @@
  * milliseconds to complete.
  */
 
-var createCanvas = require('../').createCanvas
+var {createCanvas, ImageData} = require('../')
 var canvas = createCanvas(200, 200)
 var largeCanvas = createCanvas(1000, 1000)
 var ctx = canvas.getContext('2d')
@@ -64,6 +64,28 @@ function done (benchmark, times, start, isAsync) {
 
 // node-canvas
 
+const transparentImage = new ImageData(200, 200) // left untouched for the a=0 case
+
+bm('putImageData, all a=0', function () {
+  ctx.putImageData(transparentImage, 0, 0)
+})
+
+const opaqueImage = new ImageData(200, 200)
+opaqueImage.data.fill(0xFF) // every byte, alpha included, becomes 0xFF
+
+bm('putImageData, all a=0xFF', function () {
+  ctx.putImageData(opaqueImage, 0, 0)
+})
+
+const mixedImage = new ImageData(200, 200)
+mixedImage.data.forEach(function (_, i, bytes) {
+  bytes[i] = 255 * Math.random() // random value for every channel, alpha included
+})
+
+bm('putImageData, mixed a', function () {
+  ctx.putImageData(mixedImage, 0, 0)
+})
+
 bm('fillStyle= name', function () {
   ctx.fillStyle = 'transparent'
 })
diff --git a/src/CanvasRenderingContext2d.cc b/src/CanvasRenderingContext2d.cc
index 774612708..5c4f6726f 100644
--- a/src/CanvasRenderingContext2d.cc
+++ b/src/CanvasRenderingContext2d.cc
@@ -21,12 +21,29 @@
 
 using namespace v8;
 
-// Windows doesn't support the C99 names for these
 #ifdef _MSC_VER
-#define isnan(x) _isnan(x)
-#define isinf(x) (!_finite(x))
+// MSVC doesn't provide the C99 names for these. TODO: these macros are
+// unnecessary; use std::isnan / std::isinf instead.
+# define isnan(x) _isnan(x)
+# define isinf(x) (!_finite(x))
+# include <intrin.h>
+# define bswap32 _byteswap_ulong
+#else
+# ifdef __x86_64__
+#  include <x86intrin.h>
+# endif
+# define bswap32 __builtin_bswap32
 #endif
 
+static inline uint32_t rotr(uint32_t n, unsigned int c) {
+  // Rotates n right by c bits. GCC has no portable _rotr intrinsic, so rely on
+  // idiom recognition. Works for all supported versions of MSVC, GCC x86, GCC
+  // ARM, Clang. https://stackoverflow.com/a/776523/1218408
+  const unsigned int mask = CHAR_BIT * sizeof(n) - 1; // bit width of n, minus one
+  c &= mask; // keep the right-shift count below the bit width
+  return (n >> c) | (n << ((~c + 1) & mask)); // (~c + 1) is -c in two's complement
+}
+
 #ifndef isnan
 #define isnan(x) std::isnan(x)
 #define isinf(x) std::isinf(x)
@@ -852,32 +869,52 @@ NAN_METHOD(Context2d::PutImageData) {
     for (int y = 0; y < rows; ++y) {
       uint8_t *dstRow = dst;
       uint8_t *srcRow = src;
-      for (int x = 0; x < cols; ++x) {
-        // rgba
-        uint8_t r = *srcRow++;
-        uint8_t g = *srcRow++;
-        uint8_t b = *srcRow++;
-        uint8_t a = *srcRow++;
-
-        // argb
-        // performance optimization: fully transparent/opaque pixels can be
-        // processed more efficiently.
+#if defined(__x86_64__) || defined(_M_X64)
+      int x = 0;
+      for (; x + 1 < cols; x += 2) { // SSE2 path: two pixels (8 bytes) per iteration
+        __m128i px = _mm_setzero_si128(); // memcpy below fills only the low 8 bytes
+        memcpy(&px, srcRow, 8); // gcc doesn't define _mm_loadu_si64
+        px = _mm_unpacklo_epi8(px, _mm_setzero_si128());
+        // rgba -> bgra (each channel is now a 16-bit lane)
+        px = _mm_shufflelo_epi16(px, 0b11000110);
+        px = _mm_shufflehi_epi16(px, 0b11000110);
+        // broadcast alpha
+        __m128i av = _mm_shufflelo_epi16(px, 0b11111111);
+                av = _mm_shufflehi_epi16(av, 0b11111111);
+        // Set alpha channel to 255 to undo upcoming division by 255
+        av = _mm_and_si128(av, _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0));
+        av = _mm_or_si128(av, _mm_setr_epi16(0, 0, 0, 255, 0, 0, 0, 255));
+        px = _mm_mullo_epi16(px, av);
+        // divide by 255: high 16 bits of (x * 0x8081), then >> 7, i.e. (x * 0x8081) >> 23
+        px = _mm_mulhi_epu16(px, _mm_set1_epi16(0x8081));
+        px = _mm_srli_epi16(px, 7);
+        // pack int16 to int8
+        px = _mm_packus_epi16(px, px);
+        memcpy(dstRow, &px, 8);
+        dstRow += 8;
+        srcRow += 8;
+      }
+      for (; x < cols; x++, dstRow += 4) { // scalar tail: the odd trailing pixel, if any
+#else
+      for (int x = 0; x < cols; x++, dstRow += 4) { // memcpy stores below don't advance dstRow
+#endif
+        uint32_t c;
+        memcpy(&c, srcRow, 4); // rgba (LE)
+        srcRow += 4;
+        uint32_t a = c >> 24;
         if (a == 0) {
-          *dstRow++ = 0;
-          *dstRow++ = 0;
-          *dstRow++ = 0;
-          *dstRow++ = 0;
-        } else if (a == 255) {
-          *dstRow++ = b;
-          *dstRow++ = g;
-          *dstRow++ = r;
-          *dstRow++ = a;
+          uint32_t zero = 0;
+          memcpy(dstRow, &zero, 4);
+        } else if (a == 255) { // rgba (LE)
+          c = bswap32(c);      // abgr
+          c = rotr(c, 8);      // bgra
+          memcpy(dstRow, &c, 4);
         } else {
-          float alpha = (float)a / 255;
-          *dstRow++ = b * alpha;
-          *dstRow++ = g * alpha;
-          *dstRow++ = r * alpha;
-          *dstRow++ = a;
+          uint8_t r = (c & 0xFF) * a / 255; // premultiply each color channel by alpha
+          uint8_t g = (c >> 8 & 0xFF) * a / 255;
+          uint8_t b = (c >> 16 & 0xFF) * a / 255;
+          uint32_t bgra = (a << 24) | (r << 16) | (g << 8) | b;
+          memcpy(dstRow, &bgra, 4);
         }
       }
       dst += dstStride;
@@ -892,13 +929,13 @@ NAN_METHOD(Context2d::PutImageData) {
       uint8_t *dstRow = dst;
       uint8_t *srcRow = src;
       for (int x = 0; x < cols; ++x) {
-        // rgba
+        // rgb[a] (the source alpha byte is skipped)
         uint8_t r = *srcRow++;
         uint8_t g = *srcRow++;
         uint8_t b = *srcRow++;
         srcRow++;
 
-        // argb
+        // bgra
         *dstRow++ = b;
         *dstRow++ = g;
         *dstRow++ = r;