diff --git a/Makefile.am b/Makefile.am
index 6bc72fb1..4fa1c011 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -31,6 +31,11 @@ MyPaint_introspectable_headers = \
 	mypaint-rectangle.h \
 	mypaint-surface.h \
 	mypaint-tiled-surface.h \
+	fastapprox/fastpow.h \
+	fastapprox/sse.h \
+	fastapprox/fastexp.h \
+	fastapprox/cast.h \
+	fastapprox/fastlog.h \
 	$(libmypaint_glib)

 if HAVE_INTROSPECTION
diff --git a/brushmodes.c b/brushmodes.c
index 29435123..bbcad09a 100644
--- a/brushmodes.c
+++ b/brushmodes.c
@@ -18,6 +18,8 @@
 #include <stdint.h>
 #include <string.h>
+#include <math.h>
+#include "fastapprox/fastpow.h"

 #include "helpers.h"

@@ -43,6 +45,7 @@
 // resultAlpha = topAlpha + (1.0 - topAlpha) * bottomAlpha
 // resultColor = topColor + (1.0 - topAlpha) * bottomColor
 //
+
 void draw_dab_pixels_BlendMode_Normal (uint16_t * mask,
                                        uint16_t * rgba,
                                        uint16_t color_r,
                                        uint16_t color_g,
                                        uint16_t color_b,
                                        uint16_t opacity) {
@@ -66,7 +69,89 @@ void draw_dab_pixels_BlendMode_Normal (uint16_t * mask,
   }
 };

+void draw_dab_pixels_BlendMode_Normal_Paint (uint16_t * mask,
+                                             uint16_t * rgba,
+                                             uint16_t color_r,
+                                             uint16_t color_g,
+                                             uint16_t color_b,
+                                             uint16_t opacity) {
+
+  while (1) {
+    for (; mask[0]; mask++, rgba+=4) {
+      uint32_t opa_a = mask[0]*(uint32_t)opacity/(1<<15); // topAlpha
+      uint32_t opa_b = (1<<15)-opa_a; // bottomAlpha
+
+      // alpha-weighted ratio for WGM (sums to 1.0)
+      float fac_a = (float)opa_a / (opa_a + opa_b * rgba[3] / (1<<15));
+      //fac_a *= fac_a;
+      float fac_b = 1.0 - fac_a;
+
+      // convert bottom to spectral; un-premultiply alpha to obtain reflectance
+      // (color noise is not a problem, since low alpha also implies low weight)
+      float spectral_b[10] = {0};
+      if (rgba[3] > 0) {
+        rgb_to_spectral((float)rgba[0] / rgba[3], (float)rgba[1] / rgba[3], (float)rgba[2] / rgba[3], spectral_b);
+      } else {
+        rgb_to_spectral((float)rgba[0] / (1<<15), (float)rgba[1] / (1<<15), (float)rgba[2] / (1<<15), spectral_b);
+      }
+      // convert top to spectral; already straight (non-premultiplied) color
+      float spectral_a[10] = {0};
+      rgb_to_spectral((float)color_r / (1<<15), (float)color_g / (1<<15), (float)color_b / (1<<15), spectral_a);
+
+      // mix the two spectral reflectances using WGM
+      float spectral_result[10] = {0};
+      for (int i=0; i<10; i++) {
+        spectral_result[i] = fastpow(spectral_a[i], fac_a) * fastpow(spectral_b[i], fac_b);
+      }
+
+      // convert back to RGB and premultiply alpha
+      float rgb_result[3] = {0};
+      spectral_to_rgb(spectral_result, rgb_result);
+      rgba[3] = opa_a + opa_b * rgba[3] / (1<<15);
+
+      for (int i=0; i<3; i++) {
+        rgba[i] = (rgb_result[i] * rgba[3]);
+      }
+    }
+    if (!mask[1]) break;
+    rgba += mask[1];
+    mask += 2;
+  }
+};
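Why a weighted geometric mean (WGM) rather than a plain weighted average: a WGM of reflectances behaves subtractively, the way layered pigment does, so mixing a light and a dark paint lands noticeably darker than the RGB average. A minimal, self-contained sketch of the mix used by the _Paint modes above, using powf in place of fastpow for clarity (wgm_mix and the fixed channel values are illustrative, not part of the patch):

#include <math.h>
#include <stdio.h>

/* Illustrative only: weighted geometric mean of two 10-channel
 * reflectance spectra. fac_a and (1 - fac_a) must sum to 1.0.
 * pow(a,wa) * pow(b,wb) == exp(wa*log(a) + wb*log(b)). */
static void
wgm_mix (const float a[10], const float b[10], float fac_a, float out[10])
{
  float fac_b = 1.0f - fac_a;
  for (int i = 0; i < 10; i++)
    out[i] = powf (a[i], fac_a) * powf (b[i], fac_b);
}

int main (void)
{
  float a[10], b[10], out[10];
  for (int i = 0; i < 10; i++) { a[i] = 0.9f; b[i] = 0.1f; }
  wgm_mix (a, b, 0.5f, out);
  // 50/50 WGM of 0.9 and 0.1 is sqrt(0.09) = 0.3,
  // much darker than the arithmetic mean 0.5
  printf ("%f\n", out[0]);
  return 0;
}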
+// Posterize: reduce the number of colors, like GIMP's posterize
+// filter. The number of levels is adjustable (posterize_num).
+// Posterizes the canvas, then blends the result in via opacity.
+// Does not affect alpha.
+
+void draw_dab_pixels_BlendMode_Posterize (uint16_t * mask,
+                                          uint16_t * rgba,
+                                          uint16_t opacity,
+                                          uint16_t posterize_num) {
+  while (1) {
+    for (; mask[0]; mask++, rgba+=4) {
+
+      float r = (float)rgba[0] / (1<<15);
+      float g = (float)rgba[1] / (1<<15);
+      float b = (float)rgba[2] / (1<<15);
+
+      uint32_t post_r = (1<<15) * ROUND(r * posterize_num) / posterize_num;
+      uint32_t post_g = (1<<15) * ROUND(g * posterize_num) / posterize_num;
+      uint32_t post_b = (1<<15) * ROUND(b * posterize_num) / posterize_num;
+
+      uint32_t opa_a = mask[0]*(uint32_t)opacity/(1<<15); // topAlpha
+      uint32_t opa_b = (1<<15)-opa_a; // bottomAlpha
+      rgba[0] = (opa_a*post_r + opa_b*rgba[0])/(1<<15);
+      rgba[1] = (opa_a*post_g + opa_b*rgba[1])/(1<<15);
+      rgba[2] = (opa_a*post_b + opa_b*rgba[2])/(1<<15);
+
+    }
+    if (!mask[1]) break;
+    rgba += mask[1];
+    mask += 2;
+  }
+};

 // Colorize: apply the source hue and saturation, retaining the target
 // brightness. Same thing as in the PDF spec addendum, and upcoming SVG
@@ -82,9 +167,9 @@ void draw_dab_pixels_BlendMode_Normal (uint16_t * mask,
 // http://dvcs.w3.org/hg/FXTF/rawfile/tip/compositing/index.html.

-// Same as ITU Rec. BT.601 (SDTV) rounded to 2 decimal places.
-static const float LUMA_RED_COEFF = 0.3 * (1<<15);
-static const float LUMA_GREEN_COEFF = 0.59 * (1<<15);
-static const float LUMA_BLUE_COEFF = 0.11 * (1<<15);
+// Same as ITU Rec. BT.709 (HDTV).
+static const float LUMA_RED_COEFF = 0.2126 * (1<<15);
+static const float LUMA_GREEN_COEFF = 0.7152 * (1<<15);
+static const float LUMA_BLUE_COEFF = 0.0722 * (1<<15);

 // See also http://en.wikipedia.org/wiki/YCbCr

@@ -224,6 +309,59 @@ void draw_dab_pixels_BlendMode_Normal_and_Eraser (uint16_t * mask,
   }
 };

+void draw_dab_pixels_BlendMode_Normal_and_Eraser_Paint (uint16_t * mask,
+                                                        uint16_t * rgba,
+                                                        uint16_t color_r,
+                                                        uint16_t color_g,
+                                                        uint16_t color_b,
+                                                        uint16_t color_a,
+                                                        uint16_t opacity) {
+
+  while (1) {
+    for (; mask[0]; mask++, rgba+=4) {
+      uint32_t opa_a = mask[0]*(uint32_t)opacity/(1<<15); // topAlpha
+      uint32_t opa_b = (1<<15)-opa_a; // bottomAlpha
+
+      float fac_a = (float)opa_a / (opa_a + opa_b * rgba[3] / (1<<15));
+      //fac_a *= fac_a;
+      float fac_b = 1.0 - fac_a;
+      //fac_a *= (float)color_a / (1<<15);
+      float spectral_b[10] = {0};
+      if (rgba[3] > 0) {
+        rgb_to_spectral((float)rgba[0] / rgba[3], (float)rgba[1] / rgba[3], (float)rgba[2] / rgba[3], spectral_b);
+      } else {
+        rgb_to_spectral((float)rgba[0] / (1<<15), (float)rgba[1] / (1<<15), (float)rgba[2] / (1<<15), spectral_b);
+      }
+      // convert top to spectral; already straight (non-premultiplied) color
+      float spectral_a[10] = {0};
+      rgb_to_spectral((float)color_r / (1<<15), (float)color_g / (1<<15), (float)color_b / (1<<15), spectral_a);
+
+      // mix the two spectral colors using WGM
+      float spectral_result[10] = {0};
+      for (int i=0; i<10; i++) {
+        spectral_result[i] = fastpow(spectral_a[i], fac_a) * fastpow(spectral_b[i], fac_b);
+      }
+      // convert back to RGB
+      float rgb_result[3] = {0};
+      spectral_to_rgb(spectral_result, rgb_result);
+
+      // apply eraser
+      opa_a = opa_a * color_a / (1<<15);
+
+      // calculate alpha normally
+      rgba[3] = opa_a + opa_b * rgba[3] / (1<<15);
+
+      for (int i=0; i<3; i++) {
+        rgba[i] = (rgb_result[i] * rgba[3]);
+      }
+
+    }
+    if (!mask[1]) break;
+    rgba += mask[1];
+    mask += 2;
+  }
+};
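All three _Paint modes share the same 15-bit fixed-point alpha bookkeeping: the dab's alpha and the destination alpha determine both the WGM weights and the composited alpha. A small runnable example of that arithmetic, with values chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: the alpha bookkeeping used by the _Paint modes.
 * Everything lives in 15-bit fixed point (1<<15 == 1.0). */
int main (void)
{
  uint32_t opa_a = 1 << 14;     // top (dab) alpha = 0.5
  uint16_t bottom_a = 1 << 15;  // bottom (canvas) alpha = 1.0
  uint32_t opa_b = (1 << 15) - opa_a;

  // WGM weight of the top color; the bottom gets 1 - fac_a.
  // Here: 0.5 / (0.5 + 0.5 * 1.0) = 0.5
  float fac_a = (float) opa_a / (opa_a + opa_b * bottom_a / (1 << 15));

  // resultAlpha = topAlpha + (1 - topAlpha) * bottomAlpha = 1.0 here
  uint16_t result_a = opa_a + opa_b * bottom_a / (1 << 15);

  printf ("fac_a = %.3f, result alpha = %u/32768\n",
          fac_a, (unsigned) result_a);
  return 0;
}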
 // This is BlendMode_Normal with locked alpha channel.
 //
 void draw_dab_pixels_BlendMode_LockAlpha (uint16_t * mask,
@@ -251,6 +389,53 @@ void draw_dab_pixels_BlendMode_LockAlpha (uint16_t * mask,
   }
 };

+void draw_dab_pixels_BlendMode_LockAlpha_Paint (uint16_t * mask,
+                                                uint16_t * rgba,
+                                                uint16_t color_r,
+                                                uint16_t color_g,
+                                                uint16_t color_b,
+                                                uint16_t opacity) {
+
+  while (1) {
+    for (; mask[0]; mask++, rgba+=4) {
+
+      uint32_t opa_a = mask[0]*(uint32_t)opacity/(1<<15); // topAlpha
+      uint32_t opa_b = (1<<15)-opa_a; // bottomAlpha
+      // lock alpha: scale the dab opacity by the existing alpha
+      opa_a *= rgba[3];
+      opa_a /= (1<<15);
+      float fac_a = (float)opa_a / (opa_a + opa_b * rgba[3] / (1<<15));
+      //fac_a *= fac_a;
+      float fac_b = 1.0 - fac_a;
+      float spectral_b[10] = {0};
+      if (rgba[3] > 0) {
+        rgb_to_spectral((float)rgba[0] / rgba[3], (float)rgba[1] / rgba[3], (float)rgba[2] / rgba[3], spectral_b);
+      } else {
+        rgb_to_spectral((float)rgba[0] / (1<<15), (float)rgba[1] / (1<<15), (float)rgba[2] / (1<<15), spectral_b);
+      }
+      // convert top to spectral; already straight (non-premultiplied) color
+      float spectral_a[10] = {0};
+      rgb_to_spectral((float)color_r / (1<<15), (float)color_g / (1<<15), (float)color_b / (1<<15), spectral_a);
+
+      // mix the two spectral colors using WGM
+      float spectral_result[10] = {0};
+      for (int i=0; i<10; i++) {
+        spectral_result[i] = fastpow(spectral_a[i], fac_a) * fastpow(spectral_b[i], fac_b);
+      }
+      // convert back to RGB
+      float rgb_result[3] = {0};
+      spectral_to_rgb(spectral_result, rgb_result);
+      rgba[3] = opa_a + opa_b * rgba[3] / (1<<15);
+
+      for (int i=0; i<3; i++) {
+        rgba[i] = (rgb_result[i] * rgba[3]);
+      }
+    }
+    if (!mask[1]) break;
+    rgba += mask[1];
+    mask += 2;
+  }
+};

 // Sum up the color/alpha components inside the masked region.
 // Called by get_color().

@@ -299,3 +484,5 @@
   *sum_a += a;
 };
+
+
diff --git a/brushmodes.h b/brushmodes.h
index 93b61fd2..c8e47e20 100644
--- a/brushmodes.h
+++ b/brushmodes.h
@@ -7,6 +7,13 @@ void draw_dab_pixels_BlendMode_Normal (uint16_t * mask,
                                        uint16_t color_g,
                                        uint16_t color_b,
                                        uint16_t opacity);
+
+void draw_dab_pixels_BlendMode_Normal_Paint (uint16_t * mask,
+                                             uint16_t * rgba,
+                                             uint16_t color_r,
+                                             uint16_t color_g,
+                                             uint16_t color_b,
+                                             uint16_t opacity);
 void
 draw_dab_pixels_BlendMode_Color (uint16_t *mask,
                                  uint16_t *rgba, // b=bottom, premult
@@ -14,6 +21,12 @@ draw_dab_pixels_BlendMode_Color (uint16_t *mask,
                                  uint16_t color_g, // }-- a=top, !premult
                                  uint16_t color_b, // }
                                  uint16_t opacity);
+void
+draw_dab_pixels_BlendMode_Posterize (uint16_t *mask,
+                                     uint16_t *rgba, // b=bottom, premult
+                                     uint16_t opacity,
+                                     uint16_t posterize_num);
+
 void draw_dab_pixels_BlendMode_Normal_and_Eraser (uint16_t * mask,
                                                   uint16_t * rgba,
                                                   uint16_t color_r,
@@ -21,12 +34,29 @@ void draw_dab_pixels_BlendMode_Normal_and_Eraser (uint16_t * mask,
                                                   uint16_t color_b,
                                                   uint16_t color_a,
                                                   uint16_t opacity);
+
+void draw_dab_pixels_BlendMode_Normal_and_Eraser_Paint (uint16_t * mask,
+                                                        uint16_t * rgba,
+                                                        uint16_t color_r,
+                                                        uint16_t color_g,
+                                                        uint16_t color_b,
+                                                        uint16_t color_a,
+                                                        uint16_t opacity);
+
 void draw_dab_pixels_BlendMode_LockAlpha (uint16_t * mask,
                                           uint16_t * rgba,
                                           uint16_t color_r,
                                           uint16_t color_g,
                                           uint16_t color_b,
                                           uint16_t opacity);
+
+void draw_dab_pixels_BlendMode_LockAlpha_Paint (uint16_t * mask,
+                                                uint16_t * rgba,
+                                                uint16_t color_r,
+                                                uint16_t color_g,
+                                                uint16_t color_b,
+                                                uint16_t opacity);
+
 void get_color_pixels_accumulate (uint16_t * mask,
                                   uint16_t * rgba,
                                   float * sum_weight,
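For exercising these entry points it helps to know the dab-mask layout the loops assume: runs of nonzero 15-bit opacity values terminated by 0, each followed by a skip offset (counted in uint16_t elements, 4 per RGBA pixel), with a final offset of 0 ending the dab. A hedged sketch; the layout is inferred from the loops above, and the buffer sizes are arbitrary:

#include <stdint.h>
#include "brushmodes.h"

int main (void)
{
  uint16_t rgba[8 * 4] = {0};   // 8 premultiplied RGBA pixels

  uint16_t mask[] = {
    1 << 14, 1 << 15, 0,        // run: two pixels at 50% and 100%
    2 * 4,                      // skip two pixels (offsets count uint16_t)
    1 << 13, 0,                 // run: one pixel at 25%
    0                           // offset 0: end of dab
  };

  // paint a fully opaque red dab; pixels 0, 1 and 4 are touched
  draw_dab_pixels_BlendMode_Normal (mask, rgba,
                                    1 << 15, 0, 0,   // straight RGB color
                                    1 << 15);        // opacity
  return 0;
}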
b/brushsettings.json
@@ -227,7 +227,7 @@
     "tooltip": "This setting decreases the hardness when necessary to prevent a pixel staircase effect (aliasing) by making the dab more blurred.\n 0.0 disable (for very strong erasers and pixel brushes)\n 1.0 blur one pixel (good value)\n 5.0 notable blur, thin strokes will disappear"
   },
   {
-    "constant": true,
+    "constant": false,
     "default": 0.0,
     "displayed_name": "Dabs per basic radius",
     "internal_name": "dabs_per_basic_radius",
@@ -236,7 +236,7 @@
     "tooltip": "How many dabs to draw while the pointer moves a distance of one brush radius (more precise: the base value of the radius)"
   },
   {
-    "constant": true,
+    "constant": false,
     "default": 2.0,
     "displayed_name": "Dabs per actual radius",
     "internal_name": "dabs_per_actual_radius",
@@ -245,7 +245,7 @@
     "tooltip": "Same as above, but the radius actually drawn is used, which can change dynamically"
   },
   {
-    "constant": true,
+    "constant": false,
     "default": 0.0,
     "displayed_name": "Dabs per second",
     "internal_name": "dabs_per_second",
@@ -559,15 +559,52 @@
     "minimum": 0.0,
     "tooltip": "Paint with the smudge color instead of the brush color. The smudge color is slowly changed to the color you are painting on.\n 0.0 do not use the smudge color\n 0.5 mix the smudge color with the brush color\n 1.0 use only the smudge color"
   },
+  {
-    "constant": false,
-    "default": 0.5,
-    "displayed_name": "Smudge length",
-    "internal_name": "smudge_length",
-    "maximum": 1.0,
+    "constant": false,
+    "default": 1.0,
+    "displayed_name": "Pigment",
+    "internal_name": "paint_mode",
+    "maximum": 1.0,
+    "minimum": 0.0,
+    "tooltip": "Spectral pigment mixing mode.\n0.0 is normal RGB blending\n1.0 is 10-channel spectral pigment mode"
+  },
+  {
+    "constant": false,
+    "default": 0.0,
+    "displayed_name": "Smudge Transparency",
+    "internal_name": "smudge_transparency",
+    "maximum": 1.0,
+    "minimum": -1.0,
+    "tooltip": "Control how much transparency is picked up and smudged, similar to lock alpha.\n1.0 will not move any transparency\n0.5 will move only 50% transparency and above\n0.0 will have no effect\nNegative values do the reverse"
+  },
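The dispatch that routes a dab through the RGB or spectral path based on paint_mode is not part of this diff. The sketch below is only one plausible illustration of how a fractional Pigment value could gate the two functions declared in brushmodes.h; draw_dab_dispatch and the probabilistic gating via rand() are assumptions, not the patch's code:

#include <stdint.h>
#include <stdlib.h>
#include "brushmodes.h"

/* Hypothetical dispatch (not from this diff): treat a 0..1 paint_mode
 * value as the per-dab probability of taking the spectral branch, so
 * fractional settings blend the two behaviors over many dabs. */
static void
draw_dab_dispatch (float paint_mode, uint16_t *mask, uint16_t *rgba,
                   uint16_t r, uint16_t g, uint16_t b, uint16_t opacity)
{
  if ((float) rand () / RAND_MAX < paint_mode)
    draw_dab_pixels_BlendMode_Normal_Paint (mask, rgba, r, g, b, opacity);
  else
    draw_dab_pixels_BlendMode_Normal (mask, rgba, r, g, b, opacity);
}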
+  {
+    "constant": false,
+    "default": 0.5,
+    "displayed_name": "Smudge length",
+    "internal_name": "smudge_length",
+    "maximum": 1.0,
     "minimum": 0.0,
     "tooltip": "This controls how fast the smudge color becomes the color you are painting on.\n0.0 immediately update the smudge color (requires more CPU cycles because of the frequent color checks)\n0.5 change the smudge color steadily towards the canvas color\n1.0 never change the smudge color"
   },
+  {
+    "constant": false,
+    "default": 0.0,
+    "displayed_name": "Smudge Length Multiplier",
+    "internal_name": "smudge_length_log",
+    "maximum": 20.0,
+    "minimum": 0.0,
+    "tooltip": "Logarithmically lengthens the smudge length.\nUseful to compensate for high-definition/large brushes with lots of dabs.\nThe longer the smudge length, the further the paint will spread; it also boosts performance dramatically, since the canvas is sampled less often"
+  },
+  {
+    "constant": false,
+    "default": 0.0,
+    "displayed_name": "Smudge Bucket",
+    "internal_name": "smudge_bucket",
+    "maximum": 255.0,
+    "minimum": 0.0,
+    "tooltip": "There are 256 buckets, each holding a bit of color picked up from the canvas.\nYou can control which bucket to use, to improve the variability and realism of the brush.\nEspecially useful with Custom Input, to correlate buckets with other settings such as offset"
+  },
   {
     "constant": false,
     "default": 0.0,
@@ -600,7 +637,7 @@
     "default": 4.0,
     "displayed_name": "Stroke duration",
     "internal_name": "stroke_duration_logarithmic",
-    "maximum": 7.0,
+    "maximum": 14.0,
     "minimum": -1.0,
     "tooltip": "How far you have to move until the stroke input reaches 1.0. This value is logarithmic (negative values will not invert the process)."
   },
@@ -676,6 +713,24 @@
     "minimum": 0.0,
     "tooltip": "Colorize the target layer, setting its hue and saturation from the active brush color while retaining its value and alpha."
   },
+  {
+    "constant": false,
+    "default": 0.0,
+    "displayed_name": "Posterize",
+    "internal_name": "posterize",
+    "maximum": 1.0,
+    "minimum": 0.0,
+    "tooltip": "Strength of posterization, reducing the layer's colors to the number set by Posterize Levels below, while retaining alpha."
+  },
+  {
+    "constant": false,
+    "default": 0.05,
+    "displayed_name": "Posterize Levels",
+    "internal_name": "posterize_num",
+    "maximum": 1.28,
+    "minimum": 0.01,
+    "tooltip": "Number of posterization levels (x100). The default 0.05 gives 5 levels; values above 0.5 (50 levels) may not be noticeable."
+  },
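Worked example of the Posterize Levels encoding: the 0.01..1.28 setting is 1/100 of the level count, so the default 0.05 means 5 levels per channel, and each channel is quantized with the same ROUND(r * N) / N step used in draw_dab_pixels_BlendMode_Posterize above:

#include <math.h>
#include <stdio.h>

/* Illustrative only: posterize_num = 0.05 -> x100 -> 5 levels. */
int main (void)
{
  const int levels = (int) roundf (0.05f * 100);  // 5
  for (int i = 0; i <= 4; i++) {
    float r = i / 4.0f;                           // 0.00, 0.25, ... 1.00
    printf ("%.2f -> %.2f\n", r, roundf (r * levels) / levels);
  }
  return 0;
}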
   {
     "constant": false,
     "default": 0.0,
@@ -736,6 +791,9 @@
     "gridmap_x",
     "gridmap_y",
     "declinationx",
-    "declinationy"
+    "declinationy",
+    "dabs_per_basic_radius",
+    "dabs_per_actual_radius",
+    "dabs_per_second"
   ]
 }
diff --git a/fastapprox/Makefile.am.local b/fastapprox/Makefile.am.local
new file mode 100644
index 00000000..30a1bc1d
--- /dev/null
+++ b/fastapprox/Makefile.am.local
@@ -0,0 +1,21 @@
+# put whatever (auto)make commands here, they will be included from Makefile.am
+#
+
+fastonebigheader.h: $(filter-out config.h fastonebigheader.h, $(wildcard *.h))
+	cat \
+	  cast.h \
+	  sse.h \
+	  fastexp.h \
+	  fastlog.h \
+	  fasterf.h \
+	  fastgamma.h \
+	  fasthyperbolic.h \
+	  fastlambertw.h \
+	  fastpow.h \
+	  fastsigmoid.h \
+	  fasttrig.h \
+	| grep -v '#include "' > "$@"
+
+myinclude_HEADERS += \
+	fastonebigheader.h \
+	$(filter-out config.h fastonebigheader.h, $(wildcard *.h))
diff --git a/fastapprox/cast.h b/fastapprox/cast.h
new file mode 100644
index 00000000..60484000
--- /dev/null
+++ b/fastapprox/cast.h
@@ -0,0 +1,49 @@
+/*=====================================================================*
+ * Copyright (C) 2012 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __CAST_H_
+
+#ifdef __cplusplus
+#define cast_uint32_t static_cast<uint32_t>
+#else
+#define cast_uint32_t (uint32_t)
+#endif
+
+#endif // __CAST_H_
diff --git a/fastapprox/fasterf.h b/fastapprox/fasterf.h
new file mode 100644
index 00000000..2bd8e87b
--- /dev/null
+++ b/fastapprox/fasterf.h
@@ -0,0 +1,182 @@
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro
+ * All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_ERF_H_ +#define __FAST_ERF_H_ + +#include +#include +#include "sse.h" +#include "fastexp.h" +#include "fastlog.h" + +// fasterfc: not actually faster than erfcf(3) on newer machines! +// ... although vectorized version is interesting +// and fastererfc is very fast + +static inline float +fasterfc (float x) +{ + static const float k = 3.3509633149424609f; + static const float a = 0.07219054755431126f; + static const float b = 15.418191568719577f; + static const float c = 5.609846028328545f; + + union { float f; uint32_t i; } vc = { c * x }; + float xsq = x * x; + float xquad = xsq * xsq; + + vc.i |= 0x80000000; + + return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f); +} + +static inline float +fastererfc (float x) +{ + static const float k = 3.3509633149424609f; + + return 2.0f / (1.0f + fasterpow2 (k * x)); +} + +// fasterf: not actually faster than erff(3) on newer machines! +// ... 
although vectorized version is interesting +// and fastererf is very fast + +static inline float +fasterf (float x) +{ + return 1.0f - fasterfc (x); +} + +static inline float +fastererf (float x) +{ + return 1.0f - fastererfc (x); +} + +static inline float +fastinverseerf (float x) +{ + static const float invk = 0.30004578719350504f; + static const float a = 0.020287853348211326f; + static const float b = 0.07236892874789555f; + static const float c = 0.9913030456864257f; + static const float d = 0.8059775923760193f; + + float xsq = x * x; + + return invk * fastlog2 ((1.0f + x) / (1.0f - x)) + + x * (a - b * xsq) / (c - d * xsq); +} + +static inline float +fasterinverseerf (float x) +{ + static const float invk = 0.30004578719350504f; + + return invk * fasterlog2 ((1.0f + x) / (1.0f - x)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfasterfc (v4sf x) +{ + const v4sf k = v4sfl (3.3509633149424609f); + const v4sf a = v4sfl (0.07219054755431126f); + const v4sf b = v4sfl (15.418191568719577f); + const v4sf c = v4sfl (5.609846028328545f); + + union { v4sf f; v4si i; } vc; vc.f = c * x; + vc.i |= v4sil (0x80000000); + + v4sf xsq = x * x; + v4sf xquad = xsq * xsq; + + return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f); +} + +static inline v4sf +vfastererfc (const v4sf x) +{ + const v4sf k = v4sfl (3.3509633149424609f); + + return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x)); +} + +static inline v4sf +vfasterf (v4sf x) +{ + return v4sfl (1.0f) - vfasterfc (x); +} + +static inline v4sf +vfastererf (const v4sf x) +{ + return v4sfl (1.0f) - vfastererfc (x); +} + +static inline v4sf +vfastinverseerf (v4sf x) +{ + const v4sf invk = v4sfl (0.30004578719350504f); + const v4sf a = v4sfl (0.020287853348211326f); + const v4sf b = v4sfl (0.07236892874789555f); + const v4sf c = v4sfl (0.9913030456864257f); + const v4sf d = v4sfl (0.8059775923760193f); + + v4sf xsq = x * x; + + return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) + + x * (a - b * xsq) / (c - d * xsq); +} + +static inline v4sf +vfasterinverseerf (v4sf x) +{ + const v4sf invk = v4sfl (0.30004578719350504f); + + return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); +} + +#endif //__SSE2__ + +#endif // __FAST_ERF_H_ diff --git a/fastapprox/fastexp.h b/fastapprox/fastexp.h new file mode 100644 index 00000000..dc9219c1 --- /dev/null +++ b/fastapprox/fastexp.h @@ -0,0 +1,137 @@ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_EXP_H_ +#define __FAST_EXP_H_ + +#include +#include "cast.h" +#include "sse.h" + +// Underflow of exponential is common practice in numerical routines, +// so handle it here. + +static inline float +fastpow2 (float p) +{ + float offset = (p < 0) ? 1.0f : 0.0f; + float clipp = (p < -126) ? -126.0f : p; + int w = clipp; + float z = clipp - w + offset; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; + + return v.f; +} + +static inline float +fastexp (float p) +{ + return fastpow2 (1.442695040f * p); +} + +static inline float +fasterpow2 (float p) +{ + float clipp = (p < -126) ? -126.0f : p; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; + return v.f; +} + +static inline float +fasterexp (float p) +{ + return fasterpow2 (1.442695040f * p); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastpow2 (const v4sf p) +{ + v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); + v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + v4si w = v4sf_to_v4si (clipp); + v4sf z = clipp - v4si_to_v4sf (w) + offset; + + const v4sf c_121_2740838 = v4sfl (121.2740575f); + const v4sf c_27_7280233 = v4sfl (27.7280233f); + const v4sf c_4_84252568 = v4sfl (4.84252568f); + const v4sf c_1_49012907 = v4sfl (1.49012907f); + union { v4si i; v4sf f; } v = { + v4sf_to_v4si ( + v4sfl (1 << 23) * + (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z) + ) + }; + + return v.f; +} + +static inline v4sf +vfastexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfastpow2 (c_invlog_2 * p); +} + +static inline v4sf +vfasterpow2 (const v4sf p) +{ + const v4sf c_126_94269504 = v4sfl (126.94269504f); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; + return v.f; +} + +static inline v4sf +vfasterexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfasterpow2 (c_invlog_2 * p); +} + +#endif //__SSE2__ + +#endif // __FAST_EXP_H_ diff --git a/fastapprox/fastgamma.h b/fastapprox/fastgamma.h new file mode 100644 index 00000000..012bcd38 --- /dev/null +++ b/fastapprox/fastgamma.h @@ -0,0 +1,149 @@ 
+/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_GAMMA_H_ +#define __FAST_GAMMA_H_ + +#include +#include "sse.h" +#include "fastlog.h" + +/* gamma/digamma functions only work for positive inputs */ + +static inline float +fastlgamma (float x) +{ + float logterm = fastlog (x * (1.0f + x) * (2.0f + x)); + float xp3 = 3.0f + x; + + return - 2.081061466f + - x + + 0.0833333f / xp3 + - logterm + + (2.5f + x) * fastlog (xp3); +} + +static inline float +fasterlgamma (float x) +{ + return - 0.0810614667f + - x + - fasterlog (x) + + (0.5f + x) * fasterlog (1.0f + x); +} + +static inline float +fastdigamma (float x) +{ + float twopx = 2.0f + x; + float logterm = fastlog (twopx); + + return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) / + (12.0f * x * (1.0f + x) * twopx * twopx) + + logterm; +} + +static inline float +fasterdigamma (float x) +{ + float onepx = 1.0f + x; + + return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlgamma (v4sf x) +{ + const v4sf c_1_0 = v4sfl (1.0f); + const v4sf c_2_0 = v4sfl (2.0f); + const v4sf c_3_0 = v4sfl (3.0f); + const v4sf c_2_081061466 = v4sfl (2.081061466f); + const v4sf c_0_0833333 = v4sfl (0.0833333f); + const v4sf c_2_5 = v4sfl (2.5f); + + v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x)); + v4sf xp3 = c_3_0 + x; + + return - c_2_081061466 + - x + + c_0_0833333 / xp3 + - logterm + + (c_2_5 + x) * vfastlog (xp3); +} + +static inline v4sf +vfasterlgamma (v4sf x) +{ + const v4sf c_0_0810614667 = v4sfl (0.0810614667f); + const v4sf c_0_5 = v4sfl (0.5f); + const v4sf c_1 = v4sfl (1.0f); + + return - c_0_0810614667 + - x + - vfasterlog (x) + + (c_0_5 + x) * vfasterlog (c_1 + x); +} + +static inline 
v4sf +vfastdigamma (v4sf x) +{ + v4sf twopx = v4sfl (2.0f) + x; + v4sf logterm = vfastlog (twopx); + + return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) / + (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) + + logterm; +} + +static inline v4sf +vfasterdigamma (v4sf x) +{ + const v4sf c_1_0 = v4sfl (1.0f); + const v4sf c_2_0 = v4sfl (2.0f); + v4sf onepx = c_1_0 + x; + + return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx); +} + +#endif //__SSE2__ + +#endif // __FAST_GAMMA_H_ diff --git a/fastapprox/fasthyperbolic.h b/fastapprox/fasthyperbolic.h new file mode 100644 index 00000000..b5783d16 --- /dev/null +++ b/fastapprox/fasthyperbolic.h @@ -0,0 +1,138 @@ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
* + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_HYPERBOLIC_H_ +#define __FAST_HYPERBOLIC_H_ + +#include +#include "sse.h" +#include "fastexp.h" + +static inline float +fastsinh (float p) +{ + return 0.5f * (fastexp (p) - fastexp (-p)); +} + +static inline float +fastersinh (float p) +{ + return 0.5f * (fasterexp (p) - fasterexp (-p)); +} + +static inline float +fastcosh (float p) +{ + return 0.5f * (fastexp (p) + fastexp (-p)); +} + +static inline float +fastercosh (float p) +{ + return 0.5f * (fasterexp (p) + fasterexp (-p)); +} + +static inline float +fasttanh (float p) +{ + return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p)); +} + +static inline float +fastertanh (float p) +{ + return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsinh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfastexp (p) - vfastexp (-p)); +} + +static inline v4sf +vfastersinh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfasterexp (p) - vfasterexp (-p)); +} + +static inline v4sf +vfastcosh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfastexp (p) + vfastexp (-p)); +} + +static inline v4sf +vfastercosh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfasterexp (p) + vfasterexp (-p)); +} + +static inline v4sf +vfasttanh (const v4sf p) +{ + const v4sf c_1 = v4sfl (1.0f); + const v4sf c_2 = v4sfl (2.0f); + + return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p)); +} + +static inline v4sf +vfastertanh (const v4sf p) +{ + const v4sf c_1 = v4sfl (1.0f); + const v4sf c_2 = v4sfl (2.0f); + + return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p)); +} + +#endif //__SSE2__ + +#endif // __FAST_HYPERBOLIC_H_ diff --git a/fastapprox/fastlambertw.h b/fastapprox/fastlambertw.h new file mode 100644 index 00000000..f98f086a --- /dev/null +++ b/fastapprox/fastlambertw.h @@ -0,0 +1,216 @@ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_LAMBERT_W_H_ +#define __FAST_LAMBERT_W_H_ + +#include +#include "fastexp.h" +#include "fastlog.h" +#include "sse.h" + +// these functions compute the upper branch aka W_0 + +static inline float +fastlambertw (float x) +{ + static const float threshold = 2.26445f; + + float c = (x < threshold) ? 1.546865557f : 1.0f; + float d = (x < threshold) ? 2.250366841f : 0.0f; + float a = (x < threshold) ? -0.737769969f : 0.0f; + + float logterm = fastlog (c * x + d); + float loglogterm = fastlog (logterm); + + float minusw = -a - logterm + loglogterm - loglogterm / logterm; + float expminusw = fastexp (minusw); + float xexpminusw = x * expminusw; + float pexpminusw = xexpminusw - minusw; + + return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) / + (2.0f + pexpminusw * (2.0f - minusw)); +} + +static inline float +fasterlambertw (float x) +{ + static const float threshold = 2.26445f; + + float c = (x < threshold) ? 1.546865557f : 1.0f; + float d = (x < threshold) ? 2.250366841f : 0.0f; + float a = (x < threshold) ? -0.737769969f : 0.0f; + + float logterm = fasterlog (c * x + d); + float loglogterm = fasterlog (logterm); + + float w = a + logterm - loglogterm + loglogterm / logterm; + float expw = fasterexp (-w); + + return (w * w + expw * x) / (1.0f + w); +} + +static inline float +fastlambertwexpx (float x) +{ + static const float k = 1.1765631309f; + static const float a = 0.94537622168f; + + float logarg = fmaxf (x, k); + float powarg = (x < k) ? a * (x - k) : 0; + + float logterm = fastlog (logarg); + float powterm = fasterpow2 (powarg); // don't need accuracy here + + float w = powterm * (logarg - logterm + logterm / logarg); + float logw = fastlog (w); + float p = x - logw; + + return w * (2.0f + p + w * (3.0f + 2.0f * p)) / + (2.0f - p + w * (5.0f + 2.0f * w)); +} + +static inline float +fasterlambertwexpx (float x) +{ + static const float k = 1.1765631309f; + static const float a = 0.94537622168f; + + float logarg = fmaxf (x, k); + float powarg = (x < k) ? 
a * (x - k) : 0; + + float logterm = fasterlog (logarg); + float powterm = fasterpow2 (powarg); + + float w = powterm * (logarg - logterm + logterm / logarg); + float logw = fasterlog (w); + + return w * (1.0f + x - logw) / (1.0f + w); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfastlog (c * x + d); + v4sf loglogterm = vfastlog (logterm); + + v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; + v4sf expminusw = vfastexp (minusw); + v4sf xexpminusw = x * expminusw; + v4sf pexpminusw = xexpminusw - minusw; + + return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / + (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); +} + +static inline v4sf +vfasterlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfasterlog (c * x + d); + v4sf loglogterm = vfasterlog (logterm); + + v4sf w = a + logterm - loglogterm + loglogterm / logterm; + v4sf expw = vfasterexp (-w); + + return (w * w + expw * x) / (v4sfl (1.0f) + w); +} + +static inline v4sf +vfastlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + const v4sf two = v4sfl (2.0f); + const v4sf three = v4sfl (3.0f); + const v4sf five = v4sfl (5.0f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfastlog (logarg); + v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfastlog (w); + v4sf p = x - logw; + + return w * (two + p + w * (three + two * p)) / + (two - p + w * (five + two * w)); +} + +static inline v4sf +vfasterlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfasterlog (logarg); + v4sf powterm = vfasterpow2 (powarg); + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfasterlog (w); + + return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); +} + +#endif // __SSE2__ + +#endif // __FAST_LAMBERT_W_H_ diff --git a/fastapprox/fastlog.h b/fastapprox/fastlog.h new file mode 100644 index 00000000..5c900da0 --- /dev/null +++ b/fastapprox/fastlog.h @@ -0,0 +1,144 @@ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. 
* + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_LOG_H_ +#define __FAST_LOG_H_ + +#include +#include "sse.h" + +static inline float +fastlog2 (float x) +{ + union { float f; uint32_t i; } vx = { x }; + union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 }; + float y = vx.i; + y *= 1.1920928955078125e-7f; + + return y - 124.22551499f + - 1.498030302f * mx.f + - 1.72587999f / (0.3520887068f + mx.f); +} + +static inline float +fastlog (float x) +{ + return 0.69314718f * fastlog2 (x); +} + +static inline float +fasterlog2 (float x) +{ + union { float f; uint32_t i; } vx = { x }; + float y = vx.i; + y *= 1.1920928955078125e-7f; + return y - 126.94269504f; +} + +static inline float +fasterlog (float x) +{ +// return 0.69314718f * fasterlog2 (x); + + union { float f; uint32_t i; } vx = { x }; + float y = vx.i; + y *= 8.2629582881927490e-8f; + return y - 87.989971088f; +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlog2 (v4sf x) +{ + union { v4sf f; v4si i; } vx = { x }; + union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000); + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (1.1920928955078125e-7f); + + const v4sf c_124_22551499 = v4sfl (124.22551499f); + const v4sf c_1_498030302 = v4sfl (1.498030302f); + const v4sf c_1_725877999 = v4sfl (1.72587999f); + const v4sf c_0_3520087068 = v4sfl (0.3520887068f); + + return y - c_124_22551499 + - c_1_498030302 * mx.f + - c_1_725877999 / (c_0_3520087068 + mx.f); +} + +static inline v4sf +vfastlog (v4sf x) +{ + const v4sf c_0_69314718 = v4sfl (0.69314718f); + + return c_0_69314718 * vfastlog2 (x); +} + +static inline v4sf +vfasterlog2 (v4sf x) +{ + union { v4sf f; v4si i; } vx = { x }; + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (1.1920928955078125e-7f); + + const v4sf c_126_94269504 = v4sfl (126.94269504f); + + return y - c_126_94269504; +} + +static inline v4sf +vfasterlog (v4sf x) +{ +// const v4sf c_0_69314718 = v4sfl (0.69314718f); +// +// return c_0_69314718 * vfasterlog2 (x); + + union { v4sf f; v4si i; } vx = { x }; + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (8.2629582881927490e-8f); + + const v4sf c_87_989971088 = v4sfl (87.989971088f); + + return y 
- c_87_989971088; +} + +#endif // __SSE2__ + +#endif // __FAST_LOG_H_ diff --git a/fastapprox/fastonebigheader.h b/fastapprox/fastonebigheader.h new file mode 100644 index 00000000..0daa0656 --- /dev/null +++ b/fastapprox/fastonebigheader.h @@ -0,0 +1,1652 @@ +/*=====================================================================* + * Copyright (C) 2012 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __CAST_H_ + +#ifdef __cplusplus +#define cast_uint32_t static_cast +#else +#define cast_uint32_t (uint32_t) +#endif + +#endif // __CAST_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __SSE_H_ +#define __SSE_H_ + +#ifdef __SSE2__ + +#include + +#ifdef __cplusplus +namespace { +#endif // __cplusplus + +typedef __m128 v4sf; +typedef __m128i v4si; + +#define v4si_to_v4sf _mm_cvtepi32_ps +#define v4sf_to_v4si _mm_cvttps_epi32 + +#if _MSC_VER && !__INTEL_COMPILER + template + __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; } + + #define AS_4CHARS(a) \ + GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \ + GetChar(int32_t(a), 2), GetChar(int32_t(a), 3) + + #define _MM_SETR_EPI32(a0, a1, a2, a3) \ + { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) } + + #define v4sfl(x) (const v4sf { (x), (x), (x), (x) }) + #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x)) + + __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); } + __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); } + __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); } + __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); } + + __forceinline const v4sf operator+(const v4sf& a) { return a; } + __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); } + __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); } + __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); } + + __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); } + __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); } + __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); } + + __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; } + __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; } + __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; } + __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; } + + __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; } + __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; } + __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; } +#else + #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) }) + #define v2dil(x) ((const v4si) { (x), (x) }) + #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x)) +#endif + +typedef union { v4sf f; float array[4]; } v4sfindexer; +#define v4sf_index(_findx, _findi) \ + ({ \ + v4sfindexer _findvx = { _findx } ; \ + _findvx.array[_findi]; \ + }) +typedef union { v4si 
i; int array[4]; } v4siindexer; +#define v4si_index(_iindx, _iindi) \ + ({ \ + v4siindexer _iindvx = { _iindx } ; \ + _iindvx.array[_iindi]; \ + }) + +typedef union { v4sf f; v4si i; } v4sfv4sipun; +#if _MSC_VER && !__INTEL_COMPILER + #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) +#else + #define v4sf_fabs(x) \ + ({ \ + v4sfv4sipun vx; \ + vx.f = x; \ + vx.i &= v4sil (0x7FFFFFFF); \ + vx.f; \ + }) +#endif + +#ifdef __cplusplus +} // end namespace +#endif // __cplusplus + +#endif // __SSE2__ + +#endif // __SSE_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_EXP_H_ +#define __FAST_EXP_H_ + +#include + +// Underflow of exponential is common practice in numerical routines, +// so handle it here. + +static inline float +fastpow2 (float p) +{ + float offset = (p < 0) ? 1.0f : 0.0f; + float clipp = (p < -126) ? -126.0f : p; + int w = clipp; + float z = clipp - w + offset; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; + + return v.f; +} + +static inline float +fastexp (float p) +{ + return fastpow2 (1.442695040f * p); +} + +static inline float +fasterpow2 (float p) +{ + float clipp = (p < -126) ? 
-126.0f : p; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; + return v.f; +} + +static inline float +fasterexp (float p) +{ + return fasterpow2 (1.442695040f * p); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastpow2 (const v4sf p) +{ + v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); + v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + v4si w = v4sf_to_v4si (clipp); + v4sf z = clipp - v4si_to_v4sf (w) + offset; + + const v4sf c_121_2740838 = v4sfl (121.2740575f); + const v4sf c_27_7280233 = v4sfl (27.7280233f); + const v4sf c_4_84252568 = v4sfl (4.84252568f); + const v4sf c_1_49012907 = v4sfl (1.49012907f); + union { v4si i; v4sf f; } v = { + v4sf_to_v4si ( + v4sfl (1 << 23) * + (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z) + ) + }; + + return v.f; +} + +static inline v4sf +vfastexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfastpow2 (c_invlog_2 * p); +} + +static inline v4sf +vfasterpow2 (const v4sf p) +{ + const v4sf c_126_94269504 = v4sfl (126.94269504f); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; + return v.f; +} + +static inline v4sf +vfasterexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfasterpow2 (c_invlog_2 * p); +} + +#endif //__SSE2__ + +#endif // __FAST_EXP_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_LOG_H_
+#define __FAST_LOG_H_
+
+#include <stdint.h>
+
+static inline float
+fastlog2 (float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
+  float y = vx.i;
+  y *= 1.1920928955078125e-7f;
+
+  return y - 124.22551499f
+           - 1.498030302f * mx.f
+           - 1.72587999f / (0.3520887068f + mx.f);
+}
+
+static inline float
+fastlog (float x)
+{
+  return 0.69314718f * fastlog2 (x);
+}
+
+static inline float
+fasterlog2 (float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 1.1920928955078125e-7f;
+  return y - 126.94269504f;
+}
+
+static inline float
+fasterlog (float x)
+{
+//  return 0.69314718f * fasterlog2 (x);
+
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 8.2629582881927490e-8f;
+  return y - 87.989971088f;
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastlog2 (v4sf x)
+{
+  union { v4sf f; v4si i; } vx = { x };
+  union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (1.1920928955078125e-7f);
+
+  const v4sf c_124_22551499 = v4sfl (124.22551499f);
+  const v4sf c_1_498030302 = v4sfl (1.498030302f);
+  const v4sf c_1_725877999 = v4sfl (1.72587999f);
+  const v4sf c_0_3520087068 = v4sfl (0.3520887068f);
+
+  return y - c_124_22551499
+           - c_1_498030302 * mx.f
+           - c_1_725877999 / (c_0_3520087068 + mx.f);
+}
+
+static inline v4sf
+vfastlog (v4sf x)
+{
+  const v4sf c_0_69314718 = v4sfl (0.69314718f);
+
+  return c_0_69314718 * vfastlog2 (x);
+}
+
+static inline v4sf
+vfasterlog2 (v4sf x)
+{
+  union { v4sf f; v4si i; } vx = { x };
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (1.1920928955078125e-7f);
+
+  const v4sf c_126_94269504 = v4sfl (126.94269504f);
+
+  return y - c_126_94269504;
+}
+
+static inline v4sf
+vfasterlog (v4sf x)
+{
+//  const v4sf c_0_69314718 = v4sfl (0.69314718f);
+//
+//  return c_0_69314718 * vfasterlog2 (x);
+
+  union { v4sf f; v4si i; } vx = { x };
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (8.2629582881927490e-8f);
+
+  const v4sf c_87_989971088 = v4sfl (87.989971088f);
+
+  return y - c_87_989971088;
+}
+
+#endif // __SSE2__
+
+#endif // __FAST_LOG_H_
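+
+// (Editor's illustration; not upstream fastapprox code.)  fastlog2 is the
+// inverse trick: a positive float's bits, read as an integer and scaled by
+// 2^-23, give approximately log2(x) + 127, and the rational term in mx.f
+// cancels the leftover mantissa error.  For example:
+//
+//   fastlog2 (8.0f);    // ~3.0f
+//   fasterlog (10.0f);  // ~2.29f against ln(10) = 2.3026
+//
+// Inputs must be positive and finite; for x <= 0 the bit pattern no longer
+// encodes a logarithm and the result is meaningless.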
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_ERF_H_
+#define __FAST_ERF_H_
+
+#include <stdint.h>
+#include <math.h>
+
+// fasterfc: not actually faster than erfcf(3) on newer machines!
+// ... although vectorized version is interesting
+// and fastererfc is very fast
+
+static inline float
+fasterfc (float x)
+{
+  static const float k = 3.3509633149424609f;
+  static const float a = 0.07219054755431126f;
+  static const float b = 15.418191568719577f;
+  static const float c = 5.609846028328545f;
+
+  union { float f; uint32_t i; } vc = { c * x };
+  float xsq = x * x;
+  float xquad = xsq * xsq;
+
+  vc.i |= 0x80000000;
+
+  return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
+}
+
+static inline float
+fastererfc (float x)
+{
+  static const float k = 3.3509633149424609f;
+
+  return 2.0f / (1.0f + fasterpow2 (k * x));
+}
+
+// fasterf: not actually faster than erff(3) on newer machines!
+// ... although vectorized version is interesting
+// and fastererf is very fast
+
+static inline float
+fasterf (float x)
+{
+  return 1.0f - fasterfc (x);
+}
+
+static inline float
+fastererf (float x)
+{
+  return 1.0f - fastererfc (x);
+}
+
+static inline float
+fastinverseerf (float x)
+{
+  static const float invk = 0.30004578719350504f;
+  static const float a = 0.020287853348211326f;
+  static const float b = 0.07236892874789555f;
+  static const float c = 0.9913030456864257f;
+  static const float d = 0.8059775923760193f;
+
+  float xsq = x * x;
+
+  return invk * fastlog2 ((1.0f + x) / (1.0f - x)) +
+         x * (a - b * xsq) / (c - d * xsq);
+}
+
+static inline float
+fasterinverseerf (float x)
+{
+  static const float invk = 0.30004578719350504f;
+
+  return invk * fasterlog2 ((1.0f + x) / (1.0f - x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfasterfc (v4sf x)
+{
+  const v4sf k = v4sfl (3.3509633149424609f);
+  const v4sf a = v4sfl (0.07219054755431126f);
+  const v4sf b = v4sfl (15.418191568719577f);
+  const v4sf c = v4sfl (5.609846028328545f);
+
+  union { v4sf f; v4si i; } vc; vc.f = c * x;
+  vc.i |= v4sil (0x80000000);
+
+  v4sf xsq = x * x;
+  v4sf xquad = xsq * xsq;
+
+  return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
+}
+
+static inline v4sf
+vfastererfc (const v4sf x)
+{
+  const v4sf k = v4sfl (3.3509633149424609f);
+
+  return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
+}
+
+static inline v4sf
+vfasterf (v4sf x)
+{
+  return v4sfl (1.0f) - vfasterfc (x);
+}
+
+static inline v4sf
+vfastererf (const v4sf x)
+{
+  return v4sfl (1.0f) - vfastererfc (x);
+}
+
+static inline v4sf
+vfastinverseerf (v4sf x)
+{
+  const v4sf invk = v4sfl (0.30004578719350504f);
+  const v4sf a = v4sfl (0.020287853348211326f);
+  const v4sf b = v4sfl (0.07236892874789555f);
+  const v4sf c = v4sfl (0.9913030456864257f);
+  const v4sf d = v4sfl (0.8059775923760193f);
+
+  v4sf xsq = x * x;
+
+  return invk *
vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) + + x * (a - b * xsq) / (c - d * xsq); +} + +static inline v4sf +vfasterinverseerf (v4sf x) +{ + const v4sf invk = v4sfl (0.30004578719350504f); + + return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); +} + +#endif //__SSE2__ + +#endif // __FAST_ERF_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
*
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_GAMMA_H_
+#define __FAST_GAMMA_H_
+
+#include <stdint.h>
+
+/* gamma/digamma functions only work for positive inputs */
+
+static inline float
+fastlgamma (float x)
+{
+  float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
+  float xp3 = 3.0f + x;
+
+  return - 2.081061466f
+         - x
+         + 0.0833333f / xp3
+         - logterm
+         + (2.5f + x) * fastlog (xp3);
+}
+
+static inline float
+fasterlgamma (float x)
+{
+  return - 0.0810614667f
+         - x
+         - fasterlog (x)
+         + (0.5f + x) * fasterlog (1.0f + x);
+}
+
+static inline float
+fastdigamma (float x)
+{
+  float twopx = 2.0f + x;
+  float logterm = fastlog (twopx);
+
+  return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
+         (12.0f * x * (1.0f + x) * twopx * twopx) +
+         logterm;
+}
+
+static inline float
+fasterdigamma (float x)
+{
+  float onepx = 1.0f + x;
+
+  return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastlgamma (v4sf x)
+{
+  const v4sf c_1_0 = v4sfl (1.0f);
+  const v4sf c_2_0 = v4sfl (2.0f);
+  const v4sf c_3_0 = v4sfl (3.0f);
+  const v4sf c_2_081061466 = v4sfl (2.081061466f);
+  const v4sf c_0_0833333 = v4sfl (0.0833333f);
+  const v4sf c_2_5 = v4sfl (2.5f);
+
+  v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
+  v4sf xp3 = c_3_0 + x;
+
+  return - c_2_081061466
+         - x
+         + c_0_0833333 / xp3
+         - logterm
+         + (c_2_5 + x) * vfastlog (xp3);
+}
+
+static inline v4sf
+vfasterlgamma (v4sf x)
+{
+  const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
+  const v4sf c_0_5 = v4sfl (0.5f);
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return - c_0_0810614667
+         - x
+         - vfasterlog (x)
+         + (c_0_5 + x) * vfasterlog (c_1 + x);
+}
+
+static inline v4sf
+vfastdigamma (v4sf x)
+{
+  v4sf twopx = v4sfl (2.0f) + x;
+  v4sf logterm = vfastlog (twopx);
+
+  return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
+         (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) +
+         logterm;
+}
+
+static inline v4sf
+vfasterdigamma (v4sf x)
+{
+  const v4sf c_1_0 = v4sfl (1.0f);
+  const v4sf c_2_0 = v4sfl (2.0f);
+  v4sf onepx = c_1_0 + x;
+
+  return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_GAMMA_H_
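+
+// (Editor's illustration; not upstream fastapprox code.)  fastlgamma
+// evaluates a compact Stirling-style fit at x + 3 and undoes the shift via
+// the recurrence lgamma(x) = lgamma(x + 3) - log(x * (1 + x) * (2 + x)),
+// which keeps it usable for small positive x.  Sanity check:
+//
+//   fastlgamma (4.0f);  // ~1.7918f, i.e. log(3!) = log(6)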
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_HYPERBOLIC_H_
+#define __FAST_HYPERBOLIC_H_
+
+#include <stdint.h>
+
+static inline float
+fastsinh (float p)
+{
+  return 0.5f * (fastexp (p) - fastexp (-p));
+}
+
+static inline float
+fastersinh (float p)
+{
+  return 0.5f * (fasterexp (p) - fasterexp (-p));
+}
+
+static inline float
+fastcosh (float p)
+{
+  return 0.5f * (fastexp (p) + fastexp (-p));
+}
+
+static inline float
+fastercosh (float p)
+{
+  return 0.5f * (fasterexp (p) + fasterexp (-p));
+}
+
+static inline float
+fasttanh (float p)
+{
+  return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
+}
+
+static inline float
+fastertanh (float p)
+{
+  return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsinh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfastexp (p) - vfastexp (-p));
+}
+
+static inline v4sf
+vfastersinh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfasterexp (p) - vfasterexp (-p));
+}
+
+static inline v4sf
+vfastcosh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfastexp (p) + vfastexp (-p));
+}
+
+static inline v4sf
+vfastercosh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfasterexp (p) + vfasterexp (-p));
+}
+
+static inline v4sf
+vfasttanh (const v4sf p)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+  const v4sf c_2 = v4sfl (2.0f);
+
+  return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p));
+}
+
+static inline v4sf
+vfastertanh (const v4sf p)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+  const v4sf c_2 = v4sfl (2.0f);
+
+  return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_HYPERBOLIC_H_
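+
+// (Editor's illustration; not upstream fastapprox code.)  The hyperbolic
+// functions are plain exponential identities, so accuracy and range are
+// those of the underlying fastexp/fasterexp, e.g.:
+//
+//   fasttanh (0.5f);  // ~0.4621f, computed as 2 / (1 + e^-1) - 1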
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_LAMBERT_W_H_
+#define __FAST_LAMBERT_W_H_
+
+#include <math.h>
+
+// these functions compute the upper branch aka W_0
+
+static inline float
+fastlambertw (float x)
+{
+  static const float threshold = 2.26445f;
+
+  float c = (x < threshold) ? 1.546865557f : 1.0f;
+  float d = (x < threshold) ? 2.250366841f : 0.0f;
+  float a = (x < threshold) ? -0.737769969f : 0.0f;
+
+  float logterm = fastlog (c * x + d);
+  float loglogterm = fastlog (logterm);
+
+  float minusw = -a - logterm + loglogterm - loglogterm / logterm;
+  float expminusw = fastexp (minusw);
+  float xexpminusw = x * expminusw;
+  float pexpminusw = xexpminusw - minusw;
+
+  return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) /
+         (2.0f + pexpminusw * (2.0f - minusw));
+}
+
+static inline float
+fasterlambertw (float x)
+{
+  static const float threshold = 2.26445f;
+
+  float c = (x < threshold) ? 1.546865557f : 1.0f;
+  float d = (x < threshold) ? 2.250366841f : 0.0f;
+  float a = (x < threshold) ? -0.737769969f : 0.0f;
+
+  float logterm = fasterlog (c * x + d);
+  float loglogterm = fasterlog (logterm);
+
+  float w = a + logterm - loglogterm + loglogterm / logterm;
+  float expw = fasterexp (-w);
+
+  return (w * w + expw * x) / (1.0f + w);
+}
+
+static inline float
+fastlambertwexpx (float x)
+{
+  static const float k = 1.1765631309f;
+  static const float a = 0.94537622168f;
+
+  float logarg = fmaxf (x, k);
+  float powarg = (x < k) ? a * (x - k) : 0;
+
+  float logterm = fastlog (logarg);
+  float powterm = fasterpow2 (powarg); // don't need accuracy here
+
+  float w = powterm * (logarg - logterm + logterm / logarg);
+  float logw = fastlog (w);
+  float p = x - logw;
+
+  return w * (2.0f + p + w * (3.0f + 2.0f * p)) /
+         (2.0f - p + w * (5.0f + 2.0f * w));
+}
+
+static inline float
+fasterlambertwexpx (float x)
+{
+  static const float k = 1.1765631309f;
+  static const float a = 0.94537622168f;
+
+  float logarg = fmaxf (x, k);
+  float powarg = (x < k) ?
a * (x - k) : 0; + + float logterm = fasterlog (logarg); + float powterm = fasterpow2 (powarg); + + float w = powterm * (logarg - logterm + logterm / logarg); + float logw = fasterlog (w); + + return w * (1.0f + x - logw) / (1.0f + w); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfastlog (c * x + d); + v4sf loglogterm = vfastlog (logterm); + + v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; + v4sf expminusw = vfastexp (minusw); + v4sf xexpminusw = x * expminusw; + v4sf pexpminusw = xexpminusw - minusw; + + return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / + (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); +} + +static inline v4sf +vfasterlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfasterlog (c * x + d); + v4sf loglogterm = vfasterlog (logterm); + + v4sf w = a + logterm - loglogterm + loglogterm / logterm; + v4sf expw = vfasterexp (-w); + + return (w * w + expw * x) / (v4sfl (1.0f) + w); +} + +static inline v4sf +vfastlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + const v4sf two = v4sfl (2.0f); + const v4sf three = v4sfl (3.0f); + const v4sf five = v4sfl (5.0f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfastlog (logarg); + v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfastlog (w); + v4sf p = x - logw; + + return w * (two + p + w * (three + two * p)) / + (two - p + w * (five + two * w)); +} + +static inline v4sf +vfasterlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfasterlog (logarg); + v4sf powterm = vfasterpow2 (powarg); + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfasterlog (w); + + return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); +} + +#endif // __SSE2__ + +#endif // __FAST_LAMBERT_W_H_ + +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. 
*
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_POW_H_
+#define __FAST_POW_H_
+
+#include <stdint.h>
+
+static inline float
+fastpow (float x,
+         float p)
+{
+  return fastpow2 (p * fastlog2 (x));
+}
+
+static inline float
+fasterpow (float x,
+           float p)
+{
+  return fasterpow2 (p * fasterlog2 (x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastpow (const v4sf x,
+          const v4sf p)
+{
+  return vfastpow2 (p * vfastlog2 (x));
+}
+
+static inline v4sf
+vfasterpow (const v4sf x,
+            const v4sf p)
+{
+  return vfasterpow2 (p * vfasterlog2 (x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_POW_H_
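+
+// (Editor's illustration; not upstream fastapprox code.)  fastpow composes
+// the two building blocks above as x^p = 2^(p * log2(x)), so it inherits
+// both error terms and, like fastlog2, requires x > 0.  For example:
+//
+//   fastpow (2.0f, 10.0f);  // ~1024.0f
+//   fastpow (0.5f, 0.25f);  // ~0.8409f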
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_SIGMOID_H_
+#define __FAST_SIGMOID_H_
+
+#include <stdint.h>
+
+static inline float
+fastsigmoid (float x)
+{
+  return 1.0f / (1.0f + fastexp (-x));
+}
+
+static inline float
+fastersigmoid (float x)
+{
+  return 1.0f / (1.0f + fasterexp (-x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfastexp (-x));
+}
+
+static inline v4sf
+vfastersigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfasterexp (-x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_SIGMOID_H_
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_TRIG_H_
+#define __FAST_TRIG_H_
+
+#include <stdint.h>
+
+// http://www.devmaster.net/forums/showthread.php?t=5784
+// fast sine variants are for x \in [ -\pi, pi ]
+// fast cosine variants are for x \in [ -\pi, pi ]
+// fast tangent variants are for x \in [ -\pi / 2, pi / 2 ]
+// "full" versions of functions handle the entire range of inputs
+// although the range reduction technique used here will be hopelessly
+// inaccurate for |x| >> 1000
+//
+// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
+// libc calls on older machines (!) and on newer machines are only
+// slightly faster. 
however: +// * vectorized versions are competitive +// * faster full versions are competitive + +static inline float +fastsin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.78444488374548933f; + union { float f; uint32_t i; } p = { 0.20363937680730309f }; + union { float f; uint32_t i; } r = { 0.015124940802184233f }; + union { float f; uint32_t i; } s = { -0.0032225901625579573f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i = vx.i & 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + float qpproxsq = qpprox * qpprox; + + p.i |= sign; + r.i |= sign; + s.i ^= sign; + + return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); +} + +static inline float +fastersin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.77633023248007499f; + union { float f; uint32_t i; } p = { 0.22308510060189463f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i &= 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline float +fastsinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastsin ((half + k) * twopi - x); +} + +static inline float +fastersinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastersin ((half + k) * twopi - x); +} + +static inline float +fastcos (float x) +{ + static const float halfpi = 1.5707963267948966f; + static const float halfpiminustwopi = -4.7123889803846899f; + float offset = (x > halfpi) ? halfpiminustwopi : halfpi; + return fastsin (x + offset); +} + +static inline float +fastercos (float x) +{ + static const float twooverpi = 0.63661977236758134f; + static const float p = 0.54641335845679634f; + + union { float f; uint32_t i; } vx = { x }; + vx.i &= 0x7FFFFFFF; + + float qpprox = 1.0f - twooverpi * vx.f; + + return qpprox + p * qpprox * (1.0f - qpprox * qpprox); +} + +static inline float +fastcosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsinfull (x + halfpi); +} + +static inline float +fastercosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastersinfull (x + halfpi); +} + +static inline float +fasttan (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsin (x) / fastsin (x + halfpi); +} + +static inline float +fastertan (float x) +{ + return fastersin (x) / fastercos (x); +} + +static inline float +fasttanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastsin (xnew) / fastcos (xnew); +} + +static inline float +fastertanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? 
-0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastersin (xnew) / fastercos (xnew); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.78444488374548933f); + const v4sf p = v4sfl (0.20363937680730309f); + const v4sf r = v4sfl (0.015124940802184233f); + const v4sf s = v4sfl (-0.0032225901625579573f); + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + v4sf qpproxsq = qpprox * qpprox; + union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); + vy.i ^= sign; + + return q * qpprox + vy.f; +} + +static inline v4sf +vfastersin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.77633023248007499f); + const v4sf plit = v4sfl (0.22308510060189463f); + union { v4sf f; v4si i; } p = { plit }; + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline v4sf +vfastsinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastersinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastcos (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); + v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); + v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), + _mm_andnot_ps (lthalfpi, halfpi)); + return vfastsin (x + offset); +} + +static inline v4sf +vfastercos (v4sf x) +{ + const v4sf twooverpi = v4sfl (0.63661977236758134f); + const v4sf p = v4sfl (0.54641335845679634); + + v4sf vx = v4sf_fabs (x); + v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; + + return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); +} + +static inline v4sf +vfastcosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsinfull (x + halfpi); +} + +static inline v4sf +vfastercosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastersinfull (x + halfpi); +} + +static inline v4sf +vfasttan (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsin (x) / vfastsin (x + halfpi); +} + +static inline v4sf +vfastertan (const v4sf x) +{ + return vfastersin (x) / vfastercos (x); +} + +static inline v4sf +vfasttanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = 
v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastsin (xnew) / vfastcos (xnew); +} + +static inline v4sf +vfastertanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastersin (xnew) / vfastercos (xnew); +} + +#endif //__SSE2__ + +#endif // __FAST_TRIG_H_ diff --git a/fastapprox/fastpow.h b/fastapprox/fastpow.h new file mode 100644 index 00000000..8f9a3eab --- /dev/null +++ b/fastapprox/fastpow.h @@ -0,0 +1,82 @@ + +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
*
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_POW_H_
+#define __FAST_POW_H_
+
+#include <stdint.h>
+#include "sse.h"
+#include "fastexp.h"
+#include "fastlog.h"
+
+static inline float
+fastpow (float x,
+         float p)
+{
+  return fastpow2 (p * fastlog2 (x));
+}
+
+static inline float
+fasterpow (float x,
+           float p)
+{
+  return fasterpow2 (p * fasterlog2 (x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastpow (const v4sf x,
+          const v4sf p)
+{
+  return vfastpow2 (p * vfastlog2 (x));
+}
+
+static inline v4sf
+vfasterpow (const v4sf x,
+            const v4sf p)
+{
+  return vfasterpow2 (p * vfasterlog2 (x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_POW_H_
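+
+// (Editor's note; not part of the patch.)  This standalone header is the
+// one the libmypaint sources include as "fastapprox/fastpow.h"; unlike the
+// amalgamated copy above it pulls in its own dependencies.  Because fastpow
+// goes through fastlog2, a caller whose base can be exactly zero needs a
+// guard of its own; a hypothetical sketch:
+//
+//   static inline float
+//   safe_fastpow (float x, float p)  /* hypothetical helper */
+//   {
+//     return (x > 0.0f) ? fastpow (x, p) : 0.0f;
+//   }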
diff --git a/fastapprox/fastsigmoid.h b/fastapprox/fastsigmoid.h
new file mode 100644
index 00000000..75fb2899
--- /dev/null
+++ b/fastapprox/fastsigmoid.h
@@ -0,0 +1,80 @@
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_SIGMOID_H_
+#define __FAST_SIGMOID_H_
+
+#include <stdint.h>
+#include "sse.h"
+#include "fastexp.h"
+
+static inline float
+fastsigmoid (float x)
+{
+  return 1.0f / (1.0f + fastexp (-x));
+}
+
+static inline float
+fastersigmoid (float x)
+{
+  return 1.0f / (1.0f + fasterexp (-x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfastexp (-x));
+}
+
+static inline v4sf
+vfastersigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfasterexp (-x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_SIGMOID_H_
diff --git a/fastapprox/fasttrig.h b/fastapprox/fasttrig.h
new file mode 100644
index 00000000..0199270b
--- /dev/null
+++ b/fastapprox/fasttrig.h
@@ -0,0 +1,360 @@
+/*=====================================================================*
+ * Copyright (C) 2011 Paul Mineiro                                     *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ * * Redistributions of source code must retain the                    *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer.                                           *
+ *                                                                     *
+ * * Redistributions in binary form must reproduce the                 *
+ * above copyright notice, this list of conditions and                 *
+ * the following disclaimer in the documentation and/or                *
+ * other materials provided with the distribution.                     *
+ *                                                                     *
+ * * Neither the name of Paul Mineiro nor the names                    *
+ * of other contributors may be used to endorse or promote             *
+ * products derived from this software without specific                *
+ * prior written permission.                                           *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_TRIG_H_
+#define __FAST_TRIG_H_
+
+#include <stdint.h>
+#include "sse.h"
+
+// http://www.devmaster.net/forums/showthread.php?t=5784
+// fast sine variants are for x \in [ -\pi, pi ]
+// fast cosine variants are for x \in [ -\pi, pi ]
+// fast tangent variants are for x \in [ -\pi / 2, pi / 2 ]
+// "full" versions of functions handle the entire range of inputs
+// although the range reduction technique used here will be hopelessly
+// inaccurate for |x| >> 1000
+//
+// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
+// libc calls on older machines (!) and on newer machines are only
+// slightly faster. 
however: +// * vectorized versions are competitive +// * faster full versions are competitive + +static inline float +fastsin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.78444488374548933f; + union { float f; uint32_t i; } p = { 0.20363937680730309f }; + union { float f; uint32_t i; } r = { 0.015124940802184233f }; + union { float f; uint32_t i; } s = { -0.0032225901625579573f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i = vx.i & 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + float qpproxsq = qpprox * qpprox; + + p.i |= sign; + r.i |= sign; + s.i ^= sign; + + return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); +} + +static inline float +fastersin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.77633023248007499f; + union { float f; uint32_t i; } p = { 0.22308510060189463f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i &= 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline float +fastsinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastsin ((half + k) * twopi - x); +} + +static inline float +fastersinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastersin ((half + k) * twopi - x); +} + +static inline float +fastcos (float x) +{ + static const float halfpi = 1.5707963267948966f; + static const float halfpiminustwopi = -4.7123889803846899f; + float offset = (x > halfpi) ? halfpiminustwopi : halfpi; + return fastsin (x + offset); +} + +static inline float +fastercos (float x) +{ + static const float twooverpi = 0.63661977236758134f; + static const float p = 0.54641335845679634f; + + union { float f; uint32_t i; } vx = { x }; + vx.i &= 0x7FFFFFFF; + + float qpprox = 1.0f - twooverpi * vx.f; + + return qpprox + p * qpprox * (1.0f - qpprox * qpprox); +} + +static inline float +fastcosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsinfull (x + halfpi); +} + +static inline float +fastercosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastersinfull (x + halfpi); +} + +static inline float +fasttan (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsin (x) / fastsin (x + halfpi); +} + +static inline float +fastertan (float x) +{ + return fastersin (x) / fastercos (x); +} + +static inline float +fasttanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastsin (xnew) / fastcos (xnew); +} + +static inline float +fastertanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? 
-0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastersin (xnew) / fastercos (xnew); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.78444488374548933f); + const v4sf p = v4sfl (0.20363937680730309f); + const v4sf r = v4sfl (0.015124940802184233f); + const v4sf s = v4sfl (-0.0032225901625579573f); + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + v4sf qpproxsq = qpprox * qpprox; + union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); + vy.i ^= sign; + + return q * qpprox + vy.f; +} + +static inline v4sf +vfastersin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.77633023248007499f); + const v4sf plit = v4sfl (0.22308510060189463f); + union { v4sf f; v4si i; } p = { plit }; + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline v4sf +vfastsinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastersinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastcos (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); + v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); + v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), + _mm_andnot_ps (lthalfpi, halfpi)); + return vfastsin (x + offset); +} + +static inline v4sf +vfastercos (v4sf x) +{ + const v4sf twooverpi = v4sfl (0.63661977236758134f); + const v4sf p = v4sfl (0.54641335845679634); + + v4sf vx = v4sf_fabs (x); + v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; + + return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); +} + +static inline v4sf +vfastcosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsinfull (x + halfpi); +} + +static inline v4sf +vfastercosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastersinfull (x + halfpi); +} + +static inline v4sf +vfasttan (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsin (x) / vfastsin (x + halfpi); +} + +static inline v4sf +vfastertan (const v4sf x) +{ + return vfastersin (x) / vfastercos (x); +} + +static inline v4sf +vfasttanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = 
v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastsin (xnew) / vfastcos (xnew); +} + +static inline v4sf +vfastertanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastersin (xnew) / vfastercos (xnew); +} + +#endif //__SSE2__ + +#endif // __FAST_TRIG_H_ diff --git a/fastapprox/sse.h b/fastapprox/sse.h new file mode 100644 index 00000000..2571afd4 --- /dev/null +++ b/fastapprox/sse.h @@ -0,0 +1,134 @@ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
*
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __SSE_H_
+#define __SSE_H_
+
+#ifdef __SSE2__
+
+#include <emmintrin.h>
+
+#ifdef __cplusplus
+namespace {
+#endif // __cplusplus
+
+typedef __m128 v4sf;
+typedef __m128i v4si;
+
+#define v4si_to_v4sf _mm_cvtepi32_ps
+#define v4sf_to_v4si _mm_cvttps_epi32
+
+#if _MSC_VER && !__INTEL_COMPILER
+  template <class T>
+  __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; }
+
+  #define AS_4CHARS(a) \
+      GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
+      GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)
+
+  #define _MM_SETR_EPI32(a0, a1, a2, a3) \
+      { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }
+
+  #define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
+  #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))
+
+  __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); }
+  __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); }
+  __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); }
+  __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); }
+
+  __forceinline const v4sf operator+(const v4sf& a) { return a; }
+  __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+  __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); }
+  __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); }
+  __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); }
+
+  __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); }
+  __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); }
+  __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); }
+
+  __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; }
+  __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; }
+  __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; }
+  __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; }
+
+  __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; }
+  __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; }
+  __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; }
+#else
+  #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
+  #define v2dil(x) ((const v4si) { (x), (x) })
+  #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x))
+#endif
+
+typedef union { v4sf f; float array[4]; } v4sfindexer;
+#define v4sf_index(_findx, _findi) \
+  ({ \
+    v4sfindexer _findvx = { _findx } ; \
+    _findvx.array[_findi]; \
+  })
+typedef union { v4si i; int array[4]; } v4siindexer;
+#define v4si_index(_iindx, _iindi) \
+  ({ \
+    v4siindexer _iindvx = { _iindx } ; \
+    _iindvx.array[_iindi]; \
+  })
+
+typedef union { v4sf f; v4si i; } v4sfv4sipun;
+#if _MSC_VER && !__INTEL_COMPILER
+  #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
+#else
+  #define v4sf_fabs(x) \
+  ({ \
+    v4sfv4sipun vx; \
+    vx.f = x; \
+    vx.i &= v4sil (0x7FFFFFFF); \
+    vx.f; \
+  })
+#endif
+
+#ifdef __cplusplus
+} // end namespace
+#endif // __cplusplus
+
+#endif // __SSE2__
+
+#endif // __SSE_H_
diff --git 
a/helpers.c b/helpers.c index 35f210aa..e917f807 100644 --- a/helpers.c +++ b/helpers.c @@ -22,9 +22,43 @@ #include #include #include +#include "fastapprox/fastpow.h" #include "helpers.h" +/*const float T_MATRIX[3][36] = {{0.000578913,0.001952085,0.009886235,0.032720398,0.100474668,0.183366464,0.233267126,0.172815304,0.021160832,-0.170961409,-0.358555623,-0.487793958,-0.674399544,-0.886748322,-0.97045709,-0.872696304,-0.559560624,-0.134497482,0.395369748,0.969077244,1.563646415,1.918490755,2.226446938,2.219830783,1.916051812,1.395620385,0.990444867,0.604042138,0.353697296,0.192706913,0.098266461,0.042521122,0.021860797,0.011569942,0.004800182,0.002704537},*/ +/*{-0.000491981,-0.00166858,-0.008527451,-0.028611512,-0.089589024,-0.169698855,-0.232545306,-0.211643919,-0.117700145,0.039996723,0.233957719,0.411776827,0.669587627,1.014305033,1.33449208,1.570104952,1.575060777,1.504833712,1.290156767,1.008658851,0.712494742,0.377174433,0.138783274,-0.025203917,-0.099437546,-0.104503807,-0.088552175,-0.059244144,-0.036402168,-0.020300987,-0.010518378,-0.004600355,-0.002372843,-0.001255839,-0.000521027,-0.000293559},*/ +/*{0.00344444,0.011708103,0.059997877,0.202735833,0.644403983,1.282371701,1.953710066,2.206582549,2.085203252,1.555923679,0.970316553,0.491245277,0.242873415,0.070279007,-0.061461896,-0.131571399,-0.164055717,-0.176617196,-0.165922717,-0.144226781,-0.119603243,-0.085357673,-0.061960232,-0.041673571,-0.026318559,-0.01522674,-0.009013144,-0.004849822,-0.002624901,-0.001371407,-0.00067843,-0.000287423,-0.000146799,-7.76941E-05,-3.2234E-05,-1.81614E-05}};*/ + +/*const float spectral_r[36] = {0.001476566,0.001476571,0.001476697,0.001477428,0.001480081,0.00148896,0.001510625,0.001553174,0.001622652,0.001723776,0.001858242,0.002028291,0.002237804,0.002500589,0.002843823,0.003312684,0.003983853,0.004975295,0.0064965,0.008912057,0.012881757,0.019616497,0.031083884,0.050157206,0.079379607,0.117964522,0.16013148,0.196518985,0.222295104,0.237687584,0.245789155,0.249669549,0.251605188,0.252511963,0.252870747,0.252999473};*/ + +/*const float spectral_g[36] = {0.004457553,0.004459131,0.004461979,0.004471645,0.004506172,0.004629205,0.004938512,0.005602633,0.006868923,0.009209474,0.013450892,0.021153289,0.034963807,0.059025032,0.094985053,0.129917789,0.136598211,0.112473456,0.080301564,0.055184078,0.038887514,0.028984983,0.022965813,0.019364324,0.01723558,0.016001357,0.015318335,0.014937495,0.014732014,0.014630031,0.014587647,0.014573872,0.014574704,0.014576967,0.014575977,0.014579698};*/ + +/*const float spectral_b[36] = {0.089623258,0.08963333,0.089677437,0.089887675,0.090595886,0.092619509,0.096261517,0.099409964,0.0953724,0.077775801,0.053270305,0.03267985,0.019295409,0.011424407,0.006963547,0.004447989,0.003004361,0.002142226,0.001608547,0.001266496,0.001041612,0.00089261,0.000792505,0.000726769,0.000685047,0.000659708,0.000644691,0.000636345,0.000631858,0.000629574,0.000628471,0.000627966,0.000627715,0.000627593,0.000627548,0.000627528};*/ + +static const float T_MATRIX_SMALL[3][10] = {{0.026595621243689,0.049779426257903,0.022449850859496,-0.218453689278271 +,-0.256894883201278,0.445881722194840,0.772365886289756,0.194498761382537 +,0.014038157587820,0.007687264480513} +,{-0.032601672674412,-0.061021043498478,-0.052490001018404 +,0.206659098273522,0.572496335158169,0.317837248815438,-0.021216624031211 +,-0.019387668756117,-0.001521339050858,-0.000835181622534} +,{0.339475473216284,0.635401374177222,0.771520797089589,0.113222640692379 +,-0.055251113343776,-0.048222578468680,-0.012966666339586 
+,-0.001523814504223,-0.000094718948810,-0.000051604594741}};
+
+static const float spectral_r_small[10] = {0.009281362787953,0.009732627042016,0.011254252737167,0.015105578649573
+,0.024797924177217,0.083622585502406,0.977865045723212,1.000000000000000
+,0.999961046144372,0.999999992756822};
+
+static const float spectral_g_small[10] = {0.002854127435775,0.003917589679914,0.012132151699187,0.748259205918013
+,1.000000000000000,0.865695937531795,0.037477469241101,0.022816789725717
+,0.021747419446456,0.021384940572308};
+
+static const float spectral_b_small[10] = {0.537052150373386,0.546646402401469,0.575501819073983,0.258778829633924
+,0.041709923751716,0.012662638828324,0.007485593127390,0.006766900622462
+,0.006699764779016,0.006676219883241};
+
+
 float rand_gauss (RngDouble * rng)
 {
   double sum = 0.0;
@@ -35,6 +69,25 @@ float rand_gauss (RngDouble * rng)
   return sum * 1.73205080757 - 3.46410161514;
 }

+// C's fmodf is not "arithmetic modulo": it does not handle negative
+// dividends the way you might expect. If you want 0 or a positive
+// result when dealing with negatives, use this function instead.
+float mod_arith(float a, float N)
+{
+  float ret = a - N * floor (a / N);
+  return ret;
+}
+
+// Returns the smallest angular difference between two angles, in degrees
+float smallest_angular_difference(float angleA, float angleB)
+{
+  float a;
+  a = angleB - angleA;
+  a = mod_arith((a + 180), 360) - 180;
+  a += (a>180) ? -360 : (a<-180) ? 360 : 0;
+  return a;
+}
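// Illustrative sketch, not part of the patch: the demo function below is
// invented for illustration. fmodf keeps the sign of its dividend, while
// mod_arith always wraps into [0, N), which is what the angle handling
// in mypaint-brush.c relies on.

#include <math.h>
#include <stdio.h>

extern float mod_arith(float a, float N);
extern float smallest_angular_difference(float angleA, float angleB);

static void mod_arith_demo(void)
{
  printf("%.1f\n", fmodf(-30.0f, 360.0f));                      // -30.0
  printf("%.1f\n", mod_arith(-30.0f, 360.0f));                  // 330.0, i.e. -30 - 360*floor(-30/360)
  printf("%.1f\n", smallest_angular_difference(350.0f, 10.0f)); //  20.0, not -340.0
}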
 // stolen from GIMP (gimpcolorspace.c)
 // (from gimp_rgb_to_hsv)
 void
@@ -313,4 +366,236 @@ hsl_to_rgb_float (float *h_, float *s_, float *l_)
   *l_ = b;
 }

+void
+rgb_to_hcy_float (float *r_, float *g_, float *b_) {
+
+  // Luma weights, Rec. 709 (as used in brushmodes.c)
+  float _HCY_RED_LUMA = 0.2126;
+  float _HCY_GREEN_LUMA = 0.7152;
+  float _HCY_BLUE_LUMA = 0.0722;
+  float h, c, y;
+  float r, g, b;
+  float p, n, d;
+
+  r = *r_;
+  g = *g_;
+  b = *b_;
+
+  // Luma is just a weighted sum of the three components.
+  y = _HCY_RED_LUMA*r + _HCY_GREEN_LUMA*g + _HCY_BLUE_LUMA*b;
+
+  // Hue. First pick a sector based on the greatest RGB component, then add
+  // the scaled difference of the other two RGB components.
+  p = MAX3(r, g, b);
+  n = MIN3(r, g, b);
+  d = p - n; // An absolute measure of chroma: only used for scaling
+
+  if (n == p){
+    h = 0.0;
+  } else if (p == r){
+    h = (g - b)/d;
+    if (h < 0){
+      h += 6.0;
+    }
+  } else if (p == g){
+    h = ((b - r)/d) + 2.0;
+  } else { // p==b
+    h = ((r - g)/d) + 4.0;
+  }
+  h /= 6.0;
+  h = fmod(h,1.0);
+
+  // Chroma, relative to the RGB gamut envelope.
+  if ((r == g) && (g == b)){
+    // Avoid a division by zero for the achromatic case.
+    c = 0.0;
+  } else {
+    // For the derivation, see the GLHS paper.
+    c = MAX((y-n)/y, (p-y)/(1-y));
+  }
+
+  *r_ = h;
+  *g_ = c;
+  *b_ = y;
+}
+
+void
+hcy_to_rgb_float (float *h_, float *c_, float *y_) {
+
+  // Luma weights, Rec. 709 (must match rgb_to_hcy_float above)
+  float _HCY_RED_LUMA = 0.2126;
+  float _HCY_GREEN_LUMA = 0.7152;
+  float _HCY_BLUE_LUMA = 0.0722;
+  float h, c, y;
+  float r, g, b;
+  float th, tm;
+
+  h = *h_;
+  c = *c_;
+  y = *y_;
+
+  h = h - floor(h);
+  c = CLAMP(c, 0.0f, 1.0f);
+  y = CLAMP(y, 0.0f, 1.0f);
+
+  if (c == 0) {
+    /* achromatic case */
+    r = y;
+    g = y;
+    b = y;
+  }
+
+  h = fmod(h, 1.0);
+  h *= 6.0;
+
+  if (h < 1){
+    // implies (p==r and h==(g-b)/d and g>=b)
+    th = h;
+    tm = _HCY_RED_LUMA + _HCY_GREEN_LUMA * th;
+  } else if (h < 2) {
+    // implies (p==g and h==((b-r)/d)+2.0 and b<r)
+    th = 2.0 - h;
+    tm = _HCY_GREEN_LUMA + _HCY_RED_LUMA * th;
+  } else if (h < 3) {
+    // implies (p==g and h==((b-r)/d)+2.0 and b>=r)
+    th = h - 2.0;
+    tm = _HCY_GREEN_LUMA + _HCY_BLUE_LUMA * th;
+  } else if (h < 4) {
+    // implies (p==b and h==((r-g)/d)+4.0 and r<g)
+    th = 4.0 - h;
+    tm = _HCY_BLUE_LUMA + _HCY_GREEN_LUMA * th;
+  } else if (h < 5) {
+    // implies (p==b and h==((r-g)/d)+4.0 and r>=g)
+    th = h - 4.0;
+    tm = _HCY_BLUE_LUMA + _HCY_RED_LUMA * th;
+  } else {
+    // implies (p==r and h==(g-b)/d and g<b)
+    th = 6.0 - h;
+    tm = _HCY_RED_LUMA + _HCY_BLUE_LUMA * th;
+  }
+
+  // Calculate the maximum (p), midpoint (o) and minimum (n) components
+  float p, o, n;
+  if (tm >= y){
+    p = y + y*c*(1-tm)/tm;
+    o = y + y*c*(th-tm)/tm;
+    n = y - (y*c);
+  }else{
+    p = y + (1-y)*c;
+    o = y + (1-y)*c*(th-tm)/(1-tm);
+    n = y - (1-y)*c*tm/(1-tm);
+  }
+
+  // Back to RGB order
+  if (h < 1){
+    r = p;
+    g = o;
+    b = n;
+  } else if (h < 2){
+    r = o;
+    g = p;
+    b = n;
+  } else if (h < 3){
+    r = n;
+    g = p;
+    b = o;
+  } else if (h < 4){
+    r = n;
+    g = o;
+    b = p;
+  } else if (h < 5){
+    r = o;
+    g = n;
+    b = p;
+  }else{
+    r = p;
+    g = n;
+    b = o;
+  }
+
+  *h_ = r;
+  *c_ = g;
+  *y_ = b;
+}
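// Illustrative sketch, not part of the patch (the function name is
// invented): both conversions work in place through their pointer
// arguments, so a round trip looks like this.

void hcy_adjust_luma_example(void)
{
  float h = 0.8f, c = 0.5f, y = 0.3f; // interpreted as r, g, b on entry
  rgb_to_hcy_float(&h, &c, &y);       // variables now hold hue, chroma, luma
  y = CLAMP(y + 0.1f, 0.0f, 1.0f);    // brighten without shifting the hue
  hcy_to_rgb_float(&h, &c, &y);       // variables hold r, g, b again
}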
+
+void
+rgb_to_spectral (float r, float g, float b, float *spectral_) {
+  float offset = 1.0 - WGM_EPSILON;
+  r = r * offset + WGM_EPSILON;
+  g = g * offset + WGM_EPSILON;
+  b = b * offset + WGM_EPSILON;
+  //upsample rgb to spectral primaries
+  float spec_r[10] = {0};
+  for (int i=0; i < 10; i++) {
+    spec_r[i] = spectral_r_small[i] * r;
+  }
+  float spec_g[10] = {0};
+  for (int i=0; i < 10; i++) {
+    spec_g[i] = spectral_g_small[i] * g;
+  }
+  float spec_b[10] = {0};
+  for (int i=0; i < 10; i++) {
+    spec_b[i] = spectral_b_small[i] * b;
+  }
+  //collapse into one spd
+  for (int i=0; i<10; i++) {
+    spectral_[i] += spec_r[i] + spec_g[i] + spec_b[i];
+  }
+}
+
+void
+spectral_to_rgb (float *spectral, float *rgb_) {
+  float offset = 1.0 - WGM_EPSILON;
+  for (int i=0; i<10; i++) {
+    rgb_[0] += T_MATRIX_SMALL[0][i] * spectral[i];
+    rgb_[1] += T_MATRIX_SMALL[1][i] * spectral[i];
+    rgb_[2] += T_MATRIX_SMALL[2][i] * spectral[i];
+  }
+  for (int i=0; i<3; i++) {
+    rgb_[i] = CLAMP((rgb_[i] - WGM_EPSILON) / offset, 0.0f, 1.0f);
+  }
+}
+
+//function to make it easy to blend two spectral colors via weighted geometric mean
+//a is the current smudge state, b is the get_color or brush color
+float * mix_colors(float *a, float *b, float fac, float paint_mode)
+{
+  static float result[4] = {0};
+
+  float opa_a = fac;
+  float opa_b = 1.0-opa_a;
+  result[3] = CLAMP(opa_a * a[3] + opa_b * b[3], 0.0f, 1.0f);
+
+  if (paint_mode > 0.0) {
+    float spec_a[10] = {0};
+    float spec_b[10] = {0};
+
+    rgb_to_spectral(a[0], a[1], a[2], spec_a);
+    rgb_to_spectral(b[0], b[1], b[2], spec_b);
+
+    //blend spectral primaries subtractive WGM
+    float spectralmix[10] = {0};
+    for (int i=0; i < 10; i++) {
+      spectralmix[i] = fastpow(spec_a[i], opa_a) * fastpow(spec_b[i], opa_b);
+    }
+
+    //convert to RGB
+    float rgb_result[3] = {0};
+    spectral_to_rgb(spectralmix, rgb_result);
+
+    for (int i=0; i < 3; i++) {
+      result[i] = rgb_result[i];
+    }
+  }
+
+  if (paint_mode < 1.0) {
+    for (int i=0; i<3; i++) {
+      result[i] = result[i] * paint_mode + (1-paint_mode) * (a[i] * opa_a + b[i] * opa_b);
+    }
+  }
+
+  return result;
+}
+
 #endif //HELPERS_C
diff --git a/helpers.h b/helpers.h
index 2ca5493c..0e6f68c8 100644
--- a/helpers.h
+++ b/helpers.h
@@ -1,6 +1,7 @@
 #ifndef HELPERS_H
 #define HELPERS_H

+#include
 #include "rng-double.h"

 #define MAX(a, b) (((a) > (b)) ? (a) : (b))
@@ -11,6 +12,7 @@
 #define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
 #define MAX3(a, b, c) ((a)>(b)?MAX((a),(c)):MAX((b),(c)))
 #define MIN3(a, b, c) ((a)<(b)?MIN((a),(c)):MIN((b),(c)))
+#define WGM_EPSILON 0.001

 void
 hsl_to_rgb_float (float *h_, float *s_, float *l_);
@@ -23,6 +25,24 @@ hsv_to_rgb_float (float *h_, float *s_, float *v_);

 void
 rgb_to_hsv_float (float *r_ /*h*/, float *g_ /*s*/, float *b_ /*v*/);

+void
+hcy_to_rgb_float (float *h_, float *c_, float *y_);
+
+void
+rgb_to_hcy_float (float *r_, float *g_, float *b_);
+
 float rand_gauss (RngDouble * rng);

+float mod_arith(float a, float N);
+
+float smallest_angular_difference(float angleA, float angleB);
+
+float * mix_colors(float *a, float *b, float fac, float paint_mode);
+
+void
+rgb_to_spectral (float r, float g, float b, float *spectral_);
+
+void
+spectral_to_rgb (float *spectral, float *rgb_);
+
 #endif // HELPERS_H
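// Illustrative sketch, not part of the patch (the function name and the
// sample colors are invented). With paint_mode at 1.0 the blend is a pure
// weighted geometric mean in the 10-band spectral space, which behaves
// subtractively like pigment: blue and yellow meet in a greenish hue
// rather than the grey of a linear RGB average. Note that mix_colors
// returns a pointer to a static buffer, so the result must be copied
// before the next call, and the function is not reentrant.

void wgm_mix_example(void)
{
  float blue[4]   = {0.1f, 0.1f, 0.8f, 1.0f};
  float yellow[4] = {0.8f, 0.8f, 0.1f, 1.0f};
  float mixed[4];

  // 50/50 mix, full spectral mode; paint_mode 0.0 would give the plain
  // per-channel weighted average instead
  float *result = mix_colors(blue, yellow, 0.5f, 1.0f);
  for (int i = 0; i < 4; i++) mixed[i] = result[i]; // copy out of the static buffer
  (void)mixed;
}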
diff --git a/mypaint-brush.c b/mypaint-brush.c
index 7a1380c7..d7500684 100644
--- a/mypaint-brush.c
+++ b/mypaint-brush.c
@@ -21,6 +21,8 @@
 #include
 #include
 #include
+#include "fastapprox/fastpow.h"
+#include "fastapprox/fastexp.h"

 #if MYPAINT_CONFIG_USE_GLIB
 #include <glib.h>
@@ -53,6 +55,9 @@
 #define ACTUAL_RADIUS_MIN 0.2
 #define ACTUAL_RADIUS_MAX 1000 // safety guard against radius like 1e20 and against rendering overload with unexpected brush dynamics

+//array of smudge states, which allows much more variety and "memory" per brush
+float smudge_buckets[256][9] = {{0.0f}};
+
 /* The Brush class stores two things:
    b) settings: constant during a stroke (eg. size, spacing, dynamics, color selected by the user)
    a) states: modified during a stroke (eg. speed, smudge colors, time/distance to next dab, position filter states)
@@ -370,29 +375,6 @@ mypaint_brush_set_state(MyPaintBrush *self, MyPaintBrushState i, float value)
   self->states[i] = value;
 }

-
-// C fmodf function is not "arithmetic modulo"; it doesn't handle negative dividends as you might expect
-// if you expect 0 or a positive number when dealing with negatives, use
-// this function instead.
-static inline float mod(float a, float N)
-{
-  float ret = a - N * floor (a / N);
-  return ret;
-}
-
-
-// Returns the smallest angular difference
-static inline float
-smallest_angular_difference(float angleA, float angleB)
-{
-  float a;
-  a = angleB - angleA;
-  a = mod((a + 180), 360) - 180;
-  a += (a>180) ? -360 : (a<-180) ? 360 : 0;
-  //printf("%f.1 to %f.1 = %f.1 \n", angleB, angleA, a);
-  return a;
-}
-
 // returns the fraction still left after t seconds
 float exp_decay (float T_const, float t)
 {
@@ -402,7 +384,7 @@ smallest_angular_difference(float angleA, float angleB)
   }

   const float arg = -t / T_const;
-  return expf(arg);
+  return fastexp(arg);
 }
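// Illustrative sketch, not part of the patch: the patch swaps expf() for
// fastapprox's fastexp() in per-dab code. fastexp trades a small amount
// of accuracy for speed, which is acceptable for perceptual quantities
// like radii and decay factors. A quick harness (invented here) to
// inspect the relative error over the range these settings actually use:

#include <math.h>
#include <stdio.h>
#include "fastapprox/fastexp.h"

static void fastexp_error_demo(void)
{
  for (float x = -5.0f; x <= 5.0f; x += 1.0f) {
    float exact  = expf(x);
    float approx = fastexp(x);
    printf("x=%5.1f expf=%12.6f fastexp=%12.6f rel=%.2e\n",
           x, exact, approx, fabsf(approx - exact) / exact);
  }
}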
@@ -426,7 +408,7 @@ smallest_angular_difference(float angleA, float angleB)
   for (i=0; i<2; i++) {
     float gamma;
     gamma = mypaint_mapping_get_base_value(self->settings[(i==0)?MYPAINT_BRUSH_SETTING_SPEED1_GAMMA:MYPAINT_BRUSH_SETTING_SPEED2_GAMMA]);
-    gamma = expf(gamma);
+    gamma = fastexp(gamma);

     float fix1_x, fix1_y, fix2_x, fix2_dy;
     fix1_x = 45.0;
@@ -474,6 +456,11 @@ smallest_angular_difference(float angleA, float angleB)
   self->states[MYPAINT_BRUSH_STATE_X] += step_dx;
   self->states[MYPAINT_BRUSH_STATE_Y] += step_dy;
   self->states[MYPAINT_BRUSH_STATE_PRESSURE] += step_dpressure;
+
+  self->states[MYPAINT_BRUSH_STATE_DABS_PER_BASIC_RADIUS] = self->settings_value[MYPAINT_BRUSH_SETTING_DABS_PER_BASIC_RADIUS];
+  self->states[MYPAINT_BRUSH_STATE_DABS_PER_ACTUAL_RADIUS] = self->settings_value[MYPAINT_BRUSH_SETTING_DABS_PER_ACTUAL_RADIUS];
+  self->states[MYPAINT_BRUSH_STATE_DABS_PER_SECOND] = self->settings_value[MYPAINT_BRUSH_SETTING_DABS_PER_SECOND];
+
   self->states[MYPAINT_BRUSH_STATE_DECLINATION] += step_declination;
   self->states[MYPAINT_BRUSH_STATE_DECLINATIONX] += step_declinationx;
@@ -481,12 +468,12 @@ smallest_angular_difference(float angleA, float angleB)
   self->states[MYPAINT_BRUSH_STATE_ASCENSION] += step_ascension;
   self->states[MYPAINT_BRUSH_STATE_VIEWZOOM] = step_viewzoom;
-  self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] = mod((step_viewrotation * 180.0 / M_PI) + 180.0, 360.0) -180.0;
-  gridmap_scale = expf(self->settings_value[MYPAINT_BRUSH_SETTING_GRIDMAP_SCALE]);
+  self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] = mod_arith((step_viewrotation * 180.0 / M_PI) + 180.0, 360.0) -180.0;
+  gridmap_scale = fastexp(self->settings_value[MYPAINT_BRUSH_SETTING_GRIDMAP_SCALE]);
   gridmap_scale_x = self->settings_value[MYPAINT_BRUSH_SETTING_GRIDMAP_SCALE_X];
   gridmap_scale_y = self->settings_value[MYPAINT_BRUSH_SETTING_GRIDMAP_SCALE_Y];
-  self->states[MYPAINT_BRUSH_STATE_GRIDMAP_X] = mod(fabsf(self->states[MYPAINT_BRUSH_STATE_ACTUAL_X] * gridmap_scale_x), (gridmap_scale * 256.0)) / (gridmap_scale * 256.0) * 256.0;
-  self->states[MYPAINT_BRUSH_STATE_GRIDMAP_Y] = mod(fabsf(self->states[MYPAINT_BRUSH_STATE_ACTUAL_Y] * gridmap_scale_y), (gridmap_scale * 256.0)) / (gridmap_scale * 256.0) * 256.0;
+  self->states[MYPAINT_BRUSH_STATE_GRIDMAP_X] = mod_arith(fabsf(self->states[MYPAINT_BRUSH_STATE_ACTUAL_X] * gridmap_scale_x), (gridmap_scale * 256.0)) / (gridmap_scale * 256.0) * 256.0;
+  self->states[MYPAINT_BRUSH_STATE_GRIDMAP_Y] = mod_arith(fabsf(self->states[MYPAINT_BRUSH_STATE_ACTUAL_Y] * gridmap_scale_y), (gridmap_scale * 256.0)) / (gridmap_scale * 256.0) * 256.0;

   if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_X] < 0.0) {
     self->states[MYPAINT_BRUSH_STATE_GRIDMAP_X] = 256.0 - self->states[MYPAINT_BRUSH_STATE_GRIDMAP_X];
@@ -496,7 +483,7 @@ smallest_angular_difference(float angleA, float angleB)
     self->states[MYPAINT_BRUSH_STATE_GRIDMAP_Y] = 256.0 - self->states[MYPAINT_BRUSH_STATE_GRIDMAP_Y];
   }

-  float base_radius = expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC]));
+  float base_radius = fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC]));

   //first iteration is zero, set to 1, then flip to -1, back and forth
   //useful for Anti-Art's mirrored offset but could be useful elsewhere
@@ -538,25 +525,26 @@
   //norm_dist should relate to brush size, whereas norm_speed should not
   norm_dist = hypotf(step_dx / step_dtime / base_radius, step_dy / step_dtime / base_radius) * step_dtime;

-  inputs[MYPAINT_BRUSH_INPUT_PRESSURE] = pressure * expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_PRESSURE_GAIN_LOG]));
+  inputs[MYPAINT_BRUSH_INPUT_PRESSURE] = pressure * fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_PRESSURE_GAIN_LOG]));
   inputs[MYPAINT_BRUSH_INPUT_SPEED1] = CLAMP(log(self->speed_mapping_gamma[0] + self->states[MYPAINT_BRUSH_STATE_NORM_SPEED1_SLOW])*self->speed_mapping_m[0] + self->speed_mapping_q[0], 0.0, 4.0);
   inputs[MYPAINT_BRUSH_INPUT_SPEED2] = CLAMP(log(self->speed_mapping_gamma[1] + self->states[MYPAINT_BRUSH_STATE_NORM_SPEED2_SLOW])*self->speed_mapping_m[1] + self->speed_mapping_q[1], 0.0, 4.0);
   inputs[MYPAINT_BRUSH_INPUT_RANDOM] = self->random_input;
   inputs[MYPAINT_BRUSH_INPUT_STROKE] = MIN(self->states[MYPAINT_BRUSH_STATE_STROKE], 1.0);
   //correct direction for varying view rotation
-  inputs[MYPAINT_BRUSH_INPUT_DIRECTION] = mod(atan2f (self->states[MYPAINT_BRUSH_STATE_DIRECTION_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_DX])/(2*M_PI)*360 + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 180.0, 180.0);
+  inputs[MYPAINT_BRUSH_INPUT_DIRECTION] = mod_arith(atan2f (self->states[MYPAINT_BRUSH_STATE_DIRECTION_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_DX])/(2*M_PI)*360 + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 180.0, 180.0);
   inputs[MYPAINT_BRUSH_INPUT_DIRECTION_ANGLE] = fmodf(atan2f(self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DX]) / (2 * M_PI) * 360 + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 360.0, 360.0) ;
   inputs[MYPAINT_BRUSH_INPUT_TILT_DECLINATION] = self->states[MYPAINT_BRUSH_STATE_DECLINATION];
-  //use custom mod
-  inputs[MYPAINT_BRUSH_INPUT_TILT_ASCENSION] = mod(self->states[MYPAINT_BRUSH_STATE_ASCENSION] + 180.0, 360.0) - 180.0;
+  //correct ascension for varying view rotation, use custom mod
+  inputs[MYPAINT_BRUSH_INPUT_TILT_ASCENSION] = mod_arith(self->states[MYPAINT_BRUSH_STATE_ASCENSION] + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 180.0, 360.0) - 180.0;
   inputs[MYPAINT_BRUSH_INPUT_VIEWZOOM] = (mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])) - logf(base_radius * 1 / self->states[MYPAINT_BRUSH_STATE_VIEWZOOM]);
+  inputs[MYPAINT_BRUSH_INPUT_ATTACK_ANGLE] = smallest_angular_difference(self->states[MYPAINT_BRUSH_STATE_ASCENSION], mod_arith(atan2f(self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DX]) / (2 * M_PI) * 360 + 90, 360));
   inputs[MYPAINT_BRUSH_INPUT_BRUSH_RADIUS] = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC]);
   inputs[MYPAINT_BRUSH_INPUT_GRIDMAP_X] = CLAMP(self->states[MYPAINT_BRUSH_STATE_GRIDMAP_X], 0.0, 256.0);
   inputs[MYPAINT_BRUSH_INPUT_GRIDMAP_Y] = CLAMP(self->states[MYPAINT_BRUSH_STATE_GRIDMAP_Y], 0.0, 256.0);
   inputs[MYPAINT_BRUSH_INPUT_TILT_DECLINATIONX] = self->states[MYPAINT_BRUSH_STATE_DECLINATIONX];
   inputs[MYPAINT_BRUSH_INPUT_TILT_DECLINATIONY] = self->states[MYPAINT_BRUSH_STATE_DECLINATIONY];
-  inputs[MYPAINT_BRUSH_INPUT_ATTACK_ANGLE] = smallest_angular_difference(self->states[MYPAINT_BRUSH_STATE_ASCENSION],
mod(atan2f(self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DX]) / (2 * M_PI) * 360 + 90 + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION], 360)); + inputs[MYPAINT_BRUSH_INPUT_ATTACK_ANGLE] = smallest_angular_difference(self->states[MYPAINT_BRUSH_STATE_ASCENSION], mod_arith(atan2f(self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DY], self->states[MYPAINT_BRUSH_STATE_DIRECTION_ANGLE_DX]) / (2 * M_PI) * 360 + 90 + self->states[MYPAINT_BRUSH_STATE_VIEWROTATION], 360)); inputs[MYPAINT_BRUSH_INPUT_CUSTOM] = self->states[MYPAINT_BRUSH_STATE_CUSTOM_INPUT]; if (self->print_inputs) { @@ -590,7 +578,7 @@ smallest_angular_difference(float angleA, float angleB) // Is it broken, non-smooth, system-dependent math?! // A replacement could be a directed random offset. - float time_constant = expf(self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_BY_SPEED_SLOWNESS]*0.01)-1.0; + float time_constant = fastexp(self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_BY_SPEED_SLOWNESS]*0.01)-1.0; // Workaround for a bug that happens mainly on Windows, causing // individual dabs to be placed far far away. Using the speed // with zero filtering is just asking for trouble anyway. @@ -608,7 +596,7 @@ smallest_angular_difference(float angleA, float angleB) float step_in_dabtime = hypotf(dx, dy); // FIXME: are we recalculating something here that we already have? - float fac = 1.0 - exp_decay (exp(self->settings_value[MYPAINT_BRUSH_SETTING_DIRECTION_FILTER]*0.5)-1.0, step_in_dabtime); + float fac = 1.0 - exp_decay (fastexp(self->settings_value[MYPAINT_BRUSH_SETTING_DIRECTION_FILTER]*0.5)-1.0, step_in_dabtime); float dx_old = self->states[MYPAINT_BRUSH_STATE_DIRECTION_DX]; float dy_old = self->states[MYPAINT_BRUSH_STATE_DIRECTION_DY]; @@ -635,7 +623,7 @@ smallest_angular_difference(float angleA, float angleB) { // stroke length float frequency; float wrap; - frequency = expf(-self->settings_value[MYPAINT_BRUSH_SETTING_STROKE_DURATION_LOGARITHMIC]); + frequency = fastexp(-self->settings_value[MYPAINT_BRUSH_SETTING_STROKE_DURATION_LOGARITHMIC]); self->states[MYPAINT_BRUSH_STATE_STROKE] += norm_dist * frequency; // can happen, probably caused by rounding if (self->states[MYPAINT_BRUSH_STATE_STROKE] < 0) self->states[MYPAINT_BRUSH_STATE_STROKE] = 0; @@ -655,14 +643,14 @@ smallest_angular_difference(float angleA, float angleB) // calculate final radius float radius_log; radius_log = self->settings_value[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC]; - self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = expf(radius_log); + self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = fastexp(radius_log); if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] < ACTUAL_RADIUS_MIN) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = ACTUAL_RADIUS_MIN; if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] > ACTUAL_RADIUS_MAX) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = ACTUAL_RADIUS_MAX; // aspect ratio (needs to be calculated here because it can affect the dab spacing) self->states[MYPAINT_BRUSH_STATE_ACTUAL_ELLIPTICAL_DAB_RATIO] = self->settings_value[MYPAINT_BRUSH_SETTING_ELLIPTICAL_DAB_RATIO]; //correct dab angle for view rotation - self->states[MYPAINT_BRUSH_STATE_ACTUAL_ELLIPTICAL_DAB_ANGLE] = mod(self->settings_value[MYPAINT_BRUSH_SETTING_ELLIPTICAL_DAB_ANGLE] - self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 180.0, 180.0) - 180.0; + self->states[MYPAINT_BRUSH_STATE_ACTUAL_ELLIPTICAL_DAB_ANGLE] = mod_arith(self->settings_value[MYPAINT_BRUSH_SETTING_ELLIPTICAL_DAB_ANGLE] - 
self->states[MYPAINT_BRUSH_STATE_VIEWROTATION] + 180.0, 180.0) - 180.0; } // Called only from stroke_to(). Calculate everything needed to @@ -687,8 +675,8 @@ smallest_angular_difference(float angleA, float angleB) // dabs_per_pixel is just estimated roughly, I didn't think hard // about the case when the radius changes during the stroke dabs_per_pixel = ( - mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_DABS_PER_ACTUAL_RADIUS]) + - mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_DABS_PER_BASIC_RADIUS]) + self->states[MYPAINT_BRUSH_STATE_DABS_PER_ACTUAL_RADIUS] + + self->states[MYPAINT_BRUSH_STATE_DABS_PER_BASIC_RADIUS] ) * 2.0; // the correction is probably not wanted if the dabs don't overlap @@ -702,7 +690,7 @@ smallest_angular_difference(float angleA, float angleB) // <==> beta_dab = beta^(1/dabs_per_pixel) alpha = opaque; beta = 1.0-alpha; - beta_dab = powf(beta, 1.0/dabs_per_pixel); + beta_dab = fastpow(beta, 1.0/dabs_per_pixel); alpha_dab = 1.0-beta_dab; opaque = alpha_dab; } @@ -710,8 +698,8 @@ smallest_angular_difference(float angleA, float angleB) x = self->states[MYPAINT_BRUSH_STATE_ACTUAL_X]; y = self->states[MYPAINT_BRUSH_STATE_ACTUAL_Y]; - float base_radius = expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); - float offset_mult = expf(self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_MULTIPLIER]); + float base_radius = fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); + float offset_mult = fastexp(self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_MULTIPLIER]); if (self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_X]) { x += self->settings_value[MYPAINT_BRUSH_SETTING_OFFSET_X] * base_radius * offset_mult; @@ -782,14 +770,13 @@ smallest_angular_difference(float angleA, float angleB) y += rand_gauss (self->rng) * amp * base_radius; } - radius = self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS]; if (self->settings_value[MYPAINT_BRUSH_SETTING_RADIUS_BY_RANDOM]) { float radius_log, alpha_correction; // go back to logarithmic radius to add the noise radius_log = self->settings_value[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC]; radius_log += rand_gauss (self->rng) * self->settings_value[MYPAINT_BRUSH_SETTING_RADIUS_BY_RANDOM]; - radius = expf(radius_log); + radius = fastexp(radius_log); radius = CLAMP(radius, ACTUAL_RADIUS_MIN, ACTUAL_RADIUS_MAX); alpha_correction = self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] / radius; alpha_correction = SQR(alpha_correction); @@ -798,10 +785,17 @@ smallest_angular_difference(float angleA, float angleB) } } + //convert to RGB here instead of later + // color part + float color_h = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_H]); + float color_s = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_S]); + float color_v = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_V]); + hsv_to_rgb_float (&color_h, &color_s, &color_v); // update smudge color if (self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_LENGTH] < 1.0 && - // optimization, since normal brushes have smudge_length == 0.5 without actually smudging - (self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE] != 0.0 || !mypaint_mapping_is_constant(self->settings[MYPAINT_BRUSH_SETTING_SMUDGE]))) { + // optimization, since normal brushes have smudge_length == 0.5 without actually smudging + (self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE] != 0.0 || + 
!mypaint_mapping_is_constant(self->settings[MYPAINT_BRUSH_SETTING_SMUDGE]))) { float fac = self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_LENGTH]; if (fac < 0.01) fac = 0.01; @@ -809,72 +803,143 @@ smallest_angular_difference(float angleA, float angleB) px = ROUND(x); py = ROUND(y); + //determine which smudge bucket to use and update + int bucket = CLAMP(roundf(self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_BUCKET]), 0, 255); + // Calling get_color() is almost as expensive as rendering a // dab. Because of this we use the previous value if it is not // expected to hurt quality too much. We call it at most every // second dab. float r, g, b, a; - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_RECENTNESS] *= fac; - if (self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_RECENTNESS] < 0.5*fac) { - if (self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_RECENTNESS] == 0.0) { + r = color_h; + g = color_s; + b = color_v; + a = 1.0; + + smudge_buckets[bucket][8] *= fac; + if (smudge_buckets[bucket][8] < (fastpow(0.5*fac, self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_LENGTH_LOG])) + 0.0000000000000001) { + if (smudge_buckets[bucket][8] == 0.0) { // first initialization of smudge color fac = 0.0; + //init with brush color instead of black + smudge_buckets[bucket][4] = color_h; + smudge_buckets[bucket][5] = color_s; + smudge_buckets[bucket][6] = color_v; + smudge_buckets[bucket][7] = opaque; } - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_RECENTNESS] = 1.0; + smudge_buckets[bucket][8] = 1.0; - float smudge_radius = radius * expf(self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_RADIUS_LOG]); + float smudge_radius = radius * fasterexp(self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_RADIUS_LOG]); smudge_radius = CLAMP(smudge_radius, ACTUAL_RADIUS_MIN, ACTUAL_RADIUS_MAX); + mypaint_surface_get_color(surface, px, py, smudge_radius, &r, &g, &b, &a); - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_R] = r; - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_G] = g; - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_B] = b; - self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_A] = a; - } else { - r = self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_R]; - g = self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_G]; - b = self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_B]; - a = self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_A]; - } - // updated the smudge color (stored with premultiplied alpha) - self->states[MYPAINT_BRUSH_STATE_SMUDGE_A ] = fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_A ] + (1-fac)*a; - // fix rounding errors - self->states[MYPAINT_BRUSH_STATE_SMUDGE_A ] = CLAMP(self->states[MYPAINT_BRUSH_STATE_SMUDGE_A], 0.0, 1.0); + //don't draw unless the picked-up alpha is above a certain level + //this is sort of like lock_alpha but for smudge + //negative values reverse this idea + if ((self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_TRANSPARENCY] > 0.0 && + a < self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_TRANSPARENCY]) || + (self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_TRANSPARENCY] < 0.0 && + a > self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_TRANSPARENCY] * -1)) { + return FALSE; + } + + // avoid false colors from extremely small alpha (noise) + if (a < WGM_EPSILON) { + r = color_h; + g = color_s; + b = color_v; + a = smudge_buckets[bucket][7]; + fac = 0.0; + } - self->states[MYPAINT_BRUSH_STATE_SMUDGE_RA] = fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_RA] + (1-fac)*r*a; - self->states[MYPAINT_BRUSH_STATE_SMUDGE_GA] = fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_GA] + (1-fac)*g*a; - 
self->states[MYPAINT_BRUSH_STATE_SMUDGE_BA] = fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_BA] + (1-fac)*b*a; + smudge_buckets[bucket][4] = r; + smudge_buckets[bucket][5] = g; + smudge_buckets[bucket][6] = b; + smudge_buckets[bucket][7] = a; + + } else { + r = smudge_buckets[bucket][4]; + g = smudge_buckets[bucket][5]; + b = smudge_buckets[bucket][6]; + a = smudge_buckets[bucket][7]; + } + + float smudge_state[4] = {smudge_buckets[bucket][0], smudge_buckets[bucket][1], smudge_buckets[bucket][2], smudge_buckets[bucket][3]}; + + float smudge_get[4] = {r, g, b, a}; + + float *smudge_new; + smudge_new = mix_colors(smudge_state, + smudge_get, + fac, + self->settings_value[MYPAINT_BRUSH_SETTING_PAINT_MODE] ); + // updated the smudge color (stored with straight alpha) + smudge_buckets[bucket][0] = smudge_new[0]; + smudge_buckets[bucket][1] = smudge_new[1]; + smudge_buckets[bucket][2] = smudge_new[2]; + smudge_buckets[bucket][3] = smudge_new[3]; + + //update all the states + self->states[MYPAINT_BRUSH_STATE_SMUDGE_RA] = smudge_buckets[bucket][0]; + self->states[MYPAINT_BRUSH_STATE_SMUDGE_GA] = smudge_buckets[bucket][1]; + self->states[MYPAINT_BRUSH_STATE_SMUDGE_BA] = smudge_buckets[bucket][2]; + self->states[MYPAINT_BRUSH_STATE_SMUDGE_A] = smudge_buckets[bucket][3]; + self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_R] = smudge_buckets[bucket][4]; + self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_G] = smudge_buckets[bucket][5]; + self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_B] = smudge_buckets[bucket][6]; + self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_A] = smudge_buckets[bucket][7]; + self->states[MYPAINT_BRUSH_STATE_LAST_GETCOLOR_RECENTNESS] = smudge_buckets[bucket][8]; } - // color part - float color_h = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_H]); - float color_s = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_S]); - float color_v = mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_COLOR_V]); float eraser_target_alpha = 1.0; + if (self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE] > 0.0) { - // mix (in RGB) the smudge color with the brush color - hsv_to_rgb_float (&color_h, &color_s, &color_v); float fac = self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE]; + //hsv_to_rgb_float (&color_h, &color_s, &color_v); + + //determine which smudge bucket to use when mixing with brush color + int bucket = CLAMP(roundf(self->settings_value[MYPAINT_BRUSH_SETTING_SMUDGE_BUCKET]), 0, 255); + if (fac > 1.0) fac = 1.0; - // If the smudge color somewhat transparent, then the resulting - // dab will do erasing towards that transparency level. - // see also ../doc/smudge_math.png - eraser_target_alpha = (1-fac)*1.0 + fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_A]; - // fix rounding errors (they really seem to happen in the previous line) - eraser_target_alpha = CLAMP(eraser_target_alpha, 0.0, 1.0); - if (eraser_target_alpha > 0) { - color_h = (fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_RA] + (1-fac)*color_h) / eraser_target_alpha; - color_s = (fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_GA] + (1-fac)*color_s) / eraser_target_alpha; - color_v = (fac*self->states[MYPAINT_BRUSH_STATE_SMUDGE_BA] + (1-fac)*color_v) / eraser_target_alpha; - } else { - // we are only erasing; the color does not matter - color_h = 1.0; - color_s = 0.0; - color_v = 0.0; - } - rgb_to_hsv_float (&color_h, &color_s, &color_v); + // If the smudge color somewhat transparent, then the resulting + // dab will do erasing towards that transparency level. 
+ // see also ../doc/smudge_math.png + eraser_target_alpha = (1-fac)*1.0 + fac*smudge_buckets[bucket][3]; + // fix rounding errors (they really seem to happen in the previous line) + eraser_target_alpha = CLAMP(eraser_target_alpha, 0.0, 1.0); + if (eraser_target_alpha > 0) { + + float smudge_state[4] = { + smudge_buckets[bucket][0], + smudge_buckets[bucket][1], + smudge_buckets[bucket][2], + smudge_buckets[bucket][3] + }; + float brush_color[4] = {color_h, color_s, color_v, 1.0}; + float *color_new; + + color_new = mix_colors( + smudge_state, + brush_color, + fac, + self->settings_value[MYPAINT_BRUSH_SETTING_PAINT_MODE] + ); + + color_h = color_new[0];// / eraser_target_alpha; + color_s = color_new[1];// / eraser_target_alpha; + color_v = color_new[2];// / eraser_target_alpha; + + } else { + // we are only erasing; the color does not matter + color_h = 1.0; + color_s = 1.0; + color_v = 1.0; + } + + //rgb_to_hsv_float (&color_h, &color_s, &color_v); } // eraser @@ -883,21 +948,28 @@ smallest_angular_difference(float angleA, float angleB) } // HSV color change - color_h += self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_H]; - color_s += color_s * color_v * self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_HSV_S]; - color_v += self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_V]; + if (self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_H] || + self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_HSV_S] || + self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_V]) { + rgb_to_hsv_float (&color_h, &color_s, &color_v); + color_h += self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_H]; + color_s += color_s * color_v * self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_HSV_S]; + color_v += self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_V]; + hsv_to_rgb_float (&color_h, &color_s, &color_v); + } // HSL color change if (self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_L] || self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_HSL_S]) { // (calculating way too much here, can be optimized if necessary) // this function will CLAMP the inputs - hsv_to_rgb_float (&color_h, &color_s, &color_v); + + //hsv_to_rgb_float (&color_h, &color_s, &color_v); rgb_to_hsl_float (&color_h, &color_s, &color_v); color_v += self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_L]; color_s += color_s * MIN(fabsf(1.0 - color_v), fabsf(color_v)) * 2.0 * self->settings_value[MYPAINT_BRUSH_SETTING_CHANGE_COLOR_HSL_S]; hsl_to_rgb_float (&color_h, &color_s, &color_v); - rgb_to_hsv_float (&color_h, &color_s, &color_v); + //rgb_to_hsv_float (&color_h, &color_s, &color_v); } float hardness = CLAMP(self->settings_value[MYPAINT_BRUSH_SETTING_HARDNESS], 0.0, 1.0); @@ -948,11 +1020,11 @@ smallest_angular_difference(float angleA, float angleB) } // the functions below will CLAMP most inputs - hsv_to_rgb_float (&color_h, &color_s, &color_v); + //hsv_to_rgb_float (&color_h, &color_s, &color_v); return mypaint_surface_draw_dab (surface, x, y, radius, color_h, color_s, color_v, opaque, hardness, eraser_target_alpha, self->states[MYPAINT_BRUSH_STATE_ACTUAL_ELLIPTICAL_DAB_RATIO], self->states[MYPAINT_BRUSH_STATE_ACTUAL_ELLIPTICAL_DAB_ANGLE], self->settings_value[MYPAINT_BRUSH_SETTING_LOCK_ALPHA], - self->settings_value[MYPAINT_BRUSH_SETTING_COLORIZE]); + self->settings_value[MYPAINT_BRUSH_SETTING_COLORIZE], self->settings_value[MYPAINT_BRUSH_SETTING_POSTERIZE], self->settings_value[MYPAINT_BRUSH_SETTING_POSTERIZE_NUM], self->settings_value[MYPAINT_BRUSH_SETTING_PAINT_MODE]); } // 
How many dabs will be drawn between the current and the next (x, y, pressure, +dt) position? @@ -962,13 +1034,13 @@ smallest_angular_difference(float angleA, float angleB) float res1, res2, res3; float dist; - if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] == 0.0) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); + if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] == 0.0) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] < ACTUAL_RADIUS_MIN) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = ACTUAL_RADIUS_MIN; if (self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] > ACTUAL_RADIUS_MAX) self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] = ACTUAL_RADIUS_MAX; - // OPTIMIZE: expf() called too often - float base_radius = expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); + // OPTIMIZE: fastexp() called too often + float base_radius = fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); if (base_radius < ACTUAL_RADIUS_MIN) base_radius = ACTUAL_RADIUS_MIN; if (base_radius > ACTUAL_RADIUS_MAX) base_radius = ACTUAL_RADIUS_MAX; //if (base_radius < 0.5) base_radius = 0.5; @@ -993,10 +1065,14 @@ smallest_angular_difference(float angleA, float angleB) // FIXME: no need for base_value or for the range checks above IF always the interpolation // function will be called before this one - res1 = dist / self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] * mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_DABS_PER_ACTUAL_RADIUS]); - res2 = dist / base_radius * mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_DABS_PER_BASIC_RADIUS]); - res3 = dt * mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_DABS_PER_SECOND]); - return res1 + res2 + res3; + res1 = dist / self->states[MYPAINT_BRUSH_STATE_ACTUAL_RADIUS] * self->states[MYPAINT_BRUSH_STATE_DABS_PER_ACTUAL_RADIUS]; + res2 = dist / base_radius * self->states[MYPAINT_BRUSH_STATE_DABS_PER_BASIC_RADIUS]; + res3 = dt * self->states[MYPAINT_BRUSH_STATE_DABS_PER_SECOND]; + //on first load if isnan the engine messes up and won't paint + //until you switch modes + float res4 = res1 + res2 + res3; + if (isnan(res4) || res4 < 0.0) { res4 = 0.0; } + return res4; } /** @@ -1092,8 +1168,8 @@ smallest_angular_difference(float angleA, float angleB) // noise first if (mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_TRACKING_NOISE])) { - // OPTIMIZE: expf() called too often - const float base_radius = expf(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); + // OPTIMIZE: fastexp() called too often + const float base_radius = fastexp(mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_RADIUS_LOGARITHMIC])); const float noise = base_radius * mypaint_mapping_get_base_value(self->settings[MYPAINT_BRUSH_SETTING_TRACKING_NOISE]); if (noise > 0.001) { diff --git a/mypaint-surface.c b/mypaint-surface.c index 8756b3c4..8d89cc0c 100644 --- a/mypaint-surface.c +++ b/mypaint-surface.c @@ -32,12 +32,16 @@ mypaint_surface_draw_dab(MyPaintSurface *self, float alpha_eraser, float aspect_ratio, float angle, float lock_alpha, - float colorize + float colorize, + float posterize, + float posterize_num, + float paint ) { 
assert(self->draw_dab); return self->draw_dab(self, x, y, radius, color_r, color_g, color_b, - opaque, hardness, alpha_eraser, aspect_ratio, angle, lock_alpha, colorize); + opaque, hardness, alpha_eraser, aspect_ratio, angle, + lock_alpha, colorize, posterize, posterize_num, paint); } @@ -52,6 +56,7 @@ mypaint_surface_get_color(MyPaintSurface *self, self->get_color(self, x, y, radius, color_r, color_g, color_b, color_a); } + /** * mypaint_surface_init: (skip) * diff --git a/mypaint-surface.h b/mypaint-surface.h index 1217a76c..9fae02df 100644 --- a/mypaint-surface.h +++ b/mypaint-surface.h @@ -30,6 +30,7 @@ typedef void (*MyPaintSurfaceGetColorFunction) (MyPaintSurface *self, float radius, float * color_r, float * color_g, float * color_b, float * color_a ); + typedef int (*MyPaintSurfaceDrawDabFunction) (MyPaintSurface *self, float x, float y, float radius, @@ -38,7 +39,10 @@ typedef int (*MyPaintSurfaceDrawDabFunction) (MyPaintSurface *self, float alpha_eraser, float aspect_ratio, float angle, float lock_alpha, - float colorize); + float colorize, + float posterize, + float posterize_num, + float paint); typedef void (*MyPaintSurfaceDestroyFunction) (MyPaintSurface *self); @@ -78,7 +82,10 @@ mypaint_surface_draw_dab(MyPaintSurface *self, float alpha_eraser, float aspect_ratio, float angle, float lock_alpha, - float colorize + float colorize, + float posterize, + float posterize_num, + float paint ); @@ -88,6 +95,7 @@ mypaint_surface_get_color(MyPaintSurface *self, float radius, float * color_r, float * color_g, float * color_b, float * color_a ); + float mypaint_surface_get_alpha (MyPaintSurface *self, float x, float y, float radius); diff --git a/mypaint-tiled-surface.c b/mypaint-tiled-surface.c index 00212bf5..1f9ae4a7 100644 --- a/mypaint-tiled-surface.c +++ b/mypaint-tiled-surface.c @@ -452,27 +452,56 @@ process_op(uint16_t *rgba_p, uint16_t *mask, ); // second, we use the mask to stamp a dab for each activated blend mode + if (op->paint < 1.0) { + if (op->normal) { + if (op->color_a == 1.0) { + draw_dab_pixels_BlendMode_Normal(mask, rgba_p, + op->color_r, op->color_g, op->color_b, op->normal*op->opaque*(1 - op->paint)*(1<<15)); + } else { + // normal case for brushes that use smudging (eg. watercolor) + draw_dab_pixels_BlendMode_Normal_and_Eraser(mask, rgba_p, + op->color_r, op->color_g, op->color_b, op->color_a*(1<<15), + op->normal*op->opaque*(1 - op->paint)*(1<<15)); + } + } - if (op->normal) { - if (op->color_a == 1.0) { - draw_dab_pixels_BlendMode_Normal(mask, rgba_p, - op->color_r, op->color_g, op->color_b, op->normal*op->opaque*(1<<15)); - } else { - // normal case for brushes that use smudging (eg. watercolor) - draw_dab_pixels_BlendMode_Normal_and_Eraser(mask, rgba_p, - op->color_r, op->color_g, op->color_b, op->color_a*(1<<15), op->normal*op->opaque*(1<<15)); + if (op->lock_alpha) { + draw_dab_pixels_BlendMode_LockAlpha(mask, rgba_p, + op->color_r, op->color_g, op->color_b, + op->lock_alpha*op->opaque*(1 - op->colorize)*(1 - op->posterize)*(1 - op->paint)*(1<<15)); } } + + if (op->paint > 0.0) { + if (op->normal) { + if (op->color_a == 1.0) { + draw_dab_pixels_BlendMode_Normal_Paint(mask, rgba_p, + op->color_r, op->color_g, op->color_b, op->normal*op->opaque*op->paint*(1<<15)); + } else { + // normal case for brushes that use smudging (eg. 
watercolor) + draw_dab_pixels_BlendMode_Normal_and_Eraser_Paint(mask, rgba_p, + op->color_r, op->color_g, op->color_b, op->color_a*(1<<15), + op->normal*op->opaque*op->paint*(1<<15)); + } + } - if (op->lock_alpha) { - draw_dab_pixels_BlendMode_LockAlpha(mask, rgba_p, - op->color_r, op->color_g, op->color_b, op->lock_alpha*op->opaque*(1<<15)); + if (op->lock_alpha) { + draw_dab_pixels_BlendMode_LockAlpha_Paint(mask, rgba_p, + op->color_r, op->color_g, op->color_b, + op->lock_alpha*op->opaque*(1 - op->colorize)*(1 - op->posterize)*op->paint*(1<<15)); + } } + if (op->colorize) { draw_dab_pixels_BlendMode_Color(mask, rgba_p, op->color_r, op->color_g, op->color_b, op->colorize*op->opaque*(1<<15)); } + if (op->posterize) { + draw_dab_pixels_BlendMode_Posterize(mask, rgba_p, + op->posterize*op->opaque*(1<<15), + op->posterize_num); + } } // Must be threadsafe @@ -531,7 +560,10 @@ gboolean draw_dab_internal (MyPaintTiledSurface *self, float x, float y, float color_a, float aspect_ratio, float angle, float lock_alpha, - float colorize + float colorize, + float posterize, + float posterize_num, + float paint ) { @@ -547,6 +579,9 @@ gboolean draw_dab_internal (MyPaintTiledSurface *self, float x, float y, op->hardness = CLAMP(hardness, 0.0f, 1.0f); op->lock_alpha = CLAMP(lock_alpha, 0.0f, 1.0f); op->colorize = CLAMP(colorize, 0.0f, 1.0f); + op->posterize = CLAMP(posterize, 0.0f, 1.0f); + op->posterize_num= CLAMP(ROUND(posterize_num * 100.0), 1, 128); + op->paint = CLAMP(paint, 0.0f, 1.0f); if (op->radius < 0.1f) return FALSE; // don't bother with dabs smaller than 0.1 pixel if (op->hardness == 0.0f) return FALSE; // infintly small center point, fully transparent outside if (op->opaque == 0.0f) return FALSE; @@ -566,6 +601,7 @@ gboolean draw_dab_internal (MyPaintTiledSurface *self, float x, float y, op->normal *= 1.0f-op->lock_alpha; op->normal *= 1.0f-op->colorize; + op->normal *= 1.0f-op->posterize; if (op->aspect_ratio<1.0f) op->aspect_ratio=1.0f; @@ -599,7 +635,10 @@ int draw_dab (MyPaintSurface *surface, float x, float y, float color_a, float aspect_ratio, float angle, float lock_alpha, - float colorize) + float colorize, + float posterize, + float posterize_num, + float paint) { MyPaintTiledSurface *self = (MyPaintTiledSurface *)surface; @@ -608,7 +647,7 @@ int draw_dab (MyPaintSurface *surface, float x, float y, // Normal pass if (draw_dab_internal(self, x, y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, angle, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { surface_modified = TRUE; } @@ -630,7 +669,7 @@ int draw_dab (MyPaintSurface *surface, float x, float y, case MYPAINT_SYMMETRY_TYPE_VERTICAL: if (draw_dab_internal(self, symm_x, y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, -angle, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { surface_modified = TRUE; } break; @@ -638,7 +677,7 @@ int draw_dab (MyPaintSurface *surface, float x, float y, case MYPAINT_SYMMETRY_TYPE_HORIZONTAL: if (draw_dab_internal(self, x, symm_y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, angle + 180.0, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { surface_modified = TRUE; } break; @@ -647,19 +686,19 @@ int draw_dab (MyPaintSurface *surface, float x, float y, // reflect vertically if (draw_dab_internal(self, symm_x, y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, -angle, - 
lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { dab_count++; } // reflect horizontally if (draw_dab_internal(self, x, symm_y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, angle + 180.0, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { dab_count++; } // reflect horizontally and vertically if (draw_dab_internal(self, symm_x, symm_y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, -angle - 180.0, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { dab_count++; } if (dab_count == 4) { @@ -687,7 +726,7 @@ int draw_dab (MyPaintSurface *surface, float x, float y, if (!draw_dab_internal(self, rot_x, rot_y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, -angle + symmetry_angle_offset, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { failed_subdabs = TRUE; break; } @@ -718,7 +757,7 @@ int draw_dab (MyPaintSurface *surface, float x, float y, if (!draw_dab_internal(self, rot_x, rot_y, radius, color_r, color_g, color_b, opaque, hardness, color_a, aspect_ratio, angle + symmetry_angle_offset, - lock_alpha, colorize)) { + lock_alpha, colorize, posterize, posterize_num, paint)) { break; } } @@ -834,6 +873,7 @@ void get_color (MyPaintSurface *surface, float x, float y, *color_a = CLAMP(*color_a, 0.0f, 1.0f); } + /** * mypaint_tiled_surface_init: (skip) * diff --git a/operationqueue.h b/operationqueue.h index feb7769c..3c887192 100644 --- a/operationqueue.h +++ b/operationqueue.h @@ -19,6 +19,9 @@ typedef struct { float normal; float lock_alpha; float colorize; + float posterize; + float posterize_num; + float paint; } OperationDataDrawDab; typedef struct OperationQueue OperationQueue;
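// Illustrative sketch, not part of the patch (the function name and the
// values are invented): how a caller drives the three new per-dab
// parameters through the extended surface API introduced above.

#include "mypaint-surface.h"

void painterly_dab_example(MyPaintSurface *surface, float x, float y)
{
  mypaint_surface_draw_dab(surface, x, y,
      16.0f,             // radius
      0.8f, 0.5f, 0.3f,  // color r, g, b
      1.0f,              // opaque
      0.8f,              // hardness
      1.0f,              // alpha_eraser (1.0 = no erasing)
      1.0f,              // aspect_ratio
      0.0f,              // angle
      0.0f,              // lock_alpha
      0.0f,              // colorize
      0.5f,              // posterize strength (0.0 disables it)
      0.08f,             // posterize_num: scaled by 100, clamped to 1..128 levels
      1.0f);             // paint: 1.0 = full spectral (WGM) blending, 0.0 = legacy blending
}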