diff --git a/.gitignore b/.gitignore
index cb4660d03..fed3e92c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,7 +26,8 @@ fortran/examples/nufft2dmany_demof
 fortran/examples/nufft3d_demof
 test/dumbinputs
 test/finufft1d_basicpassfail
 test/testutils
+test/testlib
 __pycache__*
 
 docs/_build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c66e1b58..9188648fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -248,27 +248,19 @@ function(set_finufft_options target)
 endfunction()
 
 if(FINUFFT_USE_CPU)
-  # Main finufft libraries
-  if(NOT FINUFFT_STATIC_LINKING)
-    add_library(
-      finufft SHARED
+  set(FINUFFT_SOURCES
       src/spreadinterp.cpp
-      src/utils.cpp
       contrib/legendre_rule_fast.cpp
       src/fft.cpp
       src/finufft_core.cpp
       src/c_interface.cpp
+      src/finufft_utils.cpp
       fortran/finufftfort.cpp)
+  # Main finufft libraries
+  if(NOT FINUFFT_STATIC_LINKING)
+    add_library(finufft SHARED ${FINUFFT_SOURCES})
   else()
-    add_library(
-      finufft STATIC
-      src/spreadinterp.cpp
-      src/utils.cpp
-      contrib/legendre_rule_fast.cpp
-      src/fft.cpp
-      src/finufft_core.cpp
-      src/c_interface.cpp
-      fortran/finufftfort.cpp)
+    add_library(finufft STATIC ${FINUFFT_SOURCES})
   endif()
   set_finufft_options(finufft)
 
diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp
index d05ac986a..a84e3f4e4 100644
--- a/devel/foldrescale.cpp
+++ b/devel/foldrescale.cpp
@@ -1,7 +1,6 @@
-#include "finufft/defs.h"
+#include "finufft/test_defs.h"
 #include <benchmark/benchmark.h>
 #include <cmath>
-#include <immintrin.h>
 #include <iostream>
 #include <random>
 // no vectorize
@@ -17,22 +16,22 @@
    This should be done in C++ not as a macro, someday.
 */
 #define FOLDRESCALE(x, N, p)                                                \
-  (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \
+  (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)INV_2PI * N) \
      : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N))
 
 #define FOLDRESCALE04(x, N, p)                                                       \
-  (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) \
+  (p ? ((x * FLT(INV_2PI) + FLT(0.5)) - floor(x * FLT(INV_2PI) + FLT(0.5))) * FLT(N) \
      : ((x / FLT(N)) - floor(x / FLT(N))) * FLT(N))
 
 #define FOLDRESCALE05(x, N, p)                                                       \
-  FLT(N) * (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) \
+  FLT(N) * (p ? ((x * FLT(INV_2PI) + FLT(0.5)) - floor(x * FLT(INV_2PI) + FLT(0.5))) \
               : ((x / FLT(N)) - floor(x / FLT(N))))
 
 inline __attribute__((always_inline)) FLT foldRescale00(FLT x, BIGINT N, bool p) {
   FLT result;
   FLT fN = FLT(N);
   if (p) {
-    static constexpr FLT x2pi = FLT(M_1_2PI);
+    static constexpr FLT x2pi = FLT(INV_2PI);
     result                    = x * x2pi + FLT(0.5);
     result -= floor(result);
   } else {
@@ -44,14 +43,14 @@ inline __attribute__((always_inline)) FLT foldRescale00(FLT x, BIGINT N, bool p)
 }
 
 inline __attribute__((always_inline)) FLT foldRescale01(FLT x, BIGINT N, bool p) {
-  return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N)
+  return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)INV_2PI * N)
            : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N);
 }
 
 template<bool p>
 inline __attribute__((always_inline)) FLT foldRescale02(FLT x, BIGINT N) {
   if constexpr (p) {
-    return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N);
+    return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)INV_2PI * N);
   } else {
     return (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N);
   }
@@ -62,7 +61,7 @@ inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) {
   FLT result;
   FLT fN = FLT(N);
   if constexpr (p) {
-    static constexpr FLT x2pi = FLT(M_1_2PI);
+    static constexpr FLT x2pi = FLT(INV_2PI);
     result                    = std::fma(x, x2pi, FLT(0.5));
     result -= floor(result);
   } else {
@@ -73,7 +72,6 @@ inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) {
   return result * fN;
 }
 
-
 static std::mt19937_64 gen;
 static std::uniform_real_distribution<> dis(-10, 10);
 static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
@@ -185,7 +183,6 @@ static void BM_FoldRescale05N(benchmark::State &state) {
   }
 }
 
-
 BENCHMARK(BM_BASELINE)->Iterations(10000000);
 BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale00)->Iterations(1000000);
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 26eb313ab..de01a9ea3 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -7,15 +7,17 @@
 #include <cuComplex.h>
 #include <cufinufft/types.h>
 
-#include <cuda_runtime.h>
-
-#include <sys/time.h>
-
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <type_traits>
 
 #include <thrust/extrema.h>
 
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include <cmath>
+
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
 #else
 __inline__ __device__ double atomicAdd(double *address, double val) {
@@ -87,17 +89,6 @@ class WithCudaDevice {
   }
 };
 
-// jfm timer class
-class CNTime {
-public:
-  void start();
-  double restart();
-  double elapsedsec();
-
-private:
-  struct timeval initial;
-};
-
 // ahb math helpers
 CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);
 
@@ -118,8 +109,8 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
  */
 
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
-                                                              cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexShared(
+    cuda_complex<T> *address, cuda_complex<T> res) {
   const auto raw_address = reinterpret_cast<T *>(address);
   atomicAdd(raw_address, res.x);
   atomicAdd(raw_address + 1, res.y);
@@ -131,8 +122,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *a
  * on shared memory are supported so we leverage them
  */
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
-                                                              cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexGlobal(
+    cuda_complex<T> *address, cuda_complex<T> res) {
   if constexpr (
       std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
     atomicAdd(address, res);
diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h
deleted file mode 100644
index 2449d864e..000000000
--- a/include/finufft/dirft.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef DIRFT_H
-#define DIRFT_H
-
-#include <finufft/finufft_core.h>
-
-template<typename T>
-void dirft1d1(BIGINT nj, T *x, std::complex<T> *c, int isign, BIGINT ms,
-              std::complex<T> *f);
-template<typename T>
-void dirft1d2(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT ms,
-              std::complex<T> *f);
-template<typename T>
-void dirft1d3(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT nk, T *s,
-              std::complex<T> *f);
-
-template<typename T>
-void dirft2d1(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
-              std::complex<T> *f);
-template<typename T>
-void dirft2d2(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
-              std::complex<T> *f);
-template<typename T>
-void dirft2d3(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT nk, T *s, T *t,
-              std::complex<T> *f);
-
-template<typename T>
-void dirft3d1(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
-              BIGINT mt, BIGINT mu, std::complex<T> *f);
-template<typename T>
-void dirft3d2(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
-              BIGINT mt, BIGINT mu, std::complex<T> *f);
-template<typename T>
-void dirft3d3(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT nk, T *s,
-              T *t, T *u, std::complex<T> *f);
-
-#endif
diff --git a/include/finufft/finufft_utils.hpp b/include/finufft/finufft_utils.hpp
new file mode 100644
index 000000000..7577a57a1
--- /dev/null
+++ b/include/finufft/finufft_utils.hpp
@@ -0,0 +1,75 @@
+// Header for finufft_utils.cpp: low-level array helpers, next235even, the CNTime timer,
+// and OpenMP helpers. The array helpers are templated on the floating-point type T.
+
+#pragma once
+
+#include <chrono>
+#include <cmath>
+
+#include "finufft_core.h"
+
+//  <chrono> (included above) is for CNTime:
+//  chrono is used since its interface is portable between linux and windows
+
+namespace finufft::utils {
+
+template<typename T>
+FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arrayrange(BIGINT n, const T *a,
+                                                                   T *lo, T *hi)
+// Scans the length-n array a and writes min(a) to *lo and max(a) to *hi,
+// so that all a values lie in [*lo,*hi].
+// If n==0 the loop never runs, leaving *lo = INFINITY and *hi = -INFINITY.
+{
+  *lo = INFINITY;
+  *hi = -INFINITY;
+  for (BIGINT m = 0; m < n; ++m) {
+    if (a[m] < *lo) *lo = a[m];
+    if (a[m] > *hi) *hi = a[m];
+  }
+}
+template<typename T>
+FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arraywidcen(BIGINT n, const T *a,
+                                                                    T *w, T *c)
+// Writes out *w = half-width and *c = center of an interval enclosing all n entries of a.
+// The center is snapped to zero (and *w grown by |center| to compensate) whenever
+// |center| < ARRAYWIDCEN_GROWFRAC * half-width (ARRAYWIDCEN_GROWFRAC: see finufft_core.h).
+// This prevents rephasings which don't grow nf by much. 6/8/17
+// If n==0, *w and *c are not finite.
+{
+  T lo, hi;
+  arrayrange(n, a, &lo, &hi);
+  *w = (hi - lo) / 2;
+  *c = (hi + lo) / 2;
+  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
+    *w += std::abs(*c); // widen so the zero-centered interval still covers [lo,hi]
+    *c = 0.0;
+  }
+}
+
+FINUFFT_EXPORT BIGINT next235even(BIGINT n); // even integer >= n, prime factors <= 5
+
+// jfm's timer class (restart/elapsedsec are defined in src/finufft_utils.cpp)
+class FINUFFT_EXPORT CNTime {
+public:
+  FINUFFT_NEVER_INLINE void start();
+  FINUFFT_NEVER_INLINE double restart();          // returns a time delta (see the .cpp)
+  FINUFFT_NEVER_INLINE double elapsedsec() const; // seconds as double, microsec accuracy
+
+private:
+  double initial; // NOTE(review): no default initializer; presumably set by start() - confirm
+};
+
+// openmp helpers
+int get_num_threads_parallel_block();
+
+} // namespace finufft::utils
+
+// thread-safe rand number generator for Windows platform (the definition ignores seedp)
+#ifdef _WIN32
+#include <random>
+namespace finufft {
+namespace utils {
+FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
+} // namespace utils
+} // namespace finufft
+#endif
diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
index 556315242..8a3f76fca 100644
--- a/include/finufft/test_defs.h
+++ b/include/finufft/test_defs.h
@@ -16,7 +16,6 @@
 
 // convenient private finufft internals
 #include <finufft/finufft_core.h>
-#include <finufft/utils.h>
 #include <memory>
 
 // --------------- Private data types for compilation in either prec ---------
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
deleted file mode 100644
index 0b875fdfe..000000000
--- a/include/finufft/utils.h
+++ /dev/null
@@ -1,114 +0,0 @@
-// Header for utils.cpp, a little library of low-level array stuff.
-// These are just the functions which depend on single/double precision (FLT)
-
-#ifndef UTILS_H
-#define UTILS_H
-
-#include "finufft/finufft_core.h"
-//  for CNTime...
-//  using chrono since the interface is portable between linux and windows
-#include <chrono>
-
-namespace finufft {
-namespace utils {
-
-// ahb's low-level array helpers
-template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, const std::complex<T> *a,
-                                             const std::complex<T> *b)
-// ||a-b||_2 / ||a||_2
-{
-  T err = 0.0, nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    // note std::norm here & below is |a|^2 ("field norm") not usual |a| ...
-    nrm += std::norm(a[m]);
-    err += std::norm(a[m] - b[m]);
-  }
-  return sqrt(err / nrm);
-}
-template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, const std::complex<T> *a,
-                                          const std::complex<T> *b)
-// ||a-b||_2
-{
-  T err = 0.0; // compute error 2-norm
-  for (BIGINT m = 0; m < n; ++m) err += std::norm(a[m] - b[m]);
-  return sqrt(err);
-}
-template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, const std::complex<T> *a)
-// ||a||_2
-{
-  T nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]);
-  return sqrt(nrm);
-}
-template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, const std::complex<T> *a)
-// ||a||_infty
-{
-  T nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) nrm = std::max(nrm, std::norm(a[m]));
-  return sqrt(nrm);
-}
-template<typename T>
-FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, const T *a, T *lo, T *hi)
-// With a a length-n array, writes out min(a) to lo and max(a) to hi,
-// so that all a values lie in [lo,hi].
-// If n==0, lo and hi are not finite.
-{
-  *lo = INFINITY;
-  *hi = -INFINITY;
-  for (BIGINT m = 0; m < n; ++m) {
-    if (a[m] < *lo) *lo = a[m];
-    if (a[m] > *hi) *hi = a[m];
-  }
-}
-template<typename T>
-FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, const T *a, T *w, T *c)
-// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
-// Only chooses a nonzero center if this increases w by less than fraction
-// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h.
-// This prevents rephasings which don't grow nf by much. 6/8/17
-// If n==0, w and c are not finite.
-{
-  T lo, hi;
-  arrayrange(n, a, &lo, &hi);
-  *w = (hi - lo) / 2;
-  *c = (hi + lo) / 2;
-  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
-    *w += std::abs(*c);
-    *c = 0.0;
-  }
-}
-
-FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n);
-
-// jfm's timer class
-class FINUFFT_EXPORT CNTime {
-public:
-  void start();
-  double restart();
-  double elapsedsec();
-
-private:
-  double initial;
-};
-
-// openmp helpers
-int get_num_threads_parallel_block();
-
-} // namespace utils
-} // namespace finufft
-
-// thread-safe rand number generator for Windows platform
-#ifdef _WIN32
-#include <random>
-namespace finufft {
-namespace utils {
-FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
-} // namespace utils
-} // namespace finufft
-#endif
-
-#endif // UTILS_H
diff --git a/makefile b/makefile
index 928bbf209..23a5bab61 100644
--- a/makefile
+++ b/makefile
@@ -136,7 +136,7 @@ STATICLIB = lib-static/$(LIBNAME).a
 ABSDYNLIB = $(FINUFFT)$(DYNLIB)
 
 # spreader objs
-SOBJS = src/utils.o src/spreadinterp.o
+SOBJS = src/finufft_utils.o src/spreadinterp.o
 
 # all lib dual-precision objs (note DUCC_OBJS empty if unused)
 OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS)
@@ -262,10 +262,10 @@ test/%: test/%.cpp $(DYNLIB)
 test/%f: test/%.cpp $(DYNLIB)
 	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@
 # low-level tests that are cleaner if depend on only specific objects...
-test/testutils: test/testutils.cpp src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o $(LIBS) -o test/testutils
-test/testutilsf: test/testutils.cpp src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils.o $(LIBS) -o test/testutilsf
+test/testutils: test/testutils.cpp src/finufft_utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o $(LIBS) -o test/testutils
+test/testutilsf: test/testutils.cpp src/finufft_utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o $(LIBS) -o test/testutilsf
 
 # make sure all double-prec test executables ready for testing
 TESTS := $(basename $(wildcard test/*.cpp))
diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp
index a291a269b..72145fcc0 100644
--- a/perftest/guru_timing_test.cpp
+++ b/perftest/guru_timing_test.cpp
@@ -1,4 +1,6 @@
+#include "finufft/finufft_utils.hpp"
 #include <finufft/test_defs.h>
+
 // for sleep call
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
 #include <Windows.h>
@@ -154,7 +156,7 @@ int main(int argc, char *argv[])
   printf("FINUFFT %dd%d use guru interface to do %d calls together:-------------------\n",
          ndim, type, ntransf);
   FINUFFT_PLAN plan;                // instantiate a finufft_plan
-  finufft::utils::CNTime timer;
+  CNTime timer;
   timer.start();                    // Guru Step 1
   BIGINT n_modes[3] = {N1, N2, N3}; // #modes per dimension (ignored for t3)
   int ier = FINUFFT_MAKEPLAN(type, ndim, n_modes, isign, ntransf, tol, &plan, &opts);
@@ -258,7 +260,7 @@ double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_P
    Malleo 2019; xyz passed in by Barnett 5/26/20 to prevent X_orig fields.
 */
 {
-  finufft::utils::CNTime timer;
+  CNTime timer;
   timer.start();
   int ier             = 0;
   double t            = 0;
diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp
index f0e4c29ae..d669eb770 100644
--- a/perftest/manysmallprobs.cpp
+++ b/perftest/manysmallprobs.cpp
@@ -1,16 +1,17 @@
+
+#include <complex>
+
 // public header
 #include "finufft.h"
 #include "finufft/test_defs.h"
 
 // private access to timer
-#include "finufft/utils.h"
-using namespace finufft::utils;
+#include "finufft/finufft_utils.hpp"
 
-#include <complex>
-#include <stdio.h>
-#include <stdlib.h>
 using namespace std;
 
+using namespace finufft::utils;
+
 int main(int argc, char *argv[])
 /* What is small-problem cost of FINUFFT library from C++, using plain
    arrays of C++ complex numbers?  Barnett 10/31/17.
@@ -48,7 +49,7 @@ int main(int argc, char *argv[])
   complex<double> *F = (complex<double> *)malloc(sizeof(complex<double>) * N);
 
   printf("repeatedly calling the simple interface: --------------------- \n");
-  finufft::utils::CNTime timer;
+  CNTime timer;
   timer.start();
   for (int r = 0; r < reps; ++r) { // call the NUFFT (with iflag=+1):
     // printf("rep %d\n",r);
diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp
index 5aab26fb3..6f942ee43 100644
--- a/perftest/spreadtestnd.cpp
+++ b/perftest/spreadtestnd.cpp
@@ -1,14 +1,15 @@
+#include "finufft/finufft_utils.hpp"
 #include <finufft/spreadinterp.h>
 #include <finufft/test_defs.h>
-#include <finufft/utils.h>
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <vector>
 
 using namespace finufft::spreadinterp;
-using namespace finufft::utils; // for timer
+using namespace std;
+using namespace finufft::utils;
 
 void usage() {
   printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth "
diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp
index 950c3526e..2ff12ba78 100644
--- a/perftest/spreadtestndall.cpp
+++ b/perftest/spreadtestndall.cpp
@@ -1,6 +1,6 @@
+#include "finufft/finufft_utils.hpp"
 #include <finufft/spreadinterp.h>
 #include <finufft/test_defs.h>
-#include <finufft/utils.h>
 
 #include <cmath>
 #include <cstdio>
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index 9f8d1344c..812f90f02 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -19,11 +19,9 @@ set(PRECISION_DEPENDENT_SRC
     common.cu)
 
 set(CUFINUFFT_INCLUDE_DIRS
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/contrib
+    ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/contrib
     $<TARGET_PROPERTY:CUDA::cudart,INTERFACE_INCLUDE_DIRECTORIES>
-    $<TARGET_PROPERTY:CUDA::cufft,INTERFACE_INCLUDE_DIRECTORIES>
-    $<TARGET_PROPERTY:CUDA::nvToolsExt,INTERFACE_INCLUDE_DIRECTORIES>)
+    $<TARGET_PROPERTY:CUDA::cufft,INTERFACE_INCLUDE_DIRECTORIES>)
 
 set(CUFINUFFT_INCLUDE_DIRS
     ${CUFINUFFT_INCLUDE_DIRS}
@@ -43,41 +41,17 @@ set(FINUFFT_CUDA_FLAGS
     >
     >)
 
-add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC})
-target_include_directories(cufinufft_common_objects
-                           PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-set_target_properties(
-  cufinufft_common_objects
-  PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
-             CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_STANDARD 17
-             CUDA_STANDARD_REQUIRED ON)
-target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17)
-target_compile_options(cufinufft_common_objects PRIVATE ${FINUFFT_CUDA_FLAGS})
-
-add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC})
-target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-set_target_properties(
-  cufinufft_objects
-  PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
-             CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_STANDARD 17
-             CUDA_STANDARD_REQUIRED ON)
-target_compile_features(cufinufft_objects PRIVATE cxx_std_17)
-target_compile_options(cufinufft_objects PRIVATE ${FINUFFT_CUDA_FLAGS})
-
 if(FINUFFT_SHARED_LINKING)
-  add_library(cufinufft SHARED $<TARGET_OBJECTS:cufinufft_common_objects>
-                               $<TARGET_OBJECTS:cufinufft_objects>)
+  add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC}
+                               ${PRECISION_DEPENDENT_SRC})
 else()
-  add_library(cufinufft STATIC $<TARGET_OBJECTS:cufinufft_common_objects>
-                               $<TARGET_OBJECTS:cufinufft_objects>)
+  add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC}
+                               ${PRECISION_DEPENDENT_SRC})
   set_target_properties(
     cufinufft PROPERTIES POSITION_INDEPENDENT_CODE
                          ${FINUFFT_POSITION_INDEPENDENT_CODE})
 endif()
+target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 
 set_target_properties(
   cufinufft
@@ -85,15 +59,14 @@ set_target_properties(
              CUDA_SEPARABLE_COMPILATION ON
              CUDA_STANDARD 17
              CUDA_STANDARD_REQUIRED ON
+             WINDOWS_EXPORT_ALL_SYMBOLS ON
              ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 target_compile_features(cufinufft PRIVATE cxx_std_17)
 target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS})
-if(WIN32)
-  target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft
-                                         CUDA::nvToolsExt)
+if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS))
+  target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
 else()
-  target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static
-                                         CUDA::nvToolsExt)
+  target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static)
 endif()
 
 file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h")
diff --git a/src/cuda/utils.cpp b/src/cuda/utils.cpp
index 9c3003cb8..406ee7e13 100644
--- a/src/cuda/utils.cpp
+++ b/src/cuda/utils.cpp
@@ -23,27 +23,5 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
   return nplus;
 }
 
-// ----------------------- helpers for timing (always stay double prec)...
-
-void CNTime::start() { gettimeofday(&initial, 0); }
-
-double CNTime::restart()
-// Barnett changed to returning in sec
-{
-  double delta = this->elapsedsec();
-  this->start();
-  return delta;
-}
-
-double CNTime::elapsedsec()
-// returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
-{
-  struct timeval now;
-  gettimeofday(&now, 0);
-  double nowsec     = (double)now.tv_sec + 1e-6 * now.tv_usec;
-  double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec;
-  return nowsec - initialsec;
-}
-
 } // namespace utils
 } // namespace cufinufft
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
index 1adfabedb..b0d0fd33d 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft_core.cpp
@@ -1,14 +1,12 @@
 #include <finufft/fft.h>
 #include <finufft/finufft_core.h>
+#include <finufft/finufft_utils.hpp>
 #include <finufft/spreadinterp.h>
-#include <finufft/utils.h>
 
 #include "../contrib/legendre_rule_fast.h"
 #include <cmath>
 #include <cstdio>
-#include <cstdlib>
 #include <iomanip>
-#include <iostream>
 #include <memory>
 #include <vector>
 
@@ -75,7 +73,7 @@ Design notes for guru interface implementation:
 // ---------- local math routines (were in common.cpp; no need now): --------
 
 namespace finufft {
-namespace common {
+namespace utils {
 
 static int set_nf_type12(BIGINT ms, const finufft_opts &opts,
                          const finufft_spread_opts &spopts, BIGINT *nf)
@@ -364,11 +362,11 @@ static void deconvolveshuffle2d(int dir, T prefac, const std::vector<T> &ker1,
       fw[j] = 0.0;
   for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms)               // non-neg y-freqs
     // point fk and fw to the start of this y value's row (2* is for complex):
-    common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1,
-                                &fw[nf1 * k2], modeord);
+    utils::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1,
+                               &fw[nf1 * k2], modeord);
   for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs
-    common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1,
-                                &fw[nf1 * (nf2 + k2)], modeord);
+    utils::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1,
+                               &fw[nf1 * (nf2 + k2)], modeord);
 }
 
 template<typename T>
@@ -409,11 +407,11 @@ static void deconvolveshuffle3d(int dir, T prefac, std::vector<T> &ker1,
       fw[j] = 0.0;
   for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt)        // non-neg z-freqs
     // point fk and fw to the start of this z value's plane (2* is for complex):
-    common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1,
-                                nf2, &fw[np * k3], modeord);
+    utils::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1,
+                               nf2, &fw[np * k3], modeord);
   for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs
-    common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1,
-                                nf2, &fw[np * (nf3 + k3)], modeord);
+    utils::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1,
+                               nf2, &fw[np * (nf3 + k3)], modeord);
 }
 
 // --------- batch helper functions for t1,2 exec: ---------------------------
@@ -488,12 +486,12 @@ static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T<T> *p, std::complex<T>
   return 0;
 }
 
-} // namespace common
+} // namespace utils
 } // namespace finufft
 
 // --------------- rest is the 5 user guru (plan) interface drivers: ---------
 // (not namespaced since have safe names finufft{f}_* )
-using namespace finufft::common; // accesses routines defined above
+using namespace finufft::utils; // accesses routines defined above
 
 // Marco Barbone: 5.8.2024
 // These are user-facing.
diff --git a/src/utils.cpp b/src/finufft_utils.cpp
similarity index 95%
rename from src/utils.cpp
rename to src/finufft_utils.cpp
index 2627a179a..8bcf8ddab 100644
--- a/src/utils.cpp
+++ b/src/finufft_utils.cpp
@@ -5,11 +5,11 @@
 
 #include <cstdint>
 
-#include "finufft/utils.h"
+#include <finufft/finufft_utils.hpp>
+
 using namespace std;
 
-namespace finufft {
-namespace utils {
+namespace finufft::utils {
 
 BIGINT next235even(BIGINT n)
 // finds even integer not less than n, with prime factors no larger than 5
@@ -47,7 +47,7 @@ double CNTime::restart()
   return delta;
 }
 
-double CNTime::elapsedsec()
+double CNTime::elapsedsec() const
 // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
 {
   std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -85,5 +85,4 @@ int rand_r(unsigned int * /*seedp*/)
 }
 #endif
 
-} // namespace utils
-} // namespace finufft
+} // namespace finufft::utils
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index f6cf925e0..497e2396d 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1,7 +1,7 @@
 // Spreading/interpolating module within FINUFFT.
 
+#include <finufft/finufft_utils.hpp>
 #include <finufft/spreadinterp.h>
-#include <finufft/utils.h>
 
 #include "ker_horner_allw_loop_constexpr.h"
 #include "ker_lowupsampfac_horner_allw_loop_constexpr.h"
diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp
index e182cfe5f..866458c86 100644
--- a/test/dumbinputs.cpp
+++ b/test/dumbinputs.cpp
@@ -34,12 +34,12 @@
 */
 
 // This switches FLT macro from double to float if SINGLE is defined, etc...
-#include "directft/dirft1d.cpp"
-#include "directft/dirft2d.cpp"
-#include "directft/dirft3d.cpp"
+#include "utils/dirft1d.hpp"
+#include "utils/dirft2d.hpp"
+#include "utils/norms.hpp"
 #include <finufft/test_defs.h>
+
 using namespace std;
-using namespace finufft::utils; // for twonorm, etc
 
 int main(int argc, char *argv[]) {
   int M = 100;    // number of nonuniform points
diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp
index d8a66ed0d..00475c2da 100644
--- a/test/finufft1d_test.cpp
+++ b/test/finufft1d_test.cpp
@@ -1,6 +1,8 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft1d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft1d.hpp"
+#include "utils/norms.hpp"
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp
index f2b13534d..0491a4b27 100644
--- a/test/finufft1dmany_test.cpp
+++ b/test/finufft1dmany_test.cpp
@@ -1,6 +1,8 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft1d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft1d.hpp"
+#include "utils/norms.hpp"
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp
index eb1fec761..b79f5967b 100644
--- a/test/finufft2d_test.cpp
+++ b/test/finufft2d_test.cpp
@@ -1,6 +1,9 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft2d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft2d.hpp"
+#include "utils/norms.hpp"
+
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp
index 0efbaded9..547c37b9b 100644
--- a/test/finufft2dmany_test.cpp
+++ b/test/finufft2dmany_test.cpp
@@ -1,6 +1,8 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft2d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft2d.hpp"
+#include "utils/norms.hpp"
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft3d_test.cpp b/test/finufft3d_test.cpp
index 1e89d471c..9cbd5c042 100644
--- a/test/finufft3d_test.cpp
+++ b/test/finufft3d_test.cpp
@@ -1,6 +1,8 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft3d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft3d.hpp"
+#include "utils/norms.hpp"
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft3dkernel_test.cpp b/test/finufft3dkernel_test.cpp
index 9bc6d1955..87276261b 100644
--- a/test/finufft3dkernel_test.cpp
+++ b/test/finufft3dkernel_test.cpp
@@ -1,6 +1,9 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft3d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft3d.hpp"
+#include "utils/norms.hpp"
+
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp
index b1d315719..9cae4e41d 100644
--- a/test/finufft3dmany_test.cpp
+++ b/test/finufft3dmany_test.cpp
@@ -1,6 +1,9 @@
 #include <finufft/test_defs.h>
 // this enforces recompilation, responding to SINGLE...
-#include "directft/dirft3d.cpp"
+#include "finufft/finufft_utils.hpp"
+#include "utils/dirft3d.hpp"
+#include "utils/norms.hpp"
+
 using namespace std;
 using namespace finufft::utils;
 
diff --git a/test/testutils.cpp b/test/testutils.cpp
index 6facb72cd..100a46531 100644
--- a/test/testutils.cpp
+++ b/test/testutils.cpp
@@ -15,7 +15,11 @@
 */
 
 // This switches FLT macro from double to float if SINGLE is defined, etc...
+
+#include "finufft/finufft_utils.hpp"
+#include "utils/norms.hpp"
 #include <finufft/test_defs.h>
+
 using namespace finufft::utils;
 
 int main(int argc, char *argv[]) {
diff --git a/test/directft/dirft1d.cpp b/test/utils/dirft1d.hpp
similarity index 98%
rename from test/directft/dirft1d.cpp
rename to test/utils/dirft1d.hpp
index c80299b47..22863edb3 100644
--- a/test/directft/dirft1d.cpp
+++ b/test/utils/dirft1d.hpp
@@ -1,5 +1,6 @@
-#include <finufft/dirft.h>
-#include <finufft/finufft_core.h>
+#pragma once
+
+#include "finufft/finufft_core.h"
 #include <iostream>
 
 // This is basically a port of dirft1d.f from CMCL package, except with
diff --git a/test/directft/dirft2d.cpp b/test/utils/dirft2d.hpp
similarity index 98%
rename from test/directft/dirft2d.cpp
rename to test/utils/dirft2d.hpp
index 62f126c15..26b813e3c 100644
--- a/test/directft/dirft2d.cpp
+++ b/test/utils/dirft2d.hpp
@@ -1,5 +1,6 @@
-#include <finufft/dirft.h>
-#include <finufft/finufft_core.h>
+#pragma once
+
+#include "finufft/finufft_core.h"
 #include <iostream>
 
 // This is basically a port of dirft2d.f from CMCL package, except with
diff --git a/test/directft/dirft3d.cpp b/test/utils/dirft3d.hpp
similarity index 98%
rename from test/directft/dirft3d.cpp
rename to test/utils/dirft3d.hpp
index b77111257..795505d7a 100644
--- a/test/directft/dirft3d.cpp
+++ b/test/utils/dirft3d.hpp
@@ -1,5 +1,6 @@
-#include <finufft/dirft.h>
-#include <finufft/finufft_core.h>
+#pragma once
+
+#include "finufft/finufft_core.h"
 #include <iostream>
 
-// This is basically a port of dirft2d.f from CMCL package, except with
+// This is basically a port of dirft3d.f from CMCL package, except with
diff --git a/test/utils/norms.hpp b/test/utils/norms.hpp
new file mode 100644
index 000000000..c7d42a74b
--- /dev/null
+++ b/test/utils/norms.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+
+#include <finufft/test_defs.h>
+
+// ahb's low-level array helpers: norms of complex arrays, used by the test drivers.
+// Note std::norm below is |z|^2 ("field norm") not the usual |z|, hence the final sqrt.
+
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, const std::complex<T> *a,
+                                             const std::complex<T> *b)
+// Returns ||a-b||_2 / ||a||_2 over the first n entries of a and b.
+// If ||a||_2 == 0 (which includes n==0) the quotient is not finite.
+{
+  T err = 0.0, nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) {
+    nrm += std::norm(a[m]);
+    err += std::norm(a[m] - b[m]);
+  }
+  return std::sqrt(err / nrm);
+}
+
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, const std::complex<T> *a,
+                                          const std::complex<T> *b)
+// Returns ||a-b||_2 over the first n entries of a and b (0 when n==0).
+{
+  T err = 0.0; // accumulates the squared 2-norm of the difference
+  for (BIGINT m = 0; m < n; ++m) err += std::norm(a[m] - b[m]);
+  return std::sqrt(err);
+}
+
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, const std::complex<T> *a)
+// Returns ||a||_2 over the first n entries of a (0 when n==0).
+{
+  T nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]);
+  return std::sqrt(nrm);
+}
+
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, const std::complex<T> *a)
+// Returns ||a||_infty = max over m of |a[m]| for the first n entries (0 when n==0).
+{
+  T nrm = 0.0; // running max of |a[m]|^2; a single sqrt is taken at the end
+  for (BIGINT m = 0; m < n; ++m) nrm = std::max(nrm, std::norm(a[m]));
+  return std::sqrt(nrm);
+}