From f2339d40b396df5a1b0043d7e121b919743760aa Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 27 Jan 2025 11:35:30 +0100 Subject: [PATCH 1/4] reduce memory overhead in Type 3 setpts by eliminating temporary phihatk arrays; reduce large array reads/writes --- src/finufft_core.cpp | 124 +++++++++++++++++++++---------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 1adfabedb..7b7a0ff77 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -224,49 +224,55 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, } } -template -static void onedim_nuft_kernel(BIGINT nk, const std::vector &k, std::vector &phihat, - const finufft_spread_opts &opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). +template class KernelFseries { + private: + std::vector z, f; + + public: + /* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + opts - spreading opts object, needed to eval kernel (must be already set up) + + Barnett 2/8/17. openmp since cos slow 2/9/17. + To do (Nov 2024): replace evaluate_kernel by evaluate_kernel_horner. + */ + KernelFseries(const finufft_spread_opts &opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + std::vector Z(2*q), W(2*q); + legendre_compute_glr(2 * q, Z.data(), W.data()); // only half the nodes used, eg on (0,1) + z.resize(q); + f.resize(q); + for (int n = 0; n < q; ++n) { + z[n] = T(Z[n]*J2); // quadr nodes for [0,J/2] + f[n] = J2 * T(W[n]) * evaluate_kernel(z[n], opts); // w/ quadr weights + } + } - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) + /* + Evaluates the Fourier transform of te kernel at a single point. - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + Inputs: + k - frequency, dual to the kernel's natural argument, ie exp(i.k.z) - Barnett 2/8/17. openmp since cos slow 2/9/17. - To do (Nov 2024): replace evaluate_kernel by evaluate_kernel_horner. - */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (T)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - T x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! - phihat[j] = x; - } -} + Outputs: + phihat - real Fourier transform evaluated at freq k + */ + T operator()(T k) { + T x = 0; + for (size_t n = 0; n < z.size(); ++n) + x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair. use T cos! + return x; + } +}; template static void deconvolveshuffle1d(int dir, T prefac, const std::vector &ker, BIGINT ms, @@ -862,37 +868,31 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF for (BIGINT j = 0; j < nj; ++j) prephase[j] = {1.0, 0.0}; // *** or keep flag so no mult in exec?? - // rescale the target s_k etc to s'_k etc... -#pragma omp parallel for num_threads(opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - for (int idim = 0; idim < dim; ++idim) - STUp[idim][k] = - t3P.h[idim] * t3P.gam[idim] * (STU_in[idim][k] - t3P.D[idim]); // so |s'_k| < - // pi/R - } + KernelFseries fseries(spopts); // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) deconv.resize(nk); - std::array, 3> phiHatk; - for (int idim = 0; idim < dim; ++idim) { - phiHatk[idim].resize(nk); - onedim_nuft_kernel(nk, STUp[idim], phiHatk[idim], spopts); // fill phiHat1 - } // C can be nan or inf if M=0, no input NU pts - int Cfinite = + bool Cfinite = std::isfinite(t3P.C[0]) && std::isfinite(t3P.C[1]) && std::isfinite(t3P.C[2]); - int Cnonzero = t3P.C[0] != 0.0 || t3P.C[1] != 0.0 || t3P.C[2] != 0.0; // cen + bool Cnonzero = t3P.C[0] != 0.0 || t3P.C[1] != 0.0 || t3P.C[2] != 0.0; // cen + bool do_phase = Cfinite && Cnonzero; #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs TF phiHat = 1; - for (int idim = 0; idim < dim; ++idim) phiHat *= phiHatk[idim][k]; - deconv[k] = (std::complex)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - TF phase = 0; - for (int idim = 0; idim < dim; ++idim) - phase += (STU_in[idim][k] - t3P.D[idim]) * t3P.C[idim]; - deconv[k] *= std::polar(TF(1), isign * phase); // Euler e^{+-i.phase} + TF phase = 0; + for (int idim = 0; idim < dim; ++idim) { + auto tSTUin = STU_in[idim][k]; + // rescale the target s_k etc to s'_k etc... + auto tSTUp = + t3P.h[idim] * t3P.gam[idim] * (tSTUin - t3P.D[idim]); // so |s'_k| < pi/R + phiHat *= fseries(tSTUp); + if (do_phase) + phase += (tSTUin - t3P.D[idim]) * t3P.C[idim]; + STUp[idim][k] = tSTUp; } + deconv[k] = do_phase ? std::polar(TF(1)/phiHat, isign * phase) + : TF(1)/phiHat; } if (opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); From 0fd014244bcd1c2c73744164d46c9ceb97a204da Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 27 Jan 2025 11:40:00 +0100 Subject: [PATCH 2/4] dummy commit to trigger formatter --- src/finufft_core.cpp | 99 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 7b7a0ff77..6a0039131 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -225,53 +225,54 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, } template class KernelFseries { - private: - std::vector z, f; - - public: - /* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). - - Inputs: - opts - spreading opts object, needed to eval kernel (must be already set up) - - Barnett 2/8/17. openmp since cos slow 2/9/17. - To do (Nov 2024): replace evaluate_kernel by evaluate_kernel_horner. - */ - KernelFseries(const finufft_spread_opts &opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - std::vector Z(2*q), W(2*q); - legendre_compute_glr(2 * q, Z.data(), W.data()); // only half the nodes used, eg on (0,1) - z.resize(q); - f.resize(q); - for (int n = 0; n < q; ++n) { - z[n] = T(Z[n]*J2); // quadr nodes for [0,J/2] - f[n] = J2 * T(W[n]) * evaluate_kernel(z[n], opts); // w/ quadr weights - } +private: + std::vector z, f; + +public: + /* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + opts - spreading opts object, needed to eval kernel (must be already set up) + + Barnett 2/8/17. openmp since cos slow 2/9/17. + To do (Nov 2024): replace evaluate_kernel by evaluate_kernel_horner. + */ + KernelFseries(const finufft_spread_opts &opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + std::vector Z(2 * q), W(2 * q); + legendre_compute_glr(2 * q, Z.data(), W.data()); // only half the nodes used, eg on + // (0,1) + z.resize(q); + f.resize(q); + for (int n = 0; n < q; ++n) { + z[n] = T(Z[n] * J2); // quadr nodes for [0,J/2] + f[n] = J2 * T(W[n]) * evaluate_kernel(z[n], opts); // w/ quadr weights } + } - /* - Evaluates the Fourier transform of te kernel at a single point. + /* + Evaluates the Fourier transform of te kernel at a single point. - Inputs: - k - frequency, dual to the kernel's natural argument, ie exp(i.k.z) + Inputs: + k - frequency, dual to the kernel's natural argument, ie exp(i.k.z) - Outputs: - phihat - real Fourier transform evaluated at freq k - */ - T operator()(T k) { - T x = 0; - for (size_t n = 0; n < z.size(); ++n) - x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair. use T cos! - return x; - } + Outputs: + phihat - real Fourier transform evaluated at freq k + */ + T operator()(T k) { + T x = 0; + for (size_t n = 0; n < z.size(); ++n) + x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair. use T cos! + return x; + } }; template @@ -880,19 +881,17 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs TF phiHat = 1; - TF phase = 0; + TF phase = 0; for (int idim = 0; idim < dim; ++idim) { auto tSTUin = STU_in[idim][k]; // rescale the target s_k etc to s'_k etc... - auto tSTUp = - t3P.h[idim] * t3P.gam[idim] * (tSTUin - t3P.D[idim]); // so |s'_k| < pi/R + auto tSTUp = t3P.h[idim] * t3P.gam[idim] * (tSTUin - t3P.D[idim]); // so |s'_k| < + // pi/R phiHat *= fseries(tSTUp); - if (do_phase) - phase += (tSTUin - t3P.D[idim]) * t3P.C[idim]; + if (do_phase) phase += (tSTUin - t3P.D[idim]) * t3P.C[idim]; STUp[idim][k] = tSTUp; } - deconv[k] = do_phase ? std::polar(TF(1)/phiHat, isign * phase) - : TF(1)/phiHat; + deconv[k] = do_phase ? std::polar(TF(1) / phiHat, isign * phase) : TF(1) / phiHat; } if (opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); From a55354d8bcee0914366e22d97b30c0363b32d72f Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 27 Jan 2025 11:48:08 +0100 Subject: [PATCH 3/4] fix typo --- src/finufft_core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 6a0039131..55125f664 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -259,7 +259,7 @@ template class KernelFseries { } /* - Evaluates the Fourier transform of te kernel at a single point. + Evaluates the Fourier transform of the kernel at a single point. Inputs: k - frequency, dual to the kernel's natural argument, ie exp(i.k.z) From 8d36187bbe5ed06412612d376a6ef4054b168759 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 27 Jan 2025 19:46:11 +0100 Subject: [PATCH 4/4] use FINUFFT_ALWAYS_INLNE --- src/finufft_core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 55125f664..ff49efb19 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -267,7 +267,7 @@ template class KernelFseries { Outputs: phihat - real Fourier transform evaluated at freq k */ - T operator()(T k) { + FINUFFT_ALWAYS_INLINE T operator()(T k) { T x = 0; for (size_t n = 0; n < z.size(); ++n) x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair. use T cos!