From a33693965a4b0dcb943c3c2f7f1aff04baedb6a9 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 16 Oct 2021 13:26:12 +0200 Subject: [PATCH 1/5] first import --- benchees/Makefile.am | 7 +- benchees/duccfft/Makefile.am | 8 + benchees/duccfft/doit.cc | 79 + benchees/duccfft/ducc0/fft/fft.h | 2098 ++++++++++ benchees/duccfft/ducc0/fft/fft1d.h | 3158 +++++++++++++++ benchees/duccfft/ducc0/infra/aligned_array.h | 133 + benchees/duccfft/ducc0/infra/error_handling.h | 93 + benchees/duccfft/ducc0/infra/mav.h | 1154 ++++++ benchees/duccfft/ducc0/infra/misc_utils.h | 72 + benchees/duccfft/ducc0/infra/simd.h | 788 ++++ benchees/duccfft/ducc0/infra/threading.cc | 570 +++ benchees/duccfft/ducc0/infra/threading.h | 131 + benchees/duccfft/ducc0/infra/useful_macros.h | 21 + benchees/duccfft/ducc0/math/cmplx.h | 80 + benchees/duccfft/ducc0/math/unity_roots.h | 214 + benchees/pocketfft_cxx/Makefile.am | 8 + benchees/pocketfft_cxx/doit.cc | 80 + benchees/pocketfft_cxx/pocketfft_hdronly.h | 3578 +++++++++++++++++ configure.ac | 5 +- 19 files changed, 12274 insertions(+), 3 deletions(-) create mode 100644 benchees/duccfft/Makefile.am create mode 100644 benchees/duccfft/doit.cc create mode 100644 benchees/duccfft/ducc0/fft/fft.h create mode 100644 benchees/duccfft/ducc0/fft/fft1d.h create mode 100644 benchees/duccfft/ducc0/infra/aligned_array.h create mode 100644 benchees/duccfft/ducc0/infra/error_handling.h create mode 100644 benchees/duccfft/ducc0/infra/mav.h create mode 100644 benchees/duccfft/ducc0/infra/misc_utils.h create mode 100644 benchees/duccfft/ducc0/infra/simd.h create mode 100644 benchees/duccfft/ducc0/infra/threading.cc create mode 100644 benchees/duccfft/ducc0/infra/threading.h create mode 100644 benchees/duccfft/ducc0/infra/useful_macros.h create mode 100644 benchees/duccfft/ducc0/math/cmplx.h create mode 100644 benchees/duccfft/ducc0/math/unity_roots.h create mode 100644 benchees/pocketfft_cxx/Makefile.am create mode 100644 benchees/pocketfft_cxx/doit.cc create mode 100644 benchees/pocketfft_cxx/pocketfft_hdronly.h diff --git a/benchees/Makefile.am b/benchees/Makefile.am index 54dff6e..4b473a5 100644 --- a/benchees/Makefile.am +++ b/benchees/Makefile.am @@ -1,11 +1,14 @@ -SUBDIRS = acml arprec bloodworth burrus cross cwplib dfftpack dsp dxml \ +SUBDIRS = acml arprec bloodworth burrus cross cwplib dfftpack dsp duccfft dxml \ emayer esrfft essl ffmpeg ffte fftj fftpack fftreal fftw2 fftw3 fxt \ glassman goedecker gpfa green-ffts-2.0 gsl harm hp-mlib imsl intel-mkl \ intel-ipps jmfft kissfft krukar mfft minfft mixfft monnier morris \ -mpfun77 mpfun90 nag napack newsplit nr numutils ooura pocketfft qft \ +mpfun77 mpfun90 nag napack newsplit nr numutils ooura pocketfft pocketfft_cxx qft \ ransom rmayer scimark2c sciport sgimath singleton sorensen spiral-fft \ statlib sunperf temperton teneyck valkenburg vbigdsp vdsp +SUBDIRS = duccfft pocketfft_cxx pocketfft dfftpack + #fftw3 + EXTRA_DIST = Makefile.common distclean-local: diff --git a/benchees/duccfft/Makefile.am b/benchees/duccfft/Makefile.am new file mode 100644 index 0000000..4f1ecac --- /dev/null +++ b/benchees/duccfft/Makefile.am @@ -0,0 +1,8 @@ +PRG=doit + +AM_CPPFLAGS = $(INCLBENCH) + +doit_SOURCES=doit.cc +doit_LDADD=$(LIBBENCH) @FLIBS@ + +include ../Makefile.common diff --git a/benchees/duccfft/doit.cc b/benchees/duccfft/doit.cc new file mode 100644 index 0000000..7dc9c22 --- /dev/null +++ b/benchees/duccfft/doit.cc @@ -0,0 +1,79 @@ +/* this program is in the public domain */ + +#include +#include "bench-user.h" +#include 
"ducc0/infra/threading.cc" +#include "ducc0/fft/fft.h" +#include +#include + +using namespace std; +using namespace ducc0; + +BEGIN_BENCH_DOC +BENCH_DOC("name", "duccfft") +BENCH_DOC("author", "Martin Reinecke") +BENCH_DOC("year", "2021") +BENCH_DOC("version", "1.0") +BENCH_DOC("language", "C++") +BENCH_DOC("url", "https://gitlab.mpcdf.mpg.de/mtr/ducc") +BENCH_DOC("url-was-valid-on", "Fri Jul 23 23:06:24 ACST 2020") +BENCH_DOC("copyright", "GPLv2+") +END_BENCH_DOC + +int can_do(struct problem *p) +{ + return true; +} + +void copy_h2c(struct problem *p, bench_complex *out) +{ + copy_h2c_1d_fftpack(p, out, -1.0); +} + +void copy_c2h(struct problem *p, bench_complex *in) +{ + copy_c2h_1d_fftpack(p, in, -1.0); +} + + +void setup(struct problem *p) +{ + BENCH_ASSERT(can_do(p)); + // populate the transform cache + doit(1,p); +} + +void doit(int iter, struct problem *p) +{ + static fmav_info::shape_t shape(p->rank); + static fmav_info::shape_t axes(p->rank); + shape.resize(p->rank); + axes.resize(p->rank); + for (int i=0; irank; ++i) { + shape[i] = p->n[i]; + axes[i] = i; + } + + if (p->kind == PROBLEM_COMPLEX) { + auto in = reinterpret_cast *>(p->in); + auto out = reinterpret_cast *>(p->out); + cfmav> min(in, shape); + vfmav> mout(out, shape); + for (int i = 0; i < iter; ++i) { + c2c(min,mout,axes,p->sign==-1,bench_real(1)); + } + } else { + auto in = reinterpret_cast(p->in); + auto out = reinterpret_cast(p->out); + cfmav min(in, shape); + vfmav mout(out, shape); + for (int i = 0; i < iter; ++i) { + r2r_fftpack(min,mout,axes,p->sign==-1,p->sign==-1,bench_real(1)); + } + } +} + +void done(struct problem *p) +{ +} diff --git a/benchees/duccfft/ducc0/fft/fft.h b/benchees/duccfft/ducc0/fft/fft.h new file mode 100644 index 0000000..a33bfc7 --- /dev/null +++ b/benchees/duccfft/ducc0/fft/fft.h @@ -0,0 +1,2098 @@ +/* +This file is part of pocketfft. + +Copyright (C) 2010-2021 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +For the odd-sized DCT-IV transforms: + Copyright (C) 2003, 2007-14 Matteo Frigo + Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + +Authors: Martin Reinecke, Peter Bell + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DUCC0_FFT_H +#define DUCC0_FFT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/threading.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/mav.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/math/unity_roots.h" +#include "ducc0/fft/fft1d.h" + +/** \file fft.h + * Implementation of multi-dimensional Fast Fourier and related transforms + * \copyright Copyright (C) 2010-2021 Max-Planck-Society + * \copyright Copyright (C) 2019 Peter Bell + * \copyright + * \copyright For the odd-sized DCT-IV transforms: + * \copyright Copyright (C) 2003, 2007-14 Matteo Frigo + * \copyright Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + * + * \authors Martin Reinecke, Peter Bell + */ + +namespace ducc0 { + +namespace detail_fft { + +template constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template using fft_simd = typename simd_select>::type; +template constexpr inline bool fft_simd_exists = (fft_simdlen > 1); + +using shape_t=fmav_info::shape_t; +using stride_t=fmav_info::stride_t; + +constexpr bool FORWARD = true, + BACKWARD = false; + +struct util // hack to avoid duplicate symbols + { + static void sanity_check_axes(size_t ndim, const shape_t &axes) + { + if (ndim==1) + { + if ((axes.size()!=1) || (axes[0]!=0)) + throw std::invalid_argument("bad axes"); + return; + } + shape_t tmp(ndim,0); + if (axes.empty()) throw std::invalid_argument("no axes specified"); + for (auto ax : axes) + { + if (ax>=ndim) throw std::invalid_argument("bad axis number"); + if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly"); + } + } + + DUCC0_NOINLINE static void sanity_check_onetype(const fmav_info &a1, + const fmav_info &a2, bool inplace, const shape_t &axes) + { + sanity_check_axes(a1.ndim(), axes); + MR_assert(a1.conformable(a2), "array sizes are not conformable"); + if (inplace) MR_assert(a1.stride()==a2.stride(), "stride mismatch"); + } + DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac, + const fmav_info &ar, const shape_t &axes) + { + sanity_check_axes(ac.ndim(), axes); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i=ac.ndim()) throw std::invalid_argument("bad axis number"); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i class T_dct1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dct1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length-1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); 
+ size_t N=fftplan.length(), n=N/2+1; + if (ortho) + { c[0]*=sqrt2; c[n-1]*=sqrt2; } + auto tmp=&buf[0]; + tmp[0] = c[0]; + for (size_t i=1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, 1, true, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + quick_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, ortho, 1, true, nthreads); + } + + size_t length() const { return fftplan.length()/2+1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dst1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dst1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length+1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + size_t N=fftplan.length(), n=N/2-1; + auto tmp = &buf[0]; + tmp[0] = tmp[n+1] = c[0]*0; + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, true, 1, false, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads) const + { + quick_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, true, 1, false, nthreads); + } + + size_t length() const { return fftplan.length()/2-1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dcst23 + { + private: + pocketfft_r fftplan; + std::vector twiddle; + + public: + DUCC0_NOINLINE T_dcst23(size_t length, bool /*vectorize*/=false) + : fftplan(length), twiddle(length) + { + UnityRoots> tw(4*length); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + size_t NS2 = (N+1)/2; + if (type==2) + { + if (!cosine) + for (size_t k=1; k DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool ortho, int type, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, type, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + quick_array buf(bufsize()); + exec(c, &buf[0], fct, ortho, type, cosine, nthreads); + } + + size_t length() const { return fftplan.length(); } + size_t bufsize() const { return fftplan.bufsize(); } + }; + +template class T_dcst4 + { + private: + size_t N; + std::unique_ptr> fft; + std::unique_ptr> rfft; + quick_array> C2; + + public: + DUCC0_NOINLINE T_dcst4(size_t length, bool /*vectorize*/=false) + : N(length), + fft((N&1) ? nullptr : make_unique>(N/2)), + rfft((N&1)? make_unique>(N) : nullptr), + C2((N&1) ? 0 : N/2) + { + if ((N&1)==0) + { + UnityRoots> tw(16*N); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T /*buf*/[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads) const + { + size_t n2 = N/2; + if (!cosine) + for (size_t k=0, kc=N-1; k y(N); + { + size_t i=0, m=n2; + for (; mexec(y.data(), fct, true, nthreads); + { + auto SGN = [](size_t i) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + return (i&2) ? 
-sqrt2 : sqrt2; + }; + c[n2] = y[0]*SGN(n2+1); + size_t i=0, i1=1, k=1; + for (; k> y(n2); + for(size_t i=0; iexec(y.data(), fct, true, nthreads); + for(size_t i=0, ic=n2-1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, true, 4, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + quick_array buf(bufsize()); + exec(c, &buf[0], fct, true, 4, cosine, nthreads); + } + + size_t length() const { return N; } +//FIXME: use buffers properly! + size_t bufsize() const { return 0; } + }; + + +// +// multi-D infrastructure +// + +template std::shared_ptr get_plan(size_t length, bool vectorize=false) + { +#ifdef DUCC0_NO_FFT_CACHE + return std::make_shared(length, vectorize); +#else + constexpr size_t nmax=10; + struct entry { size_t n; bool vectorize; std::shared_ptr ptr; }; + static std::array cache{{0,0,nullptr}}; + static std::array last_access{{0}}; + static size_t access_counter = 0; +#ifndef DUCC0_NO_THREADING + static std::mutex mut; +#endif + + auto find_in_cache = [&]() -> std::shared_ptr + { + for (size_t i=0; i lock(mut); +#endif + auto p = find_in_cache(); + if (p) return p; + } + auto plan = std::make_shared(length, vectorize); + { +#ifndef DUCC0_NO_THREADING + std::lock_guard lock(mut); +#endif + auto p = find_in_cache(); + if (p) return p; + + size_t lru = 0; + for (size_t i=1; i class multi_iter + { + private: + shape_t shp, pos; + stride_t str_i, str_o; + size_t cshp_i, cshp_o, rem; + ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N]; + bool uni_i, uni_o; + + void advance_i() + { + for (size_t i=0; i=1, "not enough dimensions"); + // Sort the extraneous dimensions in order of ascending output stride; + // this should improve overall cache re-use and avoid clashes between + // threads as much as possible. 
+ shape_t idx(iarr.ndim()); + std::iota(idx.begin(), idx.end(), 0); + sort(idx.begin(), idx.end(), + [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);}); + for (auto i: idx) + if (i!=idim) + { + pos.push_back(0); + MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch"); + shp.push_back(iarr.shape(i)); + str_i.push_back(iarr.stride(i)); + str_o.push_back(oarr.stride(i)); + } + MR_assert(idim0) + { + sstr_i = str_i[0]; + sstr_o = str_o[0]; + } + + if (nshares==1) return; + if (nshares==0) throw std::runtime_error("can't run with zero threads"); + if (myshare>=nshares) throw std::runtime_error("impossible share requested"); + auto [lo, hi] = calcShare(nshares, myshare, rem); + size_t todo = hi-lo; + + size_t chunk = rem; + for (size_t i2=0, i=pos.size()-1; i2(stride_in() *tsz)&4095)==0) + || ((abs(stride_out()*tsz)&4095)==0); + } + bool critical_stride_other(size_t tsz) const + { + if (unistride_i()==0) return false; // it's just one transform + return ((abs(unistride_i()*tsz)&4095)==0) + || ((abs(unistride_o()*tsz)&4095)==0); + } + }; + +template class TmpStorage + { + private: + aligned_array d; + size_t dofs, dstride; + + public: + TmpStorage(size_t n_trafo, size_t bufsize_data, size_t bufsize_trafo, + size_t n_simultaneous, bool inplace) + { + if (inplace) + { + d.resize(bufsize_trafo); + return; + } + constexpr auto vlen = fft_simdlen; + // FIXME: when switching to C++20, use bit_floor(othersize) + size_t buffct = std::min(vlen, n_trafo); + size_t datafct = std::min(vlen, n_trafo); + if (n_trafo>=n_simultaneous*vlen) datafct = n_simultaneous*vlen; + dstride = bufsize_data; + // critical stride avoidance + if ((dstride&256)==0) dstride+=3; + d.resize(buffct*(bufsize_trafo+17) + datafct*dstride); + dofs = bufsize_trafo + 17; + } + + template T2 *transformBuf() + { return reinterpret_cast(d.data()); } + template T2 *dataBuf() + { return reinterpret_cast(d.data()) + dofs; } + size_t data_stride() const + { return dstride; } + }; + +template class TmpStorage2 + { + private: + TmpStorage &stg; + + public: + using datatype = T2; + TmpStorage2(TmpStorage &stg_): stg(stg_) {} + + T2 *transformBuf() { return stg.template transformBuf(); } + T2 *dataBuf() { return stg.template dataBuf(); } + size_t data_stride() const { return stg.data_stride(); } + }; + +// Yes, this looks strange. But this is currently the only way I found to +// stop compilers from vectorizing the copying loops and messing up the ordering +// of the memory accesses, which is really important here. 
+template DUCC0_NOINLINE void copy_inputx2(const Titer &it, + const cfmav> &src, Ts *DUCC0_RESTRICT dst, size_t vlen) + { + for (size_t i=0; i DUCC0_NOINLINE void copy_inputx(const Titer &it, + const cfmav> &src, Ts *DUCC0_RESTRICT dst, size_t vlen) + { + if (it.stride_in()==1) + return copy_inputx2(it, src, dst, vlen); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + copy_inputx(it, src, reinterpret_cast(dst),vlen); + } + +template DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T *DUCC0_RESTRICT dst) + { + if (dst == &src.raw(it.iofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_outputx2(const Titer &it, + const Ts *DUCC0_RESTRICT src, vfmav> &dst, size_t vlen) + { + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_outputx(const Titer &it, + const Ts *DUCC0_RESTRICT src, vfmav> &dst, size_t vlen) + { + if (it.stride_out()==1) + return copy_outputx2(it,src,dst,vlen); + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx *DUCC0_RESTRICT src, vfmav> &dst) + { + constexpr auto vlen=Tsimd::size(); + copy_outputx(it, reinterpret_cast(src), dst, vlen); + } + +template DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd *DUCC0_RESTRICT src, vfmav &dst) + { + constexpr auto vlen=Tsimd::size(); + auto ptr=dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T *DUCC0_RESTRICT src, vfmav &dst) + { + auto ptr=dst.data(); + if (src == &dst.raw(it.oofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx *dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx *dst, size_t nvec, size_t vstr) + { + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd *dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T *dst, size_t nvec, size_t vstr) + { + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx *src, vfmav> &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx *src, vfmav> &dst, size_t nvec, size_t vstr) + { + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd *src, vfmav &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T *src, vfmav &dst, size_t nvec, size_t vstr) + { + T * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i struct add_vec + { using type = typename simd_select::type; }; +template struct add_vec, vlen> + { using type = Cmplx::type>; }; +template using add_vec_t = typename add_vec::type; + +template +DUCC0_NOINLINE void general_nd(const cfmav &in, vfmav &out, + const shape_t &axes, T0 fct, 
size_t nthreads, const Exec &exec, + const bool /*allow_inplace*/=true) + { + if ((in.ndim()==1)&&(in.stride(0)==1)&&(out.stride(0)==1)) + { + auto plan = get_plan(in.shape(0), true); + exec.exec_simple(in.data(), out.data(), *plan, fct, nthreads); + return; + } + std::shared_ptr plan; + size_t nth1d = (in.ndim()==1) ? nthreads : 1; + bool inplace = (out.ndim()==1)&&(out.stride(0)==1); + + for (size_t iax=0; iaxlength())) + plan = get_plan(len, in.ndim()==1); + + execParallel( + util::thread_count(nthreads, in, axes[iax], fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + constexpr size_t nmax = 16; + const auto &tin(iax==0? in : out); + multi_iter it(tin, out, axes[iax], sched.num_threads(), sched.thread_num()); + size_t nvec = 1; + if (it.critical_stride_trans(sizeof(T))) // do bunches of transforms + nvec = nmax/vlen; + TmpStorage storage(in.size()/len, len, plan->bufsize(), nvec, inplace); + + if (nvec>1) + { +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=vlen*nvec) + { + it.advance(vlen*nvec); + exec.exec_n(it, tin, out, storage2, *plan, fct, nvec, nth1d); + } + } +#endif + { + TmpStorage2 storage2(storage); + while (it.remaining()>=nvec) + { + it.advance(nvec); + exec.exec_n(it, tin, out, storage2, *plan, fct, nvec, nth1d); + } + } + } + +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=vlen) + { + it.advance(vlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + { + TmpStorage2,T,T0> storage2(storage); + if (it.remaining()>=vlen/2) + { + it.advance(vlen/2); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + if constexpr (vlen>4) + if constexpr (simd_exists) + { + TmpStorage2,T,T0> storage2(storage); + if (it.remaining()>=vlen/4) + { + it.advance(vlen/4); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } +#endif + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, tin, out, storage2, *plan, fct, nth1d, inplace); + } + } + }); // end of parallel region + fct = T0(1); // factor has been applied, use 1 for remaining axes + } + } + +struct ExecC2C + { + bool forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav> &in, + vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, + size_t nthreads, bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same, T>::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()); + plan.exec_copyback(out.data(), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav> &in, + vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const Cmplx *in, Cmplx *out, const pocketfft_c &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct 
ExecHartley + { + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, vfmav &out, + Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()); + plan.exec_copyback(out.data(), storage.transformBuf(), fct, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + vfmav &out, Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_hartley &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, nthreads); + } + }; + +struct ExecFFTW + { + bool forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, vfmav &out, + Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()); + plan.exec_copyback(out.data(), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + vfmav &out, Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_fftw &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct ExecDcst + { + bool ortho; + int type; + bool cosine; + + template + DUCC0_NOINLINE void operator() (const Titer &it, const cfmav &in, + vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()); + plan.exec_copyback(out.data(), storage.transformBuf(), fct, ortho, type, cosine, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, 
in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const Tplan &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, ortho, type, cosine, nthreads); + } + }; + +template DUCC0_NOINLINE void general_r2c( + const cfmav &in, vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? nthreads : 1; + auto plan = std::make_unique>(in.shape(axis)); + size_t len=in.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j4) + if constexpr( simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + vout[it.oofs(0)].Set(res[0]); + size_t i=1, ii=1; + if (forward) + for (; i DUCC0_NOINLINE void general_c2r( + const cfmav> &in, vfmav &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? 
nthreads : 1; + auto plan = std::make_unique>(out.shape(axis)); + size_t len=out.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(out.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + if constexpr (vlen>4) + if constexpr(simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } +#endif + { + TmpStorage2 storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + dbuf[0]=in.raw(it.iofs(0)).r; + { + size_t i=1, ii=1; + if (forward) + for (; iexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + }); // end of parallel region + } + +struct ExecR2R + { + bool r2c, forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, + const pocketfft_r &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + T *buf1=storage.transformBuf(), *buf2=out.data(); + if (in.data()!=buf2) + copy_input(it, in, buf2); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + vfmav &out, Tstorage &storage, const pocketfft_r &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + if ((!r2c) && forward) + for (size_t k=0; k DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_r &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void c2c(const cfmav> &in, + vfmav> &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const auto &in2(reinterpret_cast >&>(in)); + auto &out2(reinterpret_cast >&>(out)); + if ((axes.size()>1) && (in.data()!=out.data())) // optimize axis order + for (size_t i=1; i>(in2, out2, axes2, fct, nthreads, ExecC2C{forward}); + return; + } + general_nd>(in2, out2, axes, fct, nthreads, ExecC2C{forward}); + } + +/// Fast Discrete Cosine Transform +/** This executes a DCT on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. 
+ * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DCT is computed, otherwise an inverse DCT. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void dct(const cfmav &in, vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, true}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +/// Fast Discrete Sine Transform +/** This executes a DST on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DST is computed, otherwise an inverse DST. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. 
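+ *
+ * A minimal usage sketch (illustrative only; \c ptr_in, \c ptr_out and \c n
+ * are placeholders, not part of this patch, and dct() above follows the same
+ * calling pattern): a type-2 DST over axis 0 of a contiguous double array,
+ * without normalization, could be invoked as
+ * \code
+ * ducc0::cfmav<double> in(ptr_in, {n});    // read-only view of the input array
+ * ducc0::vfmav<double> out(ptr_out, {n});  // writable view of the output array
+ * ducc0::dst(in, out, {0}, 2, 1., false);  // axes={0}, type=2, fct=1, ortho=false
+ * \endcode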
+ */ +template DUCC0_NOINLINE void dst(const cfmav &in, vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, false}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads=1) + { + util::sanity_check_cr(out, in, axis); + if (in.size()==0) return; + auto &out2(reinterpret_cast>&>(out)); + general_r2c(in, out2, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + vfmav> &out, const shape_t &axes, + bool forward, T fct, size_t nthreads=1) + { + util::sanity_check_cr(out, in, axes); + if (in.size()==0) return; + r2c(in, out, axes.back(), forward, fct, nthreads); + if (axes.size()==1) return; + + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(out, out, newaxes, forward, T(1), nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + vfmav &out, size_t axis, bool forward, T fct, size_t nthreads=1) + { + util::sanity_check_cr(in, out, axis); + if (in.size()==0) return; + const auto &in2(reinterpret_cast>&>(in)); + general_c2r(in2, out, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads=1) + { + if (axes.size()==1) + return c2r(in, out, axes[0], forward, fct, nthreads); + util::sanity_check_cr(in, out, axes); + if (in.size()==0) return; + auto atmp(vfmav>::build_noncritical(in.shape(), UNINITIALIZED)); + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(in, atmp, newaxes, forward, T(1), nthreads); + c2r(atmp, out, axes.back(), forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2r_fftpack(const cfmav &in, + vfmav &out, const shape_t &axes, bool real2hermitian, bool forward, + T fct, size_t nthreads=1) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); + } + +template DUCC0_NOINLINE void r2r_fftw(const cfmav &in, + vfmav &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecFFTW{forward}); + } + +template DUCC0_NOINLINE void r2r_separable_hartley(const cfmav &in, + vfmav &out, const shape_t &axes, T fct, size_t nthreads=1) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecHartley{}, false); + } + +template void hermiteHelper(size_t idim, ptrdiff_t iin, + ptrdiff_t iout0, ptrdiff_t iout1, const cfmav &c, + vfmav &r, const shape_t &axes, Func func, size_t nthreads) + { + auto cstr=c.stride(idim), str=r.stride(idim); + auto len=r.shape(idim); + + if (idim+1==c.ndim()) // last dimension, not much gain in parallelizing + { + if (idim==axes.back()) // halfcomplex axis + for (size_t i=0; i void oscarize(vfmav &data, size_t ax0, size_t ax1, + size_t nthreads) + { + vfmav d(data); + // sort axes to have decreasing strides from ax0 to 
ax1 + if (d.stride(ax0)([nthreads](const auto &plane) + { + auto nu=plane.shape(0), nv=plane.shape(1); + execParallel((nu+1)/2-1, nthreads, [&](size_t lo, size_t hi) + { + for(auto i=lo+1; i void oscarize3(vfmav &data, size_t ax0, size_t ax1, size_t ax2, + size_t nthreads) + { + vfmav d(data); + // sort axes to have decreasing strides from ax0 to ax2 + if (d.stride(ax0)([nthreads](const auto &plane) + { + auto nu=plane.shape(0), nv=plane.shape(1), nw=plane.shape(2); + execParallel(nu/2+1, nthreads, [&](size_t lo, size_t hi) + { + for(auto i=lo, xi=(i==0)?0:nu-i; i void r2r_genuine_hartley(const cfmav &in, + vfmav &out, const shape_t &axes, T fct, size_t nthreads=1) + { + if (axes.size()==1) + return r2r_separable_hartley(in, out, axes, fct, nthreads); + if (axes.size()==2) + { + r2r_separable_hartley(in, out, axes, fct, nthreads); + oscarize(out, axes[0], axes[1], nthreads); + return; + } + if (axes.size()==3) + { + r2r_separable_hartley(in, out, axes, fct, nthreads); + oscarize3(out, axes[0], axes[1], axes[2], nthreads); + return; + } + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + shape_t tshp(in.shape()); + tshp[axes.back()] = tshp[axes.back()]/2+1; + auto atmp(vfmav>::build_noncritical(tshp, UNINITIALIZED)); + r2c(in, atmp, axes, true, fct, nthreads); + hermiteHelper(0, 0, 0, 0, atmp, out, axes, [](const std::complex &c, T &r0, T &r1) + { +#ifdef DUCC0_USE_PROPER_HARTLEY_CONVENTION + r0 = c.real()-c.imag(); + r1 = c.real()+c.imag(); +#else + r0 = c.real()+c.imag(); + r1 = c.real()-c.imag(); +#endif + }, nthreads); + } + +template aligned_array alloc_tmp_conv_axis + (const fmav_info &info, size_t axis, size_t len, size_t bufsize) + { + auto othersize = info.size()/info.shape(axis); + constexpr auto vlen = fft_simdlen; + return aligned_array((len+bufsize)*std::min(vlen, othersize)); + } + +template +DUCC0_NOINLINE void general_convolve_axis(const cfmav &in, vfmav &out, + const size_t axis, const cmav &kernel, size_t nthreads, + const Exec &exec) + { + std::unique_ptr plan1, plan2; + + size_t l_in=in.shape(axis), l_out=out.shape(axis); + MR_assert(kernel.size()==l_in, "bad kernel size"); + plan1 = std::make_unique(l_in); + plan2 = std::make_unique(l_out); + size_t bufsz = max(plan1->bufsize(), plan2->bufsize()); + + vmav fkernel({kernel.shape(0)}); + for (size_t i=0; iexec(fkernel.data(), T0(1)/T0(l_in), true, nthreads); + + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/l_in, l_in+l_out, bufsz, 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=vlen) + { + it.advance(vlen); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/2); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + if constexpr (vlen>4) + if constexpr (simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/4); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } +#endif + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + }); // end of parallel region + } + +struct ExecConv1R + { + template void 
operator() ( + const Titer &it, const cfmav &in, vfmav &out, + Tstorage &storage, const pocketfft_r &plan1, const pocketfft_r &plan2, + const cmav &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + plan1.exec_copyback(buf2, buf1, T0(1), true); + auto res = buf2; + { + res[0] *= fkernel(0); + size_t i; + for (i=1; 2*i t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + auto t3 = t1*t2; + res[2*i-1] = t3.r; + res[2*i] = t3.i; + } + if (2*i==l_min) + { + if (l_min t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + res[2*i-1] = (t1*t2).r*T0(2); + } + else + res[2*i-1] *= fkernel(2*i-1); + } + } + for (size_t i=l_in; i void operator() ( + const Titer &it, const cfmav> &in, vfmav> &out, + Tstorage &storage, const pocketfft_c &plan1, const pocketfft_c &plan2, + const cmav,1> &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan1.exec(buf2, buf1, T0(1), true); + auto res2 = buf2+l_in; + { + res2[0] = res[0]*fkernel(0); + size_t i; + for (i=1; 2*i DUCC0_NOINLINE void convolve_axis(const cfmav &in, + vfmav &out, size_t axis, const cmav &kernel, size_t nthreads=1) + { + MR_assert(axis, T>(in, out, axis, kernel, nthreads, + ExecConv1R()); + } +template DUCC0_NOINLINE void convolve_axis(const cfmav> &in, + vfmav> &out, size_t axis, const cmav,1> &kernel, + size_t nthreads=1) + { + MR_assert(axis>&>(in)); + auto &out2(reinterpret_cast>&>(out)); + const auto &kernel2(reinterpret_cast,1>&>(kernel)); + general_convolve_axis, T>(in2, out2, axis, kernel2, nthreads, + ExecConv1C()); + } + +} // namespace detail_fft + +using detail_fft::FORWARD; +using detail_fft::BACKWARD; +using detail_fft::c2c; +using detail_fft::c2r; +using detail_fft::r2c; +using detail_fft::r2r_fftpack; +using detail_fft::r2r_fftw; +using detail_fft::r2r_separable_hartley; +using detail_fft::r2r_genuine_hartley; +using detail_fft::dct; +using detail_fft::dst; +using detail_fft::convolve_axis; + +} // namespace ducc0 + +#endif // POCKETFFT_HDRONLY_H diff --git a/benchees/duccfft/ducc0/fft/fft1d.h b/benchees/duccfft/ducc0/fft/fft1d.h new file mode 100644 index 0000000..484a7f0 --- /dev/null +++ b/benchees/duccfft/ducc0/fft/fft1d.h @@ -0,0 +1,3158 @@ +/* +This file is part of pocketfft. + +Copyright (C) 2010-2021 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +Authors: Martin Reinecke, Peter Bell + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DUCC0_FFT1D_H +#define DUCC0_FFT1D_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/threading.h" +#include "ducc0/math/unity_roots.h" + +//#define DUCC0_USE_PROPER_HARTLEY_CONVENTION + +namespace ducc0 { + +namespace detail_fft { + +using namespace std; + +template constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template using fft1d_simd = typename simd_select>::type; +template constexpr inline bool fft1d_simd_exists = (fft1d_simdlen > 1); + +// Always use std:: for functions +template T cos(T) = delete; +template T sin(T) = delete; +template T sqrt(T) = delete; + +template inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } +template inline void PMINPLACE(T &a, T &b) + { T t = a; a+=b; b=t-b; } +template inline void MPINPLACE(T &a, T &b) + { T t = a; a-=b; b=t+b; } +template void special_mul (const Cmplx &v1, const Cmplx &v2, Cmplx &res) + { + res = fwd ? Cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : Cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } + +template void ROTX90(Cmplx &a) + { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? 
a.i : -a.i; a.i=tmp_; } + +template inline auto tidx() { return type_index(typeid(T)); } + +struct util1d // hack to avoid duplicate symbols + { + /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ + DUCC0_NOINLINE static size_t good_size_cmplx(size_t n) + { + if (n<=12) return n; + + size_t bestfac=2*n; + for (size_t f11=1; f11n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + /* returns the smallest composite of 2, 3, 5 which is >= n */ + DUCC0_NOINLINE static size_t good_size_real(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f5=1; f5n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + DUCC0_NOINLINE static vector prime_factors(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&1)==0) + { N>>=1; factors.push_back(2); } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + }; + +template using Troots = shared_ptr>>; + +// T: "type", f/c: "float/complex", s/v: "scalar/vector" +template class cfftpass + { + public: + virtual ~cfftpass(){} + using Tcs = Cmplx; + + // number of Tcd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + factors.reserve(15); + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads=1) const \ + { \ + static const auto tics = tidx(); \ + if (ti==tics) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +template using Tcpass = shared_ptr>; + +template class cfftp1: public cfftpass + { + public: + cfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class cfftp2: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=2; + quick_array wa; + + auto WA(size_t i) const + { return wa[i-1]; } + + template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, + Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); + } + } + return ch; + } + } + + public: + cfftp2(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp3: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=3; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r=-0.5, + tw1i= (fwd ? 
-1: 1) * Tfs(0.8660254037844386467637231707529362L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP3(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2; \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + CH(idx,k,0)=t0+t1; +#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + } +#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp4: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=4; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k(t4); + PM(CH(k,0),CH(k,2),t2,t3); + PM(CH(k,1),CH(k,3),t1,t4); + } + } + else + { + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); + } + for (size_t i=1; i(t4); + CH(i,k,0) = t2+t3; + special_mul(t1+t4,WA(0,i),CH(i,k,1)); + special_mul(t2-t3,WA(1,i),CH(i,k,2)); + special_mul(t1-t4,WA(2,i),CH(i,k,3)); + } + } + } + return ch; + } + + public: + cfftp4(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp5: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=5; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), + tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), + tw2r= Tfs(-0.8090169943749474241022934171828191L), + tw2i= (fwd ? 
-1: 1) * Tfs(0.5877852522924731291687059546390728L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP5(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + CH(idx,k,0).r=t0.r+t1.r+t2.r; \ + CH(idx,k,0).i=t0.i+t1.i+t2.i; + +#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + } + +#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb,da,db; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp7: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=7; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), + tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), + tw2r= Tfs(-0.2225209339563144042889025644967948L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), + tw3r= Tfs(-0.9009688679024191262361023195074451L), + tw3i= (fwd ? 
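+// The products written without an operator in the PARTSTEP macros of the
+// odd-radix passes (e.g. "cb.i=twai*t4.r twbi*t3.r;" above) are not typos:
+// the twiddle arguments appear to be passed with an explicit leading '+' or
+// '-' at the call sites, so the preprocessor splices the sign in between the
+// two terms. A call of the (assumed) form
+//   POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+// would expand "twai*t4.r twbi*t3.r" into "+tw2i*t4.r -tw1i*t3.r".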
-1 : 1) * Tfs(0.433883739117558120475768332848359L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP7(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ + PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ + PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ + PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; + +#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ + { \ + Tcd ca,cb; \ + ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ + ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ + cb.i=y1*t7.r y2*t6.r y3*t5.r; \ + cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp11: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=11; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template [[gnu::hot]] Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), + tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), + tw2r= Tfs(0.4154150130018864255292741492296232L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), + tw3r= Tfs(-0.1423148382732851404437926686163697L), + tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), + tw4r= Tfs(-0.6548607339452850640569250724662936L), + tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L), + tw5r= Tfs(-0.9594929736144973898903680570663277L), + tw5i= (fwd ? 
-1 : 1) * Tfs(0.2817325568414296977114179153466169L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP11(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; + +#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ + { \ + Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ + cb; \ + cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ + cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftpg: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + size_t ip; + quick_array wa; + quick_array csarr; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + size_t ipph = (ip+1)/2; + size_t idl1 = ido*l1; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+l1*c)]; }; + auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iwal-=ip; + Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + Tcs xwal=fwd ? 
csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); + idij=(jc-1)*(ido-1)+i-1; + special_mul(x2,wa[idij],CX(i,k,jc)); + } + } + } + return cc; + } + + public: + cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) + { + MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class cfftpblue: public cfftpass + { + private: + using typename cfftpass::Tcs; + + const size_t l1, ido, ip; + const size_t ip2; + const Tcpass subplan; + quick_array wa, bk, bkf; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, + Tcd * DUCC0_RESTRICT buf, size_t nthreads) const + { + static const auto ti=tidx(); + Tcd *akf = &buf[0]; + Tcd *akf2 = &buf[ip2]; + Tcd *subbuf = &buf[2*ip2]; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME: parallelize here? + for (size_t k=0; k(CC(i,m,k),bk[m],akf[m]); + auto zero = akf[0]*Tfs(0); + for (size_t m=ip; m(subplan->exec(ti,akf,akf2, + subbuf, true, nthreads)); + + /* do the convolution */ + res[0] = res[0].template special_mul(bkf[0]); + for (size_t m=1; m<(ip2+1)/2; ++m) + { + res[m] = res[m].template special_mul(bkf[m]); + res[ip2-m] = res[ip2-m].template special_mul(bkf[m]); + } + if ((ip2&1)==0) + res[ip2/2] = res[ip2/2].template special_mul(bkf[ip2/2]); + + /* inverse FFT */ + res = static_cast(subplan->exec(ti, res, + (res==akf) ? akf2 : akf, subbuf, false, nthreads)); + + /* multiply by b_k and write to output buffer */ + if (l1>1) + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CH(i,k,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + else + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CC(i,0,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + } + + return (l1>1) ? ch : cc; + } + + public: + cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots &roots, + bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), + subplan(cfftpass::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), + bk(ip), bkf(ip2/2+1) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jsize()/(2*ip))*2*ip==roots->size()) ? + roots : make_shared>(2*ip); + size_t rfct2 = roots2->size()/(2*ip); + for (size_t m=1; m=2*ip) coeff-=2*ip; + bk[m] = (*roots2)[coeff*rfct2]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
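+ Background for this step (sign conventions may differ slightly from the
+ code): with the chirp b_m = exp(i*pi*m^2/ip), Bluestein's identity
+   X_k = conj(b_k) * sum_j x_j * conj(b_j) * b_(k-j)
+ turns the length-ip DFT into a convolution, evaluated with forward and
+ inverse FFTs of the padded length ip2 >= 2*ip-1 chosen as a highly composite
+ size; bk holds b_m and bkf the FFT of the zero-padded chirp.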
*/ + quick_array tbkf(ip2), tbkf2(ip2); + Tfs xn2 = Tfs(1)/Tfs(ip2); + tbkf[0] = bk[0]*xn2; + for (size_t m=1; m buf(subplan->bufsize()); + static const auto tics=tidx(); + auto res = static_cast(subplan->exec(tics, tbkf.data(), + tbkf2.data(), buf.data(), true)); + for (size_t i=0; i1; + bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); + } + + virtual size_t bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class cfft_multipass: public cfftpass + { + private: + using typename cfftpass::Tcs; + static constexpr size_t bunchsize=8; + + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + size_t rfct; + Troots myroots; + +// FIXME split into sub-functions. This is too long! + template Cmplx *exec_(Cmplx *cc, Cmplx *ch, + Cmplx *buf, size_t nthreads) const + { + using Tc = Cmplx; + if ((l1==1) && (ido==1)) // no chance at vectorizing + { + static const auto tic=tidx(); + Tc *p1=cc, *p2=ch; + for(const auto &pass: passes) + { + auto res = static_cast(pass->exec(tic, p1, p2, buf, + fwd, nthreads)); + if (res==p2) swap (p1,p2); + } + return p1; + } + else + { + if constexpr(is_same::value && fft1d_simd_exists) // we can vectorize! + { + using Tfv = fft1d_simd; + using Tcv = Cmplx; + constexpr size_t vlen = Tfv::size(); + size_t nvtrans = (l1*ido + vlen-1)/vlen; + static const auto ticv = tidx(); + + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tc& + { return ch[b+l1*c]; }; + auto CC = [cc,this](size_t b, size_t c) -> Tc& + { return cc[b+ip*c]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + quick_array tbuf(2*ip+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip]; + auto buf2 = &tbuf[2*ip]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m Tc& + { return cc[a+ido*b]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + quick_array tbuf(2*ip+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip]; + auto buf2 = &tbuf[2*ip]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= ido) break; + if (i==0) + CC(0,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*m*i]); + } + } + } + }); + return cc; + } + +MR_fail("must not get here"); +#if 0 +//FIXME this code path is currently unused + quick_array tbuf(2*ip+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip]; + auto buf2 = &tbuf[2*ip]; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME parallelize? + for (size_t itrans=0; itrans ix, kx; + size_t ixcur = (itrans*vlen)%ido; + size_t kxcur = (itrans*vlen)/ido; + for (size_t n=0; n(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= l1*ido) break; + if (i==0) + CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*l1*m*i]); + } + } + } + return ch; +#endif + } + else + { + static const auto tic = tidx *>(); + if (ido==1) + { +// parallelize here! 
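+// This branch is the scalar fallback of cfft_multipass. The vectorized branch
+// above (taken when a matching SIMD type exists) packs vlen independent
+// length-ip sub-transforms into the lanes of Cmplx<simd> scratch buffers,
+// runs every sub-pass once on the packed data, and scatters the lanes back
+// while applying the inter-pass twiddles w^(m*i) (w^(l1*m*i) in the general
+// case). Rough shape of the lane gathering, for illustration only:
+//   for (size_t n=0; n<vlen; ++n)        // lane n <- n-th sub-transform
+//     for (size_t m=0; m<ip; ++m)
+//       { cc2[m].r[n] = in_n[m].r; cc2[m].i[n] = in_n[m].i; }
+// where in_n denotes the n-th sub-transform's input; this helper notation
+// does not appear in the real code.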
+ for (size_t n=0; n *p1=&cc[n*ip], *p2=ch; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf, fwd)); + if (res==p2) swap (p1,p2); + } + if (res != &cc[n*ip]) + copy(res, res+ip, cc+n*ip); + } + // transpose + size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; +// parallelize here! + for (size_t ibunch=0; ibunch Tc& + { return cc[a+ido*b]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*m*i]); + } + } + } + for (size_t m=0; m Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch ix, kx; + size_t ixcur = (ibunch*bunchsize)%ido; + size_t kxcur = (ibunch*bunchsize)/ido; + for (size_t n=0; n *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*l1*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*l1*m*i]); + } + } + } + for (size_t m=0; m &roots, bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + myroots(roots) + { + size_t N=ip*l1*ido; + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + + // FIXME TBD +// do we need the vectorize flag at all? + size_t lim = vectorize ? 100000 : 100000; + if (ip<=lim) + { + auto factors = cfftpass::factorize(ip); + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(cfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); + l1l*=fct; + } + } + else + { + vector packets(2,1); + auto factors = util1d::prime_factors(ip); + sort(factors.begin(), factors.end(), std::greater()); + for (auto fct: factors) + (packets[0]>packets[1]) ? 
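+// Greedy split, used only when ip exceeds the 'lim' threshold above: the
+// prime factors are sorted in descending order and each factor is multiplied
+// into whichever of the two packets is currently smaller, yielding two
+// roughly balanced sub-lengths that are then handled as a two-pass
+// decomposition. Worked example (hypothetical input): ip=360 with factors
+// {5,3,3,2,2,2} gives (1,1)->(5,1)->(5,3)->(5,9)->(10,9)->(10,18)->(20,18),
+// i.e. 360 = 20*18.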
packets[1]*=fct : packets[0]*=fct; + size_t l1l=1; + for (auto pkt: packets) + { + passes.push_back(cfftpass::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false)); + l1l*=pkt; + } + } + for (const auto &pass: passes) + { + bufsz = max(bufsz, pass->bufsize()); + need_cpy |= pass->needs_copy(); + } + if ((l1!=1)||(ido!=1)) + { + need_cpy=true; + bufsz += (bunchsize+1)*ip; + } + } + + virtual size_t bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +#undef POCKETFFT_EXEC_DISPATCH + +#if 0 // leaving in for potential future use; but doesn't seem beneficial +template class cfftp_vecpass: public cfftpass + { + private: + static_assert(simd_exists, "bad vlen"); + using typename cfftpass::Tcs; + using Tfv=typename simd_select::type; + using Tcv=Cmplx; + + size_t ip; + Tcpass spass; + Tcpass vpass; + size_t bufsz; + + template Tcs *exec_ (Tcs *cc, + Tcs * /*ch*/, Tcs * /*buf*/, size_t nthreads) const + { + quick_array buf(2*ip+bufsz); + auto * cc2 = buf.data(); + auto * ch2 = buf.data()+ip; + auto * buf2 = buf.data()+2*ip; + static const auto tics = tidx(); +// run scalar pass + auto res = static_cast(spass->exec(tics, cc, + reinterpret_cast(ch2), reinterpret_cast(buf2), + fwd, nthreads)); +// arrange input in SIMD-friendly way +// FIXME: swap loops? + for (size_t i=0; i(); + auto res2 = static_cast(vpass->exec(ticv, + cc2, ch2, buf2, fwd, nthreads)); +// de-SIMDify + for (size_t i=0; i &roots) + : ip(ip_), spass(cfftpass::make_pass(1, ip/vlen, vlen, roots)), + vpass(cfftpass::make_pass(1, 1, ip/vlen, roots)), bufsz(0) + { + MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size"); + bufsz=2*ip+max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen); + } + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const + { + static const auto tics = tidx(); + MR_assert(ti==tics, "bad input type"); + auto in1 = static_cast(in); + auto copy1 = static_cast(copy); + auto buf1 = static_cast(buf); + return fwd ? exec_(in1, copy1, buf1, nthreads) + : exec_(in1, copy1, buf1, nthreads); + } + }; +#endif + +template Tcpass cfftpass::make_pass(size_t l1, + size_t ido, size_t ip, const Troots &roots, bool vectorize) + { + MR_assert(ip>=1, "no zero-sized FFTs"); +#if 0 + if (vectorize && (ip>300) && (ip<32768) && (l1==1) && (ido==1)) + { + constexpr auto vlen = native_simd::size(); + if constexpr(vlen>1) + if ((ip&(vlen-1))==0) + return make_shared>(ip, roots); + } +#endif + + if (ip==1) return make_shared>(); + auto factors=cfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + case 7: + return make_shared>(l1, ido, roots); + case 11: + return make_shared>(l1, ido, roots); + default: + if (ip<110) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +template class pocketfft_c + { + private: + size_t N; + size_t critbuf; + Tcpass plan; + + public: + pocketfft_c(size_t n, bool vectorize=false) + : N(n), critbuf(((N&1023)==0) ? 
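+// Plan construction summary: make_pass above selects a hand-written pass for
+// lengths 1,2,3,4,5,7,11, the generic odd-radix pass for other primes below
+// 110, a Bluestein pass for larger primes, and the multipass for composite
+// lengths. The extra 'critbuf' elements reserved here when N is a multiple of
+// 1024 are presumably meant to keep the scratch buffers away from 4 KiB
+// critical strides (cache aliasing); that rationale is an interpretation, the
+// header itself does not document it.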
16 : 0), + plan(cfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); } + template DUCC0_NOINLINE Cmplx *exec(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf, buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Cmplx *in, Tfs fct, bool fwd, size_t nthreads=1) const + { + quick_array> buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class rfftpass + { + public: + virtual ~rfftpass(){} + + // number of Tfd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads) const \ + { \ + static const auto tifs=tidx(); \ + if (ti==tifs) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +template using Trpass = shared_ptr>; + +/* (a+ib) = conj(c+id) * (e+if) */ +template inline void MULPM + (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) + { a=c*e+d*f; b=c*f-d*e; } + +template class rfftp1: public rfftpass + { + public: + rfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class rfftp2: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=2; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp3: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=3; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs taur=Tfs(-0.5), + taui=Tfs(0.8660254037844386467637231707529362L); + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp4: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=4; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT 
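+// MULPM above is just the expansion of (a+ib) = conj(c+id)*(e+if):
+//   a = c*e + d*f,   b = c*f - d*e,
+// i.e. it applies a conjugated twiddle to a complex value stored as two
+// separate reals. The real-to-complex passes built on it emit FFTPACK's
+// packed "halfcomplex" layout: r0 first, then interleaved (re,im) pairs, with
+// the Nyquist term last for even lengths (this description of the layout is a
+// general FFTPACK convention, not something spelled out in this header).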
cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp5: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=5; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), + ti11= Tfs(0.9510565162951535721164393333793821L), + tr12= Tfs(-0.8090169943749474241022934171828191L), + ti12= Tfs(0.5877852522924731291687059546390728L); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpg: public rfftpass + { + private: + size_t l1, ido; + size_t ip; + quick_array wa, csarr; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + size_t ipph=(ip+1)/2; + size_t idl1 = ido*l1; + + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + if (ido>1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ik const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; 
}; + auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; ik &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpblue: public rfftpass + { + private: + const size_t l1, ido, ip; + quick_array wa; + const Tcpass cplan; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ + (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, + Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const + { + using Tcd = Cmplx; + auto buf = reinterpret_cast(buf_); + Tcd *cc2 = &buf[0]; + Tcd *ch2 = &buf[ip]; + Tcd *subbuf = &buf[2*ip]; + static const auto ticd = tidx(); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + // copy out + CH(0,0,k) = res[0].r; + for (size_t m=1; m<=ip/2; ++m) + { + CH(ido-1,2*m-1,k)=res[m].r; + CH(0,2*m,k)=res[m].i; + } + } + if (ido==1) return ch; + size_t ipph = (ip+1)/2; + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,0,k) = res[0].r; + CH(i,0,k) = res[0].i; + for (size_t m=1; m Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + for (size_t m=0; m(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,k,0) = res[0].r; + CH(i,k,0) = res[0].i; + for (size_t m=1; m &roots, bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), + cplan(cfftpass::make_pass(1,1,ip,roots,vectorize)) + { + MR_assert(ip&1, "Bluestein length must be odd"); + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jbufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfft_multipass: public rfftpass + { + private: + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + quick_array wa; + + auto WA(size_t x, size_t i) const + { return wa[(i-1)*(ip-1)+x]; } + + template Tfd *exec_(Tfd *cc, Tfd *ch, Tfd *buf, + size_t nthreads) const + { + static const auto tifd = tidx(); + if ((l1==1) && (ido==1)) + { + Tfd *p1=cc, *p2=ch; + if constexpr (fwd) + for (auto it=passes.rbegin(); it!=passes.rend(); ++it) + { + auto res = static_cast((*it)->exec(tifd, + p1, p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + else + for (const auto &pass: passes) + { + auto res = static_cast(pass->exec(tifd, + p1, 
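+// Note on this loop: the forward real transform traverses the sub-passes in
+// reverse order (rbegin..rend) while the backward transform walks them front
+// to back, mirroring FFTPACK's radf/radb convention; this rationale is stated
+// here as background and is not spelled out in the header itself.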
p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + return p1; + } + else + MR_fail("not yet supported"); + } + + public: + rfft_multipass(size_t l1_, size_t ido_, size_t ip_, + const Troots &roots, bool /*vectorize*/=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j::factorize(ip); + + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(rfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots)); + l1l*=fct; + } + for (const auto &pass: passes) + { + bufsz = max(bufsz, pass->bufsize()); + need_cpy |= pass->needs_copy(); + } + if ((l1!=1)||(ido!=1)) + { + need_cpy=true; + bufsz += 2*ip; + } + } + + virtual size_t bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfftp_complexify: public rfftpass + { + private: + size_t N; + Troots roots; + size_t rfct; + Tcpass pass; + size_t l1, ido; + static constexpr size_t ip=2; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * buf, size_t nthreads) const + { + using Tcd = Cmplx; + auto ccc = reinterpret_cast(cc); + auto cch = reinterpret_cast(ch); + auto cbuf = reinterpret_cast(buf); + static const auto ticd = tidx(); + if constexpr(fwd) + { + auto res = static_cast(pass->exec(ticd, + ccc, cch, cbuf, true, nthreads)); + auto rres = (res==ccc) ? ch : cc; + rres[0] = res[0].r+res[0].i; +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + auto xe = res[i]+res[xi].conj(); + auto xo = Tcd(res[i].i+res[xi].i, res[xi].r-res[i].r) + * (*roots)[rfct*i].conj(); + rres[2*i-1] = Tfs(0.5)*(xe.r+xo.r); + rres[2*i] = Tfs(0.5)*(xe.i+xo.i); + rres[2*xi-1] = Tfs(0.5)*(xe.r-xo.r); + rres[2*xi] = Tfs(0.5)*(xo.i-xe.i); + } + rres[N-1] = res[0].r-res[0].i; + return rres; + } + else + { + cch[0] = Tcd(cc[0]+cc[N-1], cc[0]-cc[N-1]); +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + Tcd t1 (cc[2*i-1], cc[2*i]); + Tcd t2 (cc[2*xi-1], -cc[2*xi]); + auto xe = t1+t2; + auto xo = (t1-t2)*(*roots)[rfct*i]; + cch[i] = (xe + Tcd(-xo.i, xo.r)); + cch[xi] = (xe.conj() + Tcd(xo.i, xo.r)); + } + auto res = static_cast(pass->exec(ticd, + cch, ccc, cbuf, false, nthreads)); + return (res==ccc) ? 
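+// rfftp_complexify computes a length-N real FFT (N even) via one length-N/2
+// complex FFT: the input is viewed as z_j = x_(2j) + i*x_(2j+1), Z = FFT(z)
+// is computed, and the real-input spectrum follows from the standard split
+// (up to the sign convention of the roots table):
+//   X_0     = Re(Z_0) + Im(Z_0),      X_(N/2) = Re(Z_0) - Im(Z_0),
+//   X_k     = ( (Z_k + conj(Z_(N/2-k))) - i*w_k*(Z_k - conj(Z_(N/2-k))) ) / 2
+// with w_k = exp(-2*pi*i*k/N). The backward branch inverts these relations
+// before running the inverse complex FFT.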
cc : ch; + } + } + + public: + rfftp_complexify(size_t N_, const Troots &roots_, bool vectorize=false) + : N(N_), roots(roots_), pass(cfftpass::make_pass(N/2, vectorize)) + { + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + MR_assert((N&1)==0, "N must be even"); + } + + virtual size_t bufsize() const { return 2*pass->bufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; +#undef POCKETFFT_EXEC_DISPATCH + +template Trpass rfftpass::make_pass(size_t l1, + size_t ido, size_t ip, const Troots &roots, bool vectorize) + { + MR_assert(ip>=1, "no zero-sized FFTs"); + if (ip==1) return make_shared>(); + if ((ip>1000) && ((ip&1)==0)) // use complex transform + return make_shared>(ip, roots, vectorize); + auto factors=rfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + default: + if (ip<135) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +template class pocketfft_r + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_r(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + quick_array buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class pocketfft_hartley + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_hartley(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, + in, buf, buf+N, true, nthreads)); + auto res2 = (res==buf) ? 
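+// Hypothetical usage sketch of the 1-D driver classes (illustrative only, not
+// an excerpt from the header; double precision assumed):
+//   pocketfft_r<double> rplan(1024);
+//   std::vector<double> x(1024);
+//   rplan.exec(x.data(), 1.0, true);    // in-place forward r2r, FFTPACK order
+//   pocketfft_c<double> cplan(1024);
+//   std::vector<Cmplx<double>> y(1024);
+//   cplan.exec(y.data(), 1.0, true);    // in-place forward c2c
+// The second argument is the scale factor, the third selects forward (true)
+// or backward (false), and an optional nthreads argument defaults to 1.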
in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, + size_t nthreads=1) const + { + quick_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, nthreads); + } + }; + +// R2R transforms using FFTW's halfcomplex format +template class pocketfft_fftw + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_fftw(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = in; + auto res2 = buf; + if (!fwd) // go to FFTPACK halfcomplex order + { + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i(plan->exec(tifd, + res, res2, buf+N, fwd, nthreads)); + if (!fwd) return res; + + // go to FFTW halfcomplex order + res2 = (res==buf) ? in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, fwd, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + quick_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +} + +using detail_fft::pocketfft_c; +using detail_fft::pocketfft_r; +using detail_fft::pocketfft_hartley; +using detail_fft::pocketfft_fftw; +inline size_t good_size_complex(size_t n) + { return detail_fft::util1d::good_size_cmplx(n); } +inline size_t good_size_real(size_t n) + { return detail_fft::util1d::good_size_real(n); } + +} + +#endif diff --git a/benchees/duccfft/ducc0/infra/aligned_array.h b/benchees/duccfft/ducc0/infra/aligned_array.h new file mode 100644 index 0000000..c72a984 --- /dev/null +++ b/benchees/duccfft/ducc0/infra/aligned_array.h @@ -0,0 +1,133 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/aligned_array.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef DUCC0_ALIGNED_ARRAY_H +#define DUCC0_ALIGNED_ARRAY_H + +#include +#include + +namespace ducc0 { + +namespace detail_aligned_array { + +using namespace std; + +/// Bare bones array class. +/** Mostly useful for uninitialized temporary buffers. + * \note Since this class operates on raw memory, it should only be used with + * POD types, and even then only with caution! 
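+ * An illustrative use (not an excerpt from the library's own documentation)
+ * is a scratch buffer that is fully written before it is read:
+ *   quick_array<double> buf(n);   // n uninitialized doubles
+ *   // ... fill buf[0..n-1], then hand buf.data() to a transform ...
+ * aligned_array additionally requests an over-aligned allocation, which is
+ * convenient for SIMD loads and stores.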
*/ +template class array_base + { + private: + T *p; + size_t sz; + + static T *ralloc(size_t num) + { + if constexpr(alignment<=alignof(max_align_t)) + { + void *res = malloc(num*sizeof(T)); + if (!res) throw bad_alloc(); + return reinterpret_cast(res); + } + else + { + if (num==0) return nullptr; +// FIXME: let's not use aligned_alloc on Apple for the moment, +// it's only supported from 10.15 on... +#if 0//((__cplusplus >= 201703L) && (!defined(__APPLE__))) + // aligned_alloc requires the allocated size to be a multiple of the + // requested alignment, so increase size if necessary + void *res = aligned_alloc(alignment,((num*sizeof(T)+alignment-1)/alignment)*alignment); + if (!res) throw bad_alloc(); +#else // portable emulation + void *ptr = malloc(num*sizeof(T)+alignment); + if (!ptr) throw bad_alloc(); + void *res = reinterpret_cast((reinterpret_cast(ptr) & ~(size_t(alignment-1))) + alignment); + (reinterpret_cast(res))[-1] = ptr; +#endif + return reinterpret_cast(res); + } + } + static void dealloc(T *ptr) + { + if constexpr(alignment<=alignof(max_align_t)) + free(ptr); + else +#if 0//((__cplusplus >= 201703L) && (!defined(__APPLE__))) + free(ptr); +#else + if (ptr) free((reinterpret_cast(ptr))[-1]); +#endif + } + + public: + /// Creates a zero-sized array with no associated memory. + array_base() : p(nullptr), sz(0) {} + /// Creates an array with \a n entries. + /** \note Memory is not initialized! */ + array_base(size_t n) : p(ralloc(n)), sz(n) {} + array_base(array_base &&other) + : p(other.p), sz(other.sz) + { other.p=nullptr; other.sz=0; } + ~array_base() { dealloc(p); } + + /// If \a n is different from the currnt size, resizes the array to hold + /// \a n elements. + /** \note No data content is copied, the new array is uninitialized! */ + void resize(size_t n) + { + if (n==sz) return; + dealloc(p); + p = ralloc(n); + sz = n; + } + + /// Returns a writeable reference to the element at index \a idx. + T &operator[](size_t idx) { return p[idx]; } + /// Returns a read-only reference to the element at index \a idx. + const T &operator[](size_t idx) const { return p[idx]; } + + /// Returns a writeable pointer to the array data. + T *data() { return p; } + /// Returns a read-only pointer to the array data. + const T *data() const { return p; } + + /// Returns the size of the array. + size_t size() const { return sz; } + }; + +template using quick_array = array_base; +template using aligned_array = array_base; + +} + +using detail_aligned_array::aligned_array; +using detail_aligned_array::quick_array; + +} + +#endif + diff --git a/benchees/duccfft/ducc0/infra/error_handling.h b/benchees/duccfft/ducc0/infra/error_handling.h new file mode 100644 index 0000000..93deffd --- /dev/null +++ b/benchees/duccfft/ducc0/infra/error_handling.h @@ -0,0 +1,93 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/error_handling.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef DUCC0_ERROR_HANDLING_H +#define DUCC0_ERROR_HANDLING_H + +#include +#include +#include "ducc0/infra/useful_macros.h" + +namespace ducc0 { + +namespace detail_error_handling { + +#if defined (__GNUC__) +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__, __PRETTY_FUNCTION__) +#else +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__) +#endif + +// to be replaced with std::source_location once generally available +class CodeLocation + { + private: + const char *file, *func; + int line; + + public: + CodeLocation(const char *file_, int line_, const char *func_=nullptr) + : file(file_), func(func_), line(line_) {} + + inline ::std::ostream &print(::std::ostream &os) const + { + os << "\n" << file << ": " << line; + if (func) os << " (" << func << ")"; + os << ":\n"; + return os; + } + }; + +inline ::std::ostream &operator<<(::std::ostream &os, const CodeLocation &loc) + { return loc.print(os); } + +template +void streamDump__(::std::ostream &os, Args&&... args) + { (os << ... << args); } +template +[[noreturn]] DUCC0_NOINLINE void fail__(Args&&... args) + { + ::std::ostringstream msg; \ + ::ducc0::detail_error_handling::streamDump__(msg, args...); \ + throw ::std::runtime_error(msg.str()); \ + } + +/// Throws a std::runtime_error containing the code location and the +/// passed arguments. +#define MR_fail(...) \ + do { \ + ::ducc0::detail_error_handling::fail__(DUCC0_ERROR_HANDLING_LOC_, "\n", ##__VA_ARGS__, "\n"); \ + } while(0) + +/// If \a cond is false, throws a std::runtime_error containing the code +/// location and the passed arguments. +#define MR_assert(cond,...) \ + do { \ + if (cond); \ + else { MR_fail("Assertion failure\n", ##__VA_ARGS__); } \ + } while(0) + +}} + +#endif diff --git a/benchees/duccfft/ducc0/infra/mav.h b/benchees/duccfft/ducc0/infra/mav.h new file mode 100644 index 0000000..9987048 --- /dev/null +++ b/benchees/duccfft/ducc0/infra/mav.h @@ -0,0 +1,1154 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/*! 
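+ * (Aside, illustrative only: the error-handling macros defined above are used
+ * throughout this header; a typical call looks like
+ *   MR_assert(shp.size()==str.size(), "dimensions mismatch: ", shp.size(),
+ *             " vs ", str.size());
+ * which, on failure, throws std::runtime_error with file, line and function
+ * prepended to the streamed message.)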
\file ducc0/infra/mav.h + * Classes for dealing with multidimensional arrays + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + * */ + +#ifndef DUCC0_MAV_H +#define DUCC0_MAV_H + +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/threading.h" + +namespace ducc0 { + +namespace detail_mav { + +using namespace std; + +struct uninitialized_dummy {}; +constexpr uninitialized_dummy UNINITIALIZED; + +template class cmembuf + { + protected: + shared_ptr> ptr; + shared_ptr> rawptr; + const T *d; + + cmembuf(const T *d_, const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(d_) {} + + // externally owned data pointer + cmembuf(const T *d_) + : d(d_) {} + // share another memory buffer, but read-only + cmembuf(const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(other.d) {} + cmembuf(size_t sz) + : ptr(make_shared>(sz)), d(ptr->data()) {} + cmembuf(size_t sz, uninitialized_dummy) + : rawptr(make_shared>(sz)), d(rawptr->data()) {} + // take over another memory buffer + cmembuf(cmembuf &&other) = default; + + public: + cmembuf(): d(nullptr) {} + void assign(const cmembuf &other) + { + ptr = other.ptr; + rawptr = other.rawptr; + d = other.d; + } + // read access to element #i + template const T &raw(I i) const + { return d[i]; } + // read access to data area + const T *data() const + { return d; } + }; + +constexpr size_t MAXIDX=~(size_t(0)); + +struct slice + { + size_t lo, hi; + slice() : lo(0), hi(MAXIDX) {} + slice(size_t idx) : lo(idx), hi(idx) {} + slice(size_t lo_, size_t hi_) : lo(lo_), hi(hi_) {} + }; + +/// Helper class containing shape and stride information of an `fmav` object +class fmav_info + { + public: + /// vector of nonnegative integers for storing the array shape + using shape_t = vector; + /// vector of integers for storing the array strides + using stride_t = vector; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + auto ndim = shp.size(); + stride_t res(ndim); + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... ns) const + { return str[dim]*ptrdiff_t(n) + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*ptrdiff_t(n); } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs a 1D object with all extents and strides set to zero. + fmav_info() : shp(1,0), str(1,0), sz(0) {} + /// Constructs an object with the given shape and stride. + fmav_info(const shape_t &shape_, const stride_t &stride_) + : shp(shape_), str(stride_), + sz(accumulate(shp.begin(),shp.end(),size_t(1),multiplies<>())) + { + MR_assert(shp.size()==str.size(), "dimensions mismatch"); + } + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + fmav_info(const shape_t &shape_) + : fmav_info(shape_, shape2stride(shape_)) {} + void assign(const fmav_info &other) + { + shp = other.shp; + str = other.str; + sz = other.sz; + } + /// Returns the dimensionality of the object. + size_t ndim() const { return shp.size(); } + /// Returns the total number of entries in the object. 
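+ // Offset computation example (values chosen for illustration): for a
+ // C-contiguous fmav_info with shape {3,4,5}, shape2stride yields strides
+ // {20,5,1}, so idx(1,2,3) = 1*20 + 2*5 + 3*1 = 33, the usual row-major
+ // linearisation; non-contiguous views simply carry different stride values.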
+ size_t size() const { return sz; } + /// Returns the shape of the object. + const shape_t &shape() const { return shp; } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return shp[i]; } + /// Returns the strides of the object. + const stride_t &stride() const { return str; } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return str[i]; } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return ((ndim()==0) || (str.back()==1)); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { + auto ndim = shp.size(); + ptrdiff_t stride=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const fmav_info &other) const + { return shp==other.shp; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + MR_assert(ndim()==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + /// Returns the common broadcast shape of *this and \a shp2 + shape_t bcast_shape(const shape_t &shp2) const + { + shape_t res(max(shp.size(), shp2.size()), 1); + for (size_t i=0; i=shp.size(), "cannot reduce dimensionality"); + stride_t newstr(shp2.size(), 0); + for (size_t i=0; i &slices) const + { + auto ndim = shp.size(); + shape_t nshp(ndim); + stride_t nstr(ndim); + MR_assert(slices.size()==ndim, "incorrect number of slices"); + size_t n0=0; + for (auto x:slices) if (x.lo==x.hi) ++n0; + ptrdiff_t nofs=0; + nshp.resize(ndim-n0); + nstr.resize(ndim-n0); + for (size_t i=0, i2=0; i class mav_info + { + public: + /// Fixed-size array of nonnegative integers for storing the array shape + using shape_t = array; + /// Fixed-size array of integers for storing the array strides + using stride_t = array; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + stride_t res; + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... ns) const + { return str[dim]*n + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*n; } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs an object with all extents and strides set to zero. + mav_info() : sz(0) + { + for (size_t i=0; i())) {} + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + mav_info(const shape_t &shape_) + : mav_info(shape_, shape2stride(shape_)) {} + void assign(const mav_info &other) + { + shp = other.shp; + str = other.str; + sz = other.sz; + } + /// Returns the total number of entries in the object. + size_t size() const { return sz; } + /// Returns the shape of the object. + const shape_t &shape() const { return shp; } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return shp[i]; } + /// Returns the strides of the object. + const stride_t &stride() const { return str; } + /// Returns the stride along dimension \a i. 
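+ // Slicing semantics, as far as they can be read off the slice struct and
+ // fmav_info::subdata above: a default-constructed slice keeps the whole
+ // axis, slice(i) picks the single index i and drops that axis from the
+ // result, and slice(lo,hi) appears to keep the half-open range [lo,hi). For
+ // a 10x20 array, subarray({slice(3), slice()}) would thus be a 1-D view of
+ // length 20 starting at row 3.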
+ const ptrdiff_t &stride(size_t i) const { return str[i]; } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return ((ndim==0) || (str.back()==1)); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { + ptrdiff_t stride=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const mav_info &other) const + { return shp==other.shp; } + /// Returns true iff this->shape and \a other match. + bool conformable(const shape_t &other) const + { return shp==other; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + static_assert(ndim==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + + protected: + template auto subdata(const vector &slices) const + { + MR_assert(slices.size()==ndim, "bad number of slices"); + array nshp; + array nstr; + + // unnecessary, but gcc arns otherwise + for (size_t i=0; i(nshp, nstr), nofs); + } + }; + +template class cfmav: public fmav_info, public cmembuf + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + + + protected: + cfmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : tinfo(shp_, str_), tbuf(size(), UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i const T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + + cfmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = subdata(slices); + return cfmav(ninfo, tbuf::d+nofs, *this); + } + }; + +template cfmav subarray + (const cfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class vfmav: public cfmav + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tinfo::size, tinfo::shape, tinfo::stride; + + protected: + vfmav(const fmav_info &info, T *d_, tbuf &buf) + : cfmav(info, d_, buf) {} + + public: + using tbuf::raw, tbuf::data, tinfo::ndim; + vfmav(T *d_, const fmav_info &info) + : cfmav(d_, info) {} + vfmav(T *d_, const shape_t &shp_, const stride_t &str_) + : cfmav(d_, shp_, str_) {} + vfmav(T *d_, const shape_t &shp_) + : cfmav(d_, shp_) {} + vfmav(const shape_t &shp_) + : cfmav(shp_) {} + vfmav(const shape_t &shp_, uninitialized_dummy) + : cfmav(shp_, UNINITIALIZED) {} + vfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : cfmav(shp_, str_, UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i(buf, shp_, str_) {} + + using cfmav::data; + T *data() + { return const_cast(tbuf::d); } + using cfmav::raw; + template T &raw(I i) + { return data()[i]; } + + void assign(vfmav &other) + { + fmav_info::assign(other); + cmembuf::assign(other); + } + + using cfmav::operator(); + template const T &operator()(Ns... 
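+ // Construction sketch (hypothetical values, for illustration): a cfmav wraps
+ // externally owned memory read-only, while a vfmav built from a shape owns
+ // its storage:
+ //   cfmav<double> view(ptr, {100,200});          // non-owning, read-only
+ //   vfmav<double> arr({100,200});                // owning, value-initialized
+ //   vfmav<double> tmp({100,200}, UNINITIALIZED); // owning, uninitialized
+ // build_noncritical pads the allocation so that no axis ends up with a
+ // stride that is a multiple of 4096 bytes.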
ns) const + { return raw(idx(ns...)); } + + vfmav subarray(const vector &slices) + { + auto [ninfo, nofs] = tinfo::subdata(slices); + return vfmav(ninfo, data()+nofs, *this); + } + /** Returns a writable fmav with the specified shape. + * The strides are chosen in such a way that critical strides (multiples + * of 4096 bytes) along any dimension are avoided, by enlarging the + * allocated memory slightly if necessary. + * The array data is default-initialized. */ + static vfmav build_noncritical(const shape_t &shape) + { + auto ndim = shape.size(); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vfmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i slc(ndim); + for (size_t i=0; i vfmav subarray + (vfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class cmav: public mav_info, public cmembuf + { + protected: + template friend class cmav; + template friend class vmav; + + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + cmav() {} + cmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cmav(const shape_t &shp_) + : tinfo(shp_), tbuf(size()) {} + cmav(const tbuf &buf, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(buf) {} + cmav(const tinfo &info, const T *d_, const tbuf &buf) + : tinfo(info), tbuf(d_, buf) {} + + public: + cmav(const T *d_, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(d_) {} + cmav(const T *d_, const shape_t &shp_) + : tinfo(shp_), tbuf(d_) {} + void assign(const cmav &other) + { + mav_info::assign(other); + cmembuf::assign(other); + } + operator cfmav() const + { + return cfmav(*this, {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + template const T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + template cmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return cmav (ninfo, tbuf::d+nofs, *this); + } + + static cmav build_uniform(const shape_t &shape, const T &value) + { + // Don't do this at home! + shape_t tshp; + tshp.fill(1); + cmav tmp(tshp); + const_cast(tmp.raw(0)) = value; + stride_t nstr; + nstr.fill(0); + return cmav(tmp, shape, nstr); + } + }; +template cmav subarray + (const cmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +template class vmav: public cmav + { + protected: + template friend class vmav; + + using parent = cmav; + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + vmav(const tinfo &info, T *d_, tbuf &buf) + : parent(info, d_, buf) {} + + public: + vmav() {} + vmav(T *d_, const shape_t &shp_, const stride_t &str_) + : parent(d_, shp_, str_) {} + vmav(T *d_, const shape_t &shp_) + : parent(d_, shp_) {} + vmav(const shape_t &shp_) + : parent(shp_) {} + vmav(const shape_t &shp_, uninitialized_dummy) + : parent(shp_, UNINITIALIZED) {} + + void assign(vmav &other) + { parent::assign(other); } + operator vfmav() + { + return vfmav(*this, {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + using parent::operator(); + template T &operator()(Ns... 
ns) + { return const_cast(parent::operator()(ns...)); } + + template vmav subarray(const vector &slices) + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return vmav (ninfo, data()+nofs, *this); + } + + using parent::data; + T *data() + { return const_cast(tbuf::d); } + // read access to element #i + using parent::raw; + template T &raw(I i) + { return data()[i]; } + + static vmav build_empty() + { + shape_t nshp; + nshp.fill(0); + return vmav(static_cast(nullptr), nshp); + } + + static vmav build_noncritical(const shape_t &shape) + { + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i(slc); + } + static vmav build_noncritical(const shape_t &shape, uninitialized_dummy) + { + if (ndim<=1) return vmav(shape, UNINITIALIZED); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2, UNINITIALIZED); + vector slc(ndim); + for (size_t i=0; i(slc); + } + }; + +template vmav subarray + (vmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +// various operations involving fmav objects of the same shape -- experimental + +DUCC0_NOINLINE void opt_shp_str(fmav_info::shape_t &shp, vector &str) + { + if (shp.size()>1) + { + // sort dimensions in order of descending stride, as far as possible + vector strcrit(shp.size(),0); + for (const auto &curstr: str) + for (size_t i=0; i1; --lastdim) + { + auto dim = size_t(min_element(strcrit.begin(),strcrit.begin()+lastdim) + -strcrit.begin()); + if (dim+1!=lastdim) + { + swap(strcrit[dim], strcrit[lastdim-1]); + swap(shp[dim], shp[lastdim-1]); + for (auto &curstr: str) + swap(curstr[dim], curstr[lastdim-1]); + } + } + // try merging dimensions + size_t ndim = shp.size(); + if (ndim>1) + for (size_t d0=ndim-2; d0+1>0; --d0) + { + bool can_merge = true; + for (const auto &curstr: str) + can_merge &= curstr[d0] == ptrdiff_t(shp[d0+1])*curstr[d0+1]; + if (can_merge) + { + for (auto &curstr: str) + curstr.erase(curstr.begin()+d0); + shp[d0+1] *= shp[d0]; + shp.erase(shp.begin()+d0); + } + } + } + } + +DUCC0_NOINLINE auto multiprep(const vector &info) + { + auto narr = info.size(); + MR_assert(narr>=1, "need at least one array"); + for (size_t i=1; i str(narr); + for (size_t i=0; i + void applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim]; + if (idim+1 + void applyHelper(const vector &shp, + const vector> &str, T0 ptr0, Func func, size_t nthreads) + { + if (shp.size()==0) + func(*ptr0); + else if (nthreads==1) + applyHelper(0, shp, str, ptr0, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, T1 ptr1, Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim], str1 = str[1][idim]; + if (idim+1 + void applyHelper(const vector &shp, + const vector> &str, T0 ptr0, T1 ptr1, + Func func, size_t nthreads) + { + if (shp.size()==0) + func(*ptr0, *ptr1); + else if (nthreads==1) + applyHelper(0, shp, str, ptr0, ptr1, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, T1 ptr1, T2 ptr2, Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim], str1 = str[1][idim], str2 = str[2][idim]; + if (idim+1 + void applyHelper(const vector &shp, + const vector> &str, T0 ptr0, T1 
ptr1, T2 ptr2, + Func func, size_t nthreads) + { + if (shp.size()==0) + func(*ptr0, *ptr1, *ptr2); + else if (nthreads==1) + applyHelper(0, shp, str, ptr0, ptr1, ptr2, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, T1 ptr1, T2 ptr2, T3 ptr3, + Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim], str1 = str[1][idim], str2 = str[2][idim], + str3 = str[3][idim]; + if (idim+1 + void applyHelper(const vector &shp, + const vector> &str, T0 ptr0, T1 ptr1, T2 ptr2, T3 ptr3, + Func func, size_t nthreads) + { + if (shp.size()==0) + func(*ptr0, *ptr1, *ptr2, *ptr3); + else if (nthreads==1) + applyHelper(0, shp, str, ptr0, ptr1, ptr2, ptr3, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void mav_apply(Func func, int nthreads, T0 &&m0) + { + auto [shp, str] = multiprep({m0}); + applyHelper(shp, str, m0.data(), func, nthreads); + } +template + void mav_apply(Func func, int nthreads, T0 &&m0, T1 &&m1) + { + auto [shp, str] = multiprep({m0, m1}); + applyHelper(shp, str, m0.data(), m1.data(), func, nthreads); + } +template + void mav_apply(Func func, int nthreads, T0 &&m0, T1 &&m1, T2 &&m2) + { + auto [shp, str] = multiprep({m0, m1, m2}); + applyHelper(shp, str, m0.data(), m1.data(), m2.data(), func, nthreads); + } + +template class mavref + { + private: + const mav_info &info; + T *d; + + public: + using shape_t = typename mav_info::shape_t; + using stride_t = typename mav_info::stride_t; + mavref(const mav_info &info_, T *d_) : info(info_), d(d_) {} + template T &operator()(Ns... ns) const + { return d[info.idx(ns...)]; } + /// Returns the total number of entries in the object. + size_t size() const { return info.size(); } + /// Returns the shape of the object. + const shape_t &shape() const { return info.shape(); } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return info.shape(i); } + /// Returns the strides of the object. + const stride_t &stride() const { return info.stride(); } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return info.stride(i); } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return info.last_contiguous(); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { return info.contiguous(); } + /// Returns true iff this->shape and \a other.shape match. 
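    // Editorial note (illustrative, not part of the upstream header): mavref
    // pairs a borrowed mav_info with a raw data pointer, giving the
    // flexible_mav_apply machinery below a cheap, fixed-dimensional view to
    // hand to the user callback. A hedged usage sketch of the element-wise
    // mav_apply helpers defined above (names and values are made up):
    //
    //   vmav<double,2> a({10,10}), b({10,10});
    //   mav_apply([](double &v){ v = 1.0; }, 4, a);                 // fill a
    //   mav_apply([](double &x, double &y){ y = 2.0*x; }, 4, a, b); // b = 2*a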
+ bool conformable(const mavref &other) const + { return shape()==other.shape(); } + }; +template + mavref make_mavref(const mav_info &info_, T *d_) + { return mavref(info_, d_); } + +template auto make_infos(const fmav_info &info) + { + if constexpr(ndim>0) + MR_assert(ndim<=info.ndim(), "bad dimensionality"); + auto iterdim = info.ndim()-ndim; + fmav_info fout({info.shape().begin(),info.shape().begin()+iterdim}, + {info.stride().begin(),info.stride().begin()+iterdim}); + + typename mav_info::shape_t shp; + typename mav_info::stride_t str; + if constexpr (ndim>0) // just to silence compiler warnings + for (size_t i=0; i iout(shp, str); + return make_tuple(fout, iout); + } + + +template + void flexible_mav_applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim]; + if (idim+1 + void flexible_mav_applyHelper(const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + Func func, size_t nthreads) + { + if (shp.size()==0) + func(make_mavref(info0, ptr0)); + else if (nthreads==1) + flexible_mav_applyHelper(0, shp, str, ptr0, info0, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void flexible_mav_applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + T1 ptr1, const Ti1 &info1, Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim], str1 = str[1][idim]; + if (idim+1 + void flexible_mav_applyHelper(const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + T1 ptr1, const Ti1 &info1, Func func, size_t nthreads) + { + if (shp.size()==0) + func(mavref(info0, ptr0), mavref(info1, ptr1)); + else if (nthreads==1) + flexible_mav_applyHelper(0, shp, str, ptr0, info0, ptr1, info1, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void flexible_mav_applyHelper(size_t idim, const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + T1 ptr1, const Ti1 &info1, T2 ptr2, const Ti2 &info2, Func func) + { + auto len = shp[idim]; + auto str0 = str[0][idim], str1 = str[1][idim], str2 = str[2][idim]; + if (idim+1 + void flexible_mav_applyHelper(const vector &shp, + const vector> &str, T0 ptr0, const Ti0 &info0, + T1 ptr1, const Ti1 &info1, T2 ptr2, const Ti2 &info2, Func func, + size_t nthreads) + { + if (shp.size()==0) + func(mavref(info0, ptr0), mavref(info1, ptr1), mavref(info2, ptr2)); + else if (nthreads==1) + flexible_mav_applyHelper(0, shp, str, ptr0, info0, ptr1, info1, ptr2, info2, func); + else if (shp.size()==1) + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + for (size_t i=lo; i + void flexible_mav_apply(Func func, size_t nthreads, T0 &&m0) + { + auto [f0, i0] = make_infos(m0); + vector iterinfo{f0}; + auto [shp, str] = multiprep(iterinfo); + flexible_mav_applyHelper(shp, str, m0.data(), i0, func, nthreads); + } + +template + void flexible_mav_apply(Func func, size_t nthreads, T0 &&m0, T1 &&m1) + { + MR_assert(m0.ndim()-nd0 == m1.ndim()-nd1, "dimensionality mismatch"); + auto [f0, i0] = make_infos(m0); + auto [f1, i1] = make_infos(m1); + vector iterinfo{f0, f1}; + auto [shp, str] = multiprep(iterinfo); + flexible_mav_applyHelper(shp, str, m0.data(), i0, m1.data(), i1, func, nthreads); + } + +template + void flexible_mav_apply(Func func, size_t nthreads, T0 &&m0, T1 &&m1, T2 &&m2) + { + MR_assert(m0.ndim()-nd0 == m1.ndim()-nd1, "dimensionality mismatch"); + 
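  // (editorial comment) nd0/nd1/nd2 are the trailing dimensions that get
  // wrapped into mavref views and passed to func; the remaining leading
  // dimensions of all arrays are iterated over jointly, so their counts must
  // agree, which these MR_assert calls verify.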
MR_assert(m0.ndim()-nd0 == m2.ndim()-nd2, "dimensionality mismatch"); + auto [f0, i0] = make_infos(m0); + auto [f1, i1] = make_infos(m1); + auto [f2, i2] = make_infos(m2); + vector iterinfo{f0, f1, f2}; + auto [shp, str] = multiprep(iterinfo); + flexible_mav_applyHelper(shp, str, m0.data(), i0, m1.data(), i1, m2.data(), i2, + func, nthreads); + } + +} + +using detail_mav::UNINITIALIZED; +using detail_mav::fmav_info; +using detail_mav::mav_info; +using detail_mav::slice; +using detail_mav::MAXIDX; +using detail_mav::cfmav; +using detail_mav::vfmav; +using detail_mav::cmav; +using detail_mav::vmav; +using detail_mav::subarray; +using detail_mav::mav_apply; +using detail_mav::flexible_mav_apply; +} + +#endif diff --git a/benchees/duccfft/ducc0/infra/misc_utils.h b/benchees/duccfft/ducc0/infra/misc_utils.h new file mode 100644 index 0000000..d45a51e --- /dev/null +++ b/benchees/duccfft/ducc0/infra/misc_utils.h @@ -0,0 +1,72 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Copyright (C) 2019-2021 Max-Planck-Society + Author: Martin Reinecke */ + +#ifndef DUCC0_MISC_UTILS_H +#define DUCC0_MISC_UTILS_H + +#include +#include + +namespace ducc0 { + +namespace detail_misc_utils { + +using namespace std; + +template auto calcShare(size_t nshares, size_t myshare, + const T &begin, const T &end) + { + auto nwork = end-begin; + auto nbase = nwork/nshares; + auto additional = nwork%nshares; + auto lo = begin + (myshare*nbase + ((myshare auto calcShare(size_t nshares, size_t myshare, const T &end) + { return calcShare(nshares, myshare, T(0), end); } + +template shp noncritical_shape(const shp &in, size_t elemsz) + { + constexpr size_t critstride = 4096; // must be a power of 2 + auto ndim = in.size(); + if (ndim==1) return in; + shp res(in); + size_t stride = elemsz; + for (size_t i=0, xi=ndim-1; i+1) +#include +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_simd { + +namespace stdx=std::experimental; +using stdx::native_simd; + +template struct simd_select + { using type = stdx::simd>; }; + +using stdx::element_aligned_tag; +template constexpr inline bool vectorizable = native_simd::size()>1; + +template constexpr bool simd_exists_h() + { + if constexpr (N>1) + if constexpr (vectorizable) + if constexpr (!std::is_same_v>, stdx::fixed_size_simd>) + return true; + return false; + } +template constexpr inline bool simd_exists = simd_exists_h(); + +template inline stdx::simd apply(stdx::simd in, Func func) + { + stdx::simd res; + for (size_t i=0; i inline stdx::simd sin(stdx::simd in) + { return apply(in,[](T v){return sin(v);}); } +template inline stdx::simd cos(stdx::simd in) + { return apply(in,[](T v){return cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using 
detail_simd::simd_exists; +using detail_simd::vectorizable; + +} + +#else + +// only enable SIMD support for gcc>=5.0 and clang>=5.0 +#ifndef DUCC0_NO_SIMD +#define DUCC0_NO_SIMD +#if defined(__clang__) +// AppleClang has their own version numbering +#ifdef __apple_build_version__ +# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) +# undef DUCC0_NO_SIMD +# endif +#elif __clang_major__ >= 5 +# undef DUCC0_NO_SIMD +#endif +#elif defined(__GNUC__) +#if __GNUC__>=5 +#undef DUCC0_NO_SIMD +#endif +#endif +#endif + +#include +#include +#include + +#ifndef DUCC0_NO_SIMD +#if defined(__SSE2__) // we are on an x86 platform and we have vector types +#include +#endif + +#if defined(__aarch64__) // let's check for SVE and Neon +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) +#if __ARM_FEATURE_SVE_BITS>0 +// OK, we can use SVE +#warning Using SVE +#define DUCC0_USE_SVE +#include +#endif +#endif +#ifndef DUCC0_USE_SVE +// see if we can use Neon +#if defined(__ARM_NEON) +#warning Using NEON +#define DUCC0_USE_NEON +#include +#endif +#endif +#endif + +#endif + +namespace ducc0 { + +namespace detail_simd { + +/// true iff SIMD support is provided for \a T. +template constexpr inline bool vectorizable = false; +#if (!defined(DUCC0_NO_SIMD)) +#if defined(__SSE2__) || defined (DUCC0_USE_SVE) || defined (DUCC0_USE_NEON) +template<> constexpr inline bool vectorizable = true; +template<> constexpr inline bool vectorizable = true; +#endif +#endif + +/// true iff a SIMD type with vector length \a len exists for \a T. +template constexpr inline bool simd_exists = false; + +template constexpr size_t vectorlen + = vectorizable ? reglen/sizeof(T) : 1; + +template class helper_; +template struct vmask_ + { + private: + using hlp = helper_; + using Tm = typename hlp::Tm; + Tm v; + + public: +#if defined(_MSC_VER) + vmask_() {} + vmask_(const vmask_ &other) : v(other.v) {} + vmask_ &operator=(const vmask_ &other) + { v = other.v; return *this; } +#else + vmask_() = default; + vmask_(const vmask_ &other) = default; + vmask_ &operator=(const vmask_ &other) = default; +#endif + vmask_(Tm v_): v(v_) {} + operator Tm() const { return v; } + bool none() const { return hlp::mask_none(v); } + bool any() const { return hlp::mask_any(v); } + bool all() const { return hlp::mask_all(v); } + vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); } + vmask_ operator| (const vmask_ &other) const { return hlp::mask_or(v,other.v); } + }; +struct element_aligned_tag {}; +template class vtp + { + private: + using hlp = helper_; + + public: + using value_type = T; + using Tv = typename hlp::Tv; + using Tm = vmask_; + static constexpr size_t size() { return len; } + + private: + Tv v; + + public: +#if defined(_MSC_VER) + vtp() {} + vtp(const vtp &other): v(other.v) {} + vtp &operator=(const vtp &other) + { v=other.v; return *this; } +#else + vtp() = default; + vtp(const vtp &other) = default; + vtp &operator=(const vtp &other) = default; +#endif + vtp(T other): vtp(hlp::from_scalar(other)) {} + vtp(const Tv &other) : v(other) {} + vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; } + operator Tv() const { return v; } + + vtp(const T *ptr, element_aligned_tag) : v(hlp::loadu(ptr)) {} + void copy_to(T *ptr, element_aligned_tag) const { hlp::storeu(ptr, v); } + + vtp operator-() const { return vtp(-v); } + vtp operator+(vtp other) const { return vtp(v+other.v); } + vtp operator-(vtp other) const { return vtp(v-other.v); } + vtp operator*(vtp other) 
const { return vtp(v*other.v); } + vtp operator/(vtp other) const { return vtp(v/other.v); } + vtp &operator+=(vtp other) { v+=other.v; return *this; } + vtp &operator-=(vtp other) { v-=other.v; return *this; } + vtp &operator*=(vtp other) { v*=other.v; return *this; } + vtp &operator/=(vtp other) { v/=other.v; return *this; } + vtp abs() const { return hlp::abs(v); } + inline vtp sqrt() const + { return hlp::sqrt(v); } + vtp max(const vtp &other) const + { return hlp::max(v, other.v); } + Tm operator>(const vtp &other) const + { return hlp::gt(v, other.v); } + Tm operator>=(const vtp &other) const + { return hlp::ge(v, other.v); } + Tm operator<(const vtp &other) const + { return hlp::lt(v, other.v); } + Tm operator!=(const vtp &other) const + { return hlp::ne(v, other.v); } + static vtp blend(Tm mask, const vtp &a, const vtp &b) + { return hlp::blend(mask, a, b); } + + class reference + { + private: + vtp &v; + size_t i; + public: + reference (vtp &v_, size_t i_) + : v(v_), i(i_) {} + reference &operator= (T other) + { v.v[i] = other; return *this; } + reference &operator*= (T other) + { v.v[i] *= other; return *this; } + operator T() const { return v.v[i]; } + }; + + void Set(size_t i, T val) { v[i] = val; } + reference operator[](size_t i) { return reference(*this, i); } + T operator[](size_t i) const { return v[i]; } + + class where_expr + { + private: + vtp &v; + Tm m; + + public: + where_expr (Tm m_, vtp &v_) + : v(v_), m(m_) {} + where_expr &operator= (const vtp &other) + { v=hlp::blend(m, other.v, v.v); return *this; } + where_expr &operator*= (const vtp &other) + { v=hlp::blend(m, v.v*other.v, v.v); return *this; } + where_expr &operator+= (const vtp &other) + { v=hlp::blend(m, v.v+other.v, v.v); return *this; } + where_expr &operator-= (const vtp &other) + { v=hlp::blend(m, v.v-other.v, v.v); return *this; } + }; + }; +template inline vtp abs(vtp v) { return v.abs(); } +template typename vtp::where_expr where(typename vtp::Tm m, vtp &v) + { return typename vtp::where_expr(m, v); } +template vtp operator*(T0 a, vtp b) + { return b*a; } +template vtp operator+(T a, vtp b) + { return b+a; } +template vtp operator-(T a, vtp b) + { return vtp(a) - b; } +template vtp max(vtp a, vtp b) + { return a.max(b); } +template vtp sqrt(vtp v) + { return v.sqrt(); } +template inline bool none_of(const vmask_ &mask) + { return mask.none(); } +template inline bool any_of(const vmask_ &mask) + { return mask.any(); } +template inline bool all_of(const vmask_ &mask) + { return mask.all(); } +template inline vtp blend (const vmask_ &mask, const vtp &a, const vtp &b) + { return vtp::blend(mask, a, b); } +template T reduce(const vtp &v, Op op) + { + T res=v[0]; + for (size_t i=1; i vtp apply(vtp in, Func func) + { + vtp res; + for (size_t i=0; i class pseudoscalar + { + private: + T v; + + public: +#if defined(_MSC_VER) + pseudoscalar() {} + pseudoscalar(const pseudoscalar &other) : v(other.v) {} + pseudoscalar & operator=(const pseudoscalar &other) + { v=other.v; return *this; } +#else + pseudoscalar() = default; + pseudoscalar(const pseudoscalar &other) = default; + pseudoscalar & operator=(const pseudoscalar &other) = default; +#endif + pseudoscalar(T v_):v(v_) {} + pseudoscalar operator-() const { return pseudoscalar(-v); } + pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); } + pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); } + pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); } + pseudoscalar 
operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); } + pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; } + pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; } + pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; } + pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; } + + pseudoscalar abs() const { return std::abs(v); } + inline pseudoscalar sqrt() const { return std::sqrt(v); } + pseudoscalar max(const pseudoscalar &other) const + { return std::max(v, other.v); } + + bool operator>(const pseudoscalar &other) const + { return v>other.v; } + bool operator>=(const pseudoscalar &other) const + { return v>=other.v; } + bool operator<(const pseudoscalar &other) const + { return v class helper_ + { + private: + static constexpr size_t len = 1; + public: + using Tv = pseudoscalar; + using Tm = bool; + + static Tv loadu(const T *ptr) { return *ptr; } + static void storeu(T *ptr, Tv v) { *ptr = v[0]; } + + static Tv from_scalar(T v) { return v; } + static Tv abs(Tv v) { return v.abs(); } + static Tv max(Tv v1, Tv v2) { return v1.max(v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; } + static Tv sqrt(Tv v) { return v.sqrt(); } + static Tm gt (Tv v1, Tv v2) { return v1>v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 8; + public: + using Tv = __m512d; + using Tm = __mmask8; + + static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_pd(v); } + static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 16; + public: + using Tv = __m512; + using Tm = __mmask16; + + static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_ps(v); } + static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt 
(Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 4; + public: + using Tv = __m256d; + using Tm = __m256d; + + static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_pd(v); } + static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 8; + public: + using Tv = __m256; + using Tm = __m256; + + static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_ps(v); } + static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_ps(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = __m128d; + using Tm = __m128d; + + static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_pd(v); } + static Tv abs(Tv v) { return 
_mm_andnot_pd(_mm_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_pd(v2,v1,m); +#else + return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_pd(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = __m128; + using Tm = __m128; + + static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_ps(v); } + static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_ps(v2,v1,m); +#else + return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< class gnuvec_helper + { + public: + using Tv __attribute__ ((vector_size (len*sizeof(T)))) = T; + using Tm = decltype(Tv()v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +template<> constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +#endif + +#if defined(DUCC0_USE_NEON) +template<> constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = float64x2_t; + using Tm = uint64x2_t; + + static Tv loadu(const T *ptr) { return vld1q_f64(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f64(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f64(v); } + static Tv abs(Tv v) { return vabsq_f64(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f64(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { return vbslq_f64(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f64(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f64(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f64(v1,v2); } + static Tm lt (Tv v1, 
Tv v2) { return vcltq_f64(v1,v2); } + static Tm ne (Tv v1, Tv v2) + { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(v1,v2)))); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u64(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u64(v1,v2); } + static size_t maskbits(Tm v) + { + auto high_bits = vshrq_n_u64(v, 63); + return vgetq_lane_u64(high_bits, 0) | ((vgetq_lane_u64(high_bits, 1)<<1)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = float32x4_t; + using Tm = uint32x4_t; + + static Tv loadu(const T *ptr) { return vld1q_f32(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f32(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f32(v); } + static Tv abs(Tv v) { return vabsq_f32(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f32(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return vbslq_f32(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f32(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f32(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f32(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return vcltq_f32(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return vmvnq_u32(vceqq_f32(v1,v2)); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u32(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u32(v1,v2); } + static size_t maskbits(Tm v) + { + static constexpr int32x4_t shift = {0, 1, 2, 3}; + auto tmp = vshrq_n_u32(v, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< using native_simd = vtp>; +#elif defined(__AVX__) +template using native_simd = vtp>; +#elif defined(__SSE2__) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_SVE) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_NEON) +template using native_simd = vtp>; +#else +template using native_simd = vtp; +#endif + +#else // DUCC0_NO_SIMD is defined +/// The SIMD type for \a T with the largest vector length on this platform. +template using native_simd = vtp; +#endif +/// Provides a SIMD type for \a T with vector length \a len, if it exists. +template struct simd_select + { using type = vtp; }; +template inline vtp sin(vtp in) + { return apply(in,[](T v){return std::sin(v);}); } +template inline vtp cos(vtp in) + { return apply(in,[](T v){return std::cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using detail_simd::simd_exists; +using detail_simd::vectorizable; + +} +#endif +#endif diff --git a/benchees/duccfft/ducc0/infra/threading.cc b/benchees/duccfft/ducc0/infra/threading.cc new file mode 100644 index 0000000..cf0e036 --- /dev/null +++ b/benchees/duccfft/ducc0/infra/threading.cc @@ -0,0 +1,570 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/threading.cc + * + * \copyright Copyright (C) 2019-2021 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +#include "ducc0/infra/threading.h" + +#ifndef DUCC0_NO_THREADING +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __has_include() +#include +#endif +#include "ducc0/infra/misc_utils.h" +#endif + +namespace ducc0 { + +namespace detail_threading { + +#ifndef DUCC0_NO_THREADING + +static const size_t max_threads_ = std::max(1, std::thread::hardware_concurrency()); + +std::atomic default_nthreads_(max_threads_); + +size_t get_default_nthreads() + { return default_nthreads_; } + +void set_default_nthreads(size_t new_default_nthreads) + { default_nthreads_ = std::max(1, new_default_nthreads); } + +size_t max_threads() { return max_threads_; } + +class latch + { + std::atomic num_left_; + std::mutex mut_; + std::condition_variable completed_; + using lock_t = std::unique_lock; + + public: + latch(size_t n): num_left_(n) {} + + void count_down() + { + lock_t lock(mut_); + if (--num_left_) + return; + completed_.notify_all(); + } + + void wait() + { + lock_t lock(mut_); + completed_.wait(lock, [this]{ return is_ready(); }); + } + bool is_ready() { return num_left_ == 0; } + }; + +template class concurrent_queue + { + std::queue q_; + std::mutex mut_; + std::atomic size_; + using lock_t = std::lock_guard; + + public: + void push(T val) + { + lock_t lock(mut_); + ++size_; + q_.push(std::move(val)); + } + + bool try_pop(T &val) + { + if (size_==0) return false; + lock_t lock(mut_); + // Queue might have been emptied while we acquired the lock + if (q_.empty()) return false; + + val = std::move(q_.front()); + --size_; + q_.pop(); + return true; + } + + bool empty() const { return size_==0; } + }; + +class thread_pool + { + private: + // A reasonable guess, probably close enough for most hardware + static constexpr size_t cache_line_size = 64; + struct alignas(cache_line_size) worker + { + std::thread thread; + std::condition_variable work_ready; + std::mutex mut; + std::atomic_flag busy_flag = ATOMIC_FLAG_INIT; + std::function work; + + void worker_main( + std::atomic &shutdown_flag, + std::atomic &unscheduled_tasks, + concurrent_queue> &overflow_work) + { + using lock_t = std::unique_lock; + bool expect_work = true; + while (!shutdown_flag || expect_work) + { + std::function local_work; + if (expect_work || unscheduled_tasks == 0) + { + lock_t lock(mut); + // Wait until there is work to be executed + work_ready.wait(lock, [&]{ return (work || shutdown_flag); }); + local_work.swap(work); + expect_work = false; + } + + bool marked_busy = false; + if (local_work) + { + marked_busy = true; + local_work(); + } + + if (!overflow_work.empty()) + { + if (!marked_busy && busy_flag.test_and_set()) + { + expect_work = true; + continue; + } + marked_busy = true; + + while (overflow_work.try_pop(local_work)) + { + --unscheduled_tasks; + local_work(); + } + } + + if (marked_busy) busy_flag.clear(); + } + } + }; + + concurrent_queue> 
overflow_work_; + std::mutex mut_; + std::vector workers_; + std::atomic shutdown_; + std::atomic unscheduled_tasks_; + using lock_t = std::lock_guard; + + void create_threads() + { + lock_t lock(mut_); + size_t nthreads=workers_.size(); + for (size_t i=0; ibusy_flag.clear(); + worker->work = nullptr; + worker->thread = std::thread( + [worker, this]{ worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_); }); + } + catch (...) + { + shutdown_locked(); + throw; + } + } + } + + void shutdown_locked() + { + shutdown_ = true; + for (auto &worker : workers_) + worker.work_ready.notify_all(); + + for (auto &worker : workers_) + if (worker.thread.joinable()) + worker.thread.join(); + } + + public: + explicit thread_pool(size_t nthreads): + workers_(nthreads) + { create_threads(); } + + thread_pool(): thread_pool(max_threads_) {} + + ~thread_pool() { shutdown(); } + + void submit(std::function work) + { + lock_t lock(mut_); + if (shutdown_) + throw std::runtime_error("Work item submitted after shutdown"); + + ++unscheduled_tasks_; + + // First check for any idle workers and wake those + for (auto &worker : workers_) + if (!worker.busy_flag.test_and_set()) + { + --unscheduled_tasks_; + { + lock_t lock(worker.mut); + worker.work = std::move(work); + } + worker.work_ready.notify_one(); + return; + } + + // If no workers were idle, push onto the overflow queue for later + overflow_work_.push(std::move(work)); + } + + void shutdown() + { + lock_t lock(mut_); + shutdown_locked(); + } + + void restart() + { + shutdown_ = false; + create_threads(); + } + }; + +inline thread_pool &get_pool() + { + static thread_pool pool; +#if __has_include() + static std::once_flag f; + call_once(f, + []{ + pthread_atfork( + +[]{ get_pool().shutdown(); }, // prepare + +[]{ get_pool().restart(); }, // parent + +[]{ get_pool().restart(); } // child + ); + }); +#endif + + return pool; + } + +class Distribution + { + private: + size_t nthreads_; + std::mutex mut_; + size_t nwork_; + size_t cur_; + std::atomic cur_dynamic_; + size_t chunksize_; + double fact_max_; + std::vector nextstart; + enum SchedMode { SINGLE, STATIC, DYNAMIC, GUIDED }; + SchedMode mode; + bool single_done; + + void thread_map(std::function f); + + public: + size_t nthreads() const { return nthreads_; } + + void execSingle(size_t nwork, std::function f) + { + mode = SINGLE; + single_done = false; + nwork_ = nwork; + nthreads_ = 1; + thread_map(move(f)); + } + void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function f) + { + mode = STATIC; + nthreads_ = (nthreads==0) ? get_default_nthreads() : nthreads; + if (nthreads_ == 1) + return execSingle(nwork, move(f)); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? (nwork_+nthreads_-1)/nthreads_ + : chunksize; + if (chunksize_>=nwork_) + return execSingle(nwork_, move(f)); + nextstart.resize(nthreads_); + for (size_t i=0; i f) + { + mode = DYNAMIC; + nthreads_ = (nthreads==0) ? get_default_nthreads() : nthreads; + if (nthreads_ == 1) + return execSingle(nwork, move(f)); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? 1 : chunksize; + if (chunksize_ >= nwork) + return execSingle(nwork, move(f)); + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, 0, move(f)); + cur_dynamic_ = 0; + thread_map(move(f)); + } + void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function f) + { + mode = GUIDED; + nthreads_ = (nthreads==0) ? 
get_default_nthreads() : nthreads; + if (nthreads_ == 1) + return execSingle(nwork, move(f)); + nwork_ = nwork; + chunksize_ = (chunksize_min<1) ? 1 : chunksize_min; + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, 0, move(f)); + fact_max_ = fact_max; + cur_ = 0; + thread_map(move(f)); + } + void execParallel(size_t nthreads, std::function f) + { + mode = STATIC; + nthreads_ = (nthreads==0) ? get_default_nthreads() : nthreads; + nwork_ = nthreads_; + chunksize_ = 1; + thread_map(move(f)); + } + Range getNext(size_t thread_id) + { + switch (mode) + { + case SINGLE: + { + if (single_done) return Range(); + single_done=true; + return Range(0, nwork_); + } + case STATIC: + { + if (nextstart[thread_id]>=nwork_) return Range(); + size_t lo=nextstart[thread_id]; + size_t hi=std::min(lo+chunksize_,nwork_); + nextstart[thread_id] += nthreads_*chunksize_; + return Range(lo, hi); + } + case DYNAMIC: + { + auto curval = cur_dynamic_.fetch_add(chunksize_); + return Range(std::min(curval, nwork_), + std::min(curval+chunksize_, nwork_)); + } + case GUIDED: + { + std::unique_lock lck(mut_); + if (cur_>=nwork_) return Range(); + auto rem = nwork_-cur_; + size_t tmp = size_t((fact_max_*double(rem))/double(nthreads_)); + auto sz = std::min(rem, std::max(chunksize_, tmp)); + size_t lo=cur_; + cur_+=sz; + size_t hi=cur_; + return Range(lo, hi); + } + } + return Range(); + } + }; + +class MyScheduler: public Scheduler + { + private: + Distribution &dist_; + size_t ithread_; + + public: + MyScheduler(Distribution &dist, size_t ithread) + : dist_(dist), ithread_(ithread) {} + virtual size_t num_threads() const { return dist_.nthreads(); } + virtual size_t thread_num() const { return ithread_; } + virtual Range getNext() { return dist_.getNext(ithread_); } + }; + +void Distribution::thread_map(std::function f) + { + if (nthreads_ == 1) + { + MyScheduler sched(*this, 0); + f(sched); + return; + } + + auto & pool = get_pool(); + latch counter(nthreads_); + std::exception_ptr ex; + std::mutex ex_mut; + for (size_t i=0; i lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + } + counter.wait(); + if (ex) + std::rethrow_exception(ex); + } + +void execSingle(size_t nwork, std::function func) + { + Distribution dist; + dist.execSingle(nwork, move(func)); + } +void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execStatic(nwork, nthreads, chunksize, move(func)); + } +void execDynamic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execDynamic(nwork, nthreads, chunksize, move(func)); + } +void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function func) + { + Distribution dist; + dist.execGuided(nwork, nthreads, chunksize_min, fact_max, move(func)); + } +void execParallel(size_t nthreads, std::function func) + { + Distribution dist; + dist.execParallel(nthreads, move(func)); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = (nthreads==0) ? get_default_nthreads() : nthreads; + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(lo, hi); + }); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = (nthreads==0) ? 
get_default_nthreads() : nthreads; + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(tid, lo, hi); + }); + } + +#else + +size_t get_default_nthreads() { return 1; } +void set_default_nthreads(size_t /* new_default_nthreads */) {} +size_t max_threads() { return 1; } + +class MyScheduler: public Scheduler + { + private: + size_t nwork_; + + public: + MyScheduler(size_t nwork) : nwork_(nwork) {} + virtual size_t num_threads() const { return 1; } + virtual size_t thread_num() const { return 0; } + virtual Range getNext() + { + Range res(0, nwork_); + nwork_=0; + return res; + } + }; + +void execSingle(size_t nwork, std::function func) + { + MyScheduler sched(nwork); + func(sched); + } +void execStatic(size_t nwork, size_t, size_t, + std::function func) + { + MyScheduler sched(nwork); + func(sched); + } +void execDynamic(size_t nwork, size_t, size_t, + std::function func) + { + MyScheduler sched(nwork); + func(sched); + } +void execGuided(size_t nwork, size_t, size_t, double, + std::function func) + { + MyScheduler sched(nwork); + func(sched); + } +void execParallel(size_t, std::function func) + { + MyScheduler sched(1); + func(sched); + } +void execParallel(size_t work_lo, size_t work_hi, size_t, + std::function func) + { func(work_lo, work_hi); } +void execParallel(size_t work_lo, size_t work_hi, size_t, + std::function func) + { func(0, work_lo, work_hi); } + +#endif + +}} diff --git a/benchees/duccfft/ducc0/infra/threading.h b/benchees/duccfft/ducc0/infra/threading.h new file mode 100644 index 0000000..be14058 --- /dev/null +++ b/benchees/duccfft/ducc0/infra/threading.h @@ -0,0 +1,131 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/threading.h + * Mulithreading support, similar to functionality provided by OpenMP + * + * \copyright Copyright (C) 2019-2021 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +#ifndef DUCC0_THREADING_H +#define DUCC0_THREADING_H + +#include +#include + +namespace ducc0 { + +namespace detail_threading { + +using std::size_t; + +/// Index range describing a chunk of work inside a parallellized loop +struct Range + { + size_t lo, //< first index of the chunk + hi; //< one-past-last index of the chunk + Range() : lo(0), hi(0) {} + Range(size_t lo_, size_t hi_) : lo(lo_), hi(hi_) {} + /// Returns true iff the chunk is not empty + operator bool() const { return hi>lo; } + }; + +/// Class supplied to parallel regions, which allows them to determine their +/// work chunks. 
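// Editorial sketch (not part of the upstream header): typical use of the
// interfaces declared below. execStatic() hands each worker thread a
// Scheduler; getNext() returns half-open [lo,hi) chunks until the thread's
// share of the work is exhausted. The data and thread count are made up.
//
//   std::vector<double> data(1000000, 1.0);
//   ducc0::execStatic(data.size(), /*nthreads=*/4, /*chunksize=*/0,
//     [&](ducc0::Scheduler &sched) {
//       while (auto rng = sched.getNext())
//         for (size_t i = rng.lo; i < rng.hi; ++i)
//           data[i] *= 2.0;
//     });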
+class Scheduler + { + public: + virtual ~Scheduler() {} + /// Returns the number of threads working in this parallel region + virtual size_t num_threads() const = 0; + /// Returns the number of this thread, from the range 0 to num_threads()-1. + virtual size_t thread_num() const = 0; + /// Returns information about the next chunk of work. + /// If this chunk is empty, the work on this thread is done. + virtual Range getNext() = 0; + }; + +/// Returns the maximum number of threads that are supported by the hardware. +/** More threads can be used, but this will probably hurt performance. */ +size_t max_threads(); +void set_default_nthreads(size_t new_default_nthreads); +size_t get_default_nthreads(); + +/// Execute \a func over \a nwork work items, on a single thread. +void execSingle(size_t nwork, + std::function func); +/// Execute \a func over \a nwork work items, on \a nthreads threads. +/** Chunks will have the size \a chunksize, except for the last one which + * may be smaller. + * + * Chunks are statically assigned to threads at startup. */ +void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func); +/// Execute \a func over \a nwork work items, on \a nthreads threads. +/** Chunks will have the size \a chunksize, except for the last one which + * may be smaller. + * + * Chunks are assigned dynamically to threads;whenever a thread is finished + * with its current chunk, it will obtain the next one from the list of + * remaining chunks. */ +void execDynamic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func); +void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function func); +/// Execute \a func on \a nthreads threads. +/** Work subdivision must be organized within \a func. */ +void execParallel(size_t nthreads, std::function func); +/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads. +/** Work items are subdivided fairly among threads. */ +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func); +/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads. +/** Work items are subdivided fairly among threads. */ +inline void execParallel(size_t nwork, size_t nthreads, + std::function func) + { execParallel(0, nwork, nthreads, func); } +/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads. +/** The first argument to \a func is the thread number. + * + * Work items are subdivided fairly among threads. */ +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func); +/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads. +/** The first argument to \a func is the thread number. + * + * Work items are subdivided fairly among threads. 
*/ +inline void execParallel(size_t nwork, size_t nthreads, + std::function func) + { execParallel(0, nwork, nthreads, func); } + +} // end of namespace detail_threading + +using detail_threading::max_threads; +using detail_threading::get_default_nthreads; +using detail_threading::set_default_nthreads; +using detail_threading::Scheduler; +using detail_threading::execSingle; +using detail_threading::execStatic; +using detail_threading::execDynamic; +using detail_threading::execGuided; +using detail_threading::execParallel; + +} // end of namespace ducc0 + +#endif diff --git a/benchees/duccfft/ducc0/infra/useful_macros.h b/benchees/duccfft/ducc0/infra/useful_macros.h new file mode 100644 index 0000000..2de3999 --- /dev/null +++ b/benchees/duccfft/ducc0/infra/useful_macros.h @@ -0,0 +1,21 @@ +#ifndef DUCC0_USEFUL_MACROS_H +#define DUCC0_USEFUL_MACROS_H + +#if defined(__GNUC__) +#define DUCC0_NOINLINE [[gnu::noinline]] +#define DUCC0_RESTRICT __restrict__ +#define DUCC0_PREFETCH_R(addr) __builtin_prefetch(addr); +#define DUCC0_PREFETCH_W(addr) __builtin_prefetch(addr,1); +#elif defined(_MSC_VER) +#define DUCC0_NOINLINE __declspec(noinline) +#define DUCC0_RESTRICT __restrict +#define DUCC0_PREFETCH_R(addr) +#define DUCC0_PREFETCH_W(addr) +#else +#define DUCC0_NOINLINE +#define DUCC0_RESTRICT +#define DUCC0_PREFETCH_R(addr) +#define DUCC0_PREFETCH_W(addr) +#endif + +#endif diff --git a/benchees/duccfft/ducc0/math/cmplx.h b/benchees/duccfft/ducc0/math/cmplx.h new file mode 100644 index 0000000..14232b1 --- /dev/null +++ b/benchees/duccfft/ducc0/math/cmplx.h @@ -0,0 +1,80 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/math/cmplx.h + * Minimalistic complex number class + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef DUCC0_CMPLX_H +#define DUCC0_CMPLX_H + +namespace ducc0 { + +/// Very basic class representing complex numbers +/** Meant exclusively for internal low-level use, e.g. in FFT routines. 
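 *
 *  Illustrative note added in editing (not upstream documentation): only the
 *  operations needed by the FFT kernels are provided, e.g.
 *
 *      Cmplx<double> a(1.0, 2.0), b(0.5, -0.25);
 *      auto c = a*b;                    // ordinary complex product
 *      auto d = a.special_mul<true>(b); // a * conj(b); fwd=false gives a*b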
*/ +template struct Cmplx { + T r, i; + Cmplx() {} + constexpr Cmplx(T r_, T i_) : r(r_), i(i_) {} + void Set(T r_, T i_) { r=r_; i=i_; } + void Set(T r_) { r=r_; i=T(0); } + void Split(T &r_, T &i_) const { r_=r; i_=i; } + void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; } + Cmplx &operator+= (const Cmplx &other) + { r+=other.r; i+=other.i; return *this; } + templateCmplx &operator*= (T2 other) + { r*=other; i*=other; return *this; } + templateCmplx &operator*= (const Cmplx &other) + { + T tmp = r*other.r - i*other.i; + i = r*other.i + i*other.r; + r = tmp; + return *this; + } + Cmplx conj() const { return {r, -i}; } + templateCmplx &operator+= (const Cmplx &other) + { r+=other.r; i+=other.i; return *this; } + templateCmplx &operator-= (const Cmplx &other) + { r-=other.r; i-=other.i; return *this; } + template auto operator* (const T2 &other) const + -> Cmplx + { return {r*other, i*other}; } + template auto operator+ (const Cmplx &other) const + -> Cmplx + { return {r+other.r, i+other.i}; } + template auto operator- (const Cmplx &other) const + -> Cmplx + { return {r-other.r, i-other.i}; } + template auto operator* (const Cmplx &other) const + -> Cmplx + { return {r*other.r-i*other.i, r*other.i + i*other.r}; } + template auto special_mul (const Cmplx &other) const + -> Cmplx + { + using Tres = Cmplx; + return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i) + : Tres(r*other.r-i*other.i, r*other.i+i*other.r); + } + }; + +} + +#endif diff --git a/benchees/duccfft/ducc0/math/unity_roots.h b/benchees/duccfft/ducc0/math/unity_roots.h new file mode 100644 index 0000000..166be42 --- /dev/null +++ b/benchees/duccfft/ducc0/math/unity_roots.h @@ -0,0 +1,214 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Copyright (C) 2019-2021 Max-Planck-Society + Author: Martin Reinecke */ + +#ifndef DUCC0_UNITY_ROOTS_H +#define DUCC0_UNITY_ROOTS_H + +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_unity_roots { + +using namespace std; + +template class UnityRoots + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + static cmplx_ calc(size_t x, size_t n, Thigh ang) + { + x<<=3; + if (x<4*n) // first half + { + if (x<2*n) // first quadrant + { + if (x>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + idx = N-idx; + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +template class MultiExp + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + public: + MultiExp(T ang0, size_t n) + : N(n) + { + Thigh ang = ang0; + size_t nval = n+2; + shift = 1; + while((size_t(1)<>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +} + +using detail_unity_roots::UnityRoots; +using detail_unity_roots::MultiExp; + +} + +#endif diff --git a/benchees/pocketfft_cxx/Makefile.am b/benchees/pocketfft_cxx/Makefile.am new file mode 100644 index 0000000..4f1ecac --- /dev/null +++ b/benchees/pocketfft_cxx/Makefile.am @@ -0,0 +1,8 @@ +PRG=doit + +AM_CPPFLAGS = $(INCLBENCH) + +doit_SOURCES=doit.cc +doit_LDADD=$(LIBBENCH) @FLIBS@ + +include ../Makefile.common diff --git a/benchees/pocketfft_cxx/doit.cc b/benchees/pocketfft_cxx/doit.cc new file mode 100644 index 0000000..3bda34b --- /dev/null +++ b/benchees/pocketfft_cxx/doit.cc @@ -0,0 +1,80 @@ +/* this program is in the public domain */ + +#include "bench-user.h" +#include "pocketfft_hdronly.h" +#include +#include + +using namespace std; +using namespace pocketfft; + +BEGIN_BENCH_DOC +BENCH_DOC("name", "pocketfft_cxx") +BENCH_DOC("author", "Martin Reinecke") +BENCH_DOC("year", "2019") +BENCH_DOC("version", "1.0") +BENCH_DOC("language", "C++") +BENCH_DOC("url", "https://gitlab.mpcdf.mpg.de/mtr/pypocketfft") +BENCH_DOC("url-was-valid-on", "Fri Jul 23 23:06:24 ACST 2020") +BENCH_DOC("copyright", "3 clause BSDL") +END_BENCH_DOC + +int can_do(struct problem *p) +{ + return true; +} + +void copy_h2c(struct problem *p, bench_complex *out) +{ + copy_h2c_1d_fftpack(p, out, -1.0); +} + +void copy_c2h(struct problem *p, bench_complex *in) +{ + copy_c2h_1d_fftpack(p, in, -1.0); +} + + +void setup(struct problem *p) +{ + BENCH_ASSERT(can_do(p)); + // populate the transform cache + doit(1,p); +} + +void doit(int iter, struct problem *p) +{ + static shape_t shape, axes; + static stride_t strides; + shape.resize(p->rank); + strides.resize(p->rank); + axes.resize(p->rank); + for (int i=0; irank; ++i) { + shape[i] = p->n[i]; + axes[i] = i; + } + ptrdiff_t str=sizeof(bench_real); + if (p->kind == PROBLEM_COMPLEX) str*=2; + for (int i=p->rank-1; i>=0; --i) { + strides[i] = str; + str *= shape[i]; + } + + if (p->kind == PROBLEM_COMPLEX) { + auto in = reinterpret_cast *>(p->in); + auto out = reinterpret_cast *>(p->out); + for (int i = 0; i < iter; ++i) { + c2c(shape, strides, strides, 
axes,p->sign==-1,in, out,bench_real(1)); + } + } else { + auto in = reinterpret_cast(p->in); + auto out = reinterpret_cast(p->out); + for (int i = 0; i < iter; ++i) { + r2r_fftpack(shape, strides,strides,axes,p->sign==-1,p->sign==-1,in,out,bench_real(1)); + } + } +} + +void done(struct problem *p) +{ +} diff --git a/benchees/pocketfft_cxx/pocketfft_hdronly.h b/benchees/pocketfft_cxx/pocketfft_hdronly.h new file mode 100644 index 0000000..4b3095a --- /dev/null +++ b/benchees/pocketfft_cxx/pocketfft_hdronly.h @@ -0,0 +1,3578 @@ +/* +This file is part of pocketfft. + +Copyright (C) 2010-2021 Max-Planck-Society +Copyright (C) 2019-2020 Peter Bell + +For the odd-sized DCT-IV transforms: + Copyright (C) 2003, 2007-14 Matteo Frigo + Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + +Authors: Martin Reinecke, Peter Bell + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef POCKETFFT_HDRONLY_H +#define POCKETFFT_HDRONLY_H + +#ifndef __cplusplus +#error This file is C++ and requires a C++ compiler. +#endif + +#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L) +#error This file requires at least C++11 support. 
+#endif + +#ifndef POCKETFFT_CACHE_SIZE +#define POCKETFFT_CACHE_SIZE 16 +#endif + +#include +#include +#include +#include +#include +#include +#include +#if POCKETFFT_CACHE_SIZE!=0 +#include +#include +#endif + +#ifndef POCKETFFT_NO_MULTITHREADING +#include +#include +#include +#include +#include +#include +#include + +#ifdef POCKETFFT_PTHREADS +# include +#endif +#endif + +#if defined(__GNUC__) +#define POCKETFFT_NOINLINE __attribute__((noinline)) +#define POCKETFFT_RESTRICT __restrict__ +#elif defined(_MSC_VER) +#define POCKETFFT_NOINLINE __declspec(noinline) +#define POCKETFFT_RESTRICT __restrict +#else +#define POCKETFFT_NOINLINE +#define POCKETFFT_RESTRICT +#endif + +namespace pocketfft { + +namespace detail { +using std::size_t; +using std::ptrdiff_t; + +// Always use std:: for functions +template T cos(T) = delete; +template T sin(T) = delete; +template T sqrt(T) = delete; + +using shape_t = std::vector; +using stride_t = std::vector; + +constexpr bool FORWARD = true, + BACKWARD = false; + +// only enable vector support for gcc>=5.0 and clang>=5.0 +#ifndef POCKETFFT_NO_VECTORS +#define POCKETFFT_NO_VECTORS +#if defined(__INTEL_COMPILER) +// do nothing. This is necessary because this compiler also sets __GNUC__. +#elif defined(__clang__) +// AppleClang has their own version numbering +#ifdef __apple_build_version__ +# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) +# undef POCKETFFT_NO_VECTORS +# endif +#elif __clang_major__ >= 5 +# undef POCKETFFT_NO_VECTORS +#endif +#elif defined(__GNUC__) +#if __GNUC__>=5 +#undef POCKETFFT_NO_VECTORS +#endif +#endif +#endif + +template struct VLEN { static constexpr size_t val=1; }; + +#ifndef POCKETFFT_NO_VECTORS +#if (defined(__AVX512F__)) +template<> struct VLEN { static constexpr size_t val=16; }; +template<> struct VLEN { static constexpr size_t val=8; }; +#elif (defined(__AVX__)) +template<> struct VLEN { static constexpr size_t val=8; }; +template<> struct VLEN { static constexpr size_t val=4; }; +#elif (defined(__SSE2__)) +template<> struct VLEN { static constexpr size_t val=4; }; +template<> struct VLEN { static constexpr size_t val=2; }; +#elif (defined(__VSX__)) +template<> struct VLEN { static constexpr size_t val=4; }; +template<> struct VLEN { static constexpr size_t val=2; }; +#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) +template<> struct VLEN { static constexpr size_t val=4; }; +template<> struct VLEN { static constexpr size_t val=2; }; +#else +#define POCKETFFT_NO_VECTORS +#endif +#endif + +#if __cplusplus >= 201703L +inline void *aligned_alloc(size_t align, size_t size) + { + // aligned_alloc() requires that the requested size is a multiple of "align" + void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1))); + if (!ptr) throw std::bad_alloc(); + return ptr; + } +inline void aligned_dealloc(void *ptr) + { free(ptr); } +#else // portable emulation +inline void *aligned_alloc(size_t align, size_t size) + { + align = std::max(align, alignof(max_align_t)); + void *ptr = malloc(size+align); + if (!ptr) throw std::bad_alloc(); + void *res = reinterpret_cast + ((reinterpret_cast(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align)); + (reinterpret_cast(res))[-1] = ptr; + return res; + } +inline void aligned_dealloc(void *ptr) + { if (ptr) free((reinterpret_cast(ptr))[-1]); } +#endif + +template class arr + { + private: + T *p; + size_t sz; + +#if defined(POCKETFFT_NO_VECTORS) + static T *ralloc(size_t num) + { + if (num==0) return nullptr; + void *res = malloc(num*sizeof(T)); + if (!res) throw 
std::bad_alloc(); + return reinterpret_cast(res); + } + static void dealloc(T *ptr) + { free(ptr); } +#else + static T *ralloc(size_t num) + { + if (num==0) return nullptr; + void *ptr = aligned_alloc(64, num*sizeof(T)); + return static_cast(ptr); + } + static void dealloc(T *ptr) + { aligned_dealloc(ptr); } +#endif + + public: + arr() : p(0), sz(0) {} + arr(size_t n) : p(ralloc(n)), sz(n) {} + arr(arr &&other) + : p(other.p), sz(other.sz) + { other.p=nullptr; other.sz=0; } + ~arr() { dealloc(p); } + + void resize(size_t n) + { + if (n==sz) return; + dealloc(p); + p = ralloc(n); + sz = n; + } + + T &operator[](size_t idx) { return p[idx]; } + const T &operator[](size_t idx) const { return p[idx]; } + + T *data() { return p; } + const T *data() const { return p; } + + size_t size() const { return sz; } + }; + +template struct cmplx { + T r, i; + cmplx() {} + cmplx(T r_, T i_) : r(r_), i(i_) {} + void Set(T r_, T i_) { r=r_; i=i_; } + void Set(T r_) { r=r_; i=T(0); } + cmplx &operator+= (const cmplx &other) + { r+=other.r; i+=other.i; return *this; } + templatecmplx &operator*= (T2 other) + { r*=other; i*=other; return *this; } + templatecmplx &operator*= (const cmplx &other) + { + T tmp = r*other.r - i*other.i; + i = r*other.i + i*other.r; + r = tmp; + return *this; + } + templatecmplx &operator+= (const cmplx &other) + { r+=other.r; i+=other.i; return *this; } + templatecmplx &operator-= (const cmplx &other) + { r-=other.r; i-=other.i; return *this; } + template auto operator* (const T2 &other) const + -> cmplx + { return {r*other, i*other}; } + template auto operator+ (const cmplx &other) const + -> cmplx + { return {r+other.r, i+other.i}; } + template auto operator- (const cmplx &other) const + -> cmplx + { return {r-other.r, i-other.i}; } + template auto operator* (const cmplx &other) const + -> cmplx + { return {r*other.r-i*other.i, r*other.i + i*other.r}; } + template auto special_mul (const cmplx &other) const + -> cmplx + { + using Tres = cmplx; + return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i) + : Tres(r*other.r-i*other.i, r*other.i+i*other.r); + } +}; +template inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } +template inline void PMINPLACE(T &a, T &b) + { T t = a; a+=b; b=t-b; } +template inline void MPINPLACE(T &a, T &b) + { T t = a; a-=b; b=t+b; } +template cmplx conj(const cmplx &a) + { return {a.r, -a.i}; } +template void special_mul (const cmplx &v1, const cmplx &v2, cmplx &res) + { + res = fwd ? cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } + +template void ROT90(cmplx &a) + { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } +template void ROTX90(cmplx &a) + { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? 
a.i : -a.i; a.i=tmp_; } + +// +// twiddle factor section +// +template class sincos_2pibyn + { + private: + using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type; + size_t N, mask, shift; + arr> v1, v2; + + static cmplx calc(size_t x, size_t n, Thigh ang) + { + x<<=3; + if (x<4*n) // first half + { + if (x<2*n) // first quadrant + { + if (x(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang)); + return cmplx(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang)); + } + else // second quadrant + { + x-=2*n; + if (x(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang)); + return cmplx(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang)); + } + } + else + { + x=8*n-x; + if (x<2*n) // third quadrant + { + if (x(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang)); + return cmplx(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang)); + } + else // fourth quadrant + { + x-=2*n; + if (x(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang)); + return cmplx(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang)); + } + } + } + + public: + POCKETFFT_NOINLINE sincos_2pibyn(size_t n) + : N(n) + { + constexpr auto pi = 3.141592653589793238462643383279502884197L; + Thigh ang = Thigh(0.25L*pi/n); + size_t nval = (n+2)/2; + shift = 1; + while((size_t(1)< operator[](size_t idx) const + { + if (2*idx<=N) + { + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return cmplx(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + idx = N-idx; + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return cmplx(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +struct util // hack to avoid duplicate symbols + { + static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n) + { + size_t res=1; + while ((n&1)==0) + { res=2; n>>=1; } + for (size_t x=3; x*x<=n; x+=2) + while ((n%x)==0) + { res=x; n/=x; } + if (n>1) res=n; + return res; + } + + static POCKETFFT_NOINLINE double cost_guess (size_t n) + { + constexpr double lfp=1.1; // penalty for non-hardcoded larger factors + size_t ni=n; + double result=0.; + while ((n&1)==0) + { result+=2; n>>=1; } + for (size_t x=3; x*x<=n; x+=2) + while ((n%x)==0) + { + result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors + n/=x; + } + if (n>1) result+=(n<=5) ? 
double(n) : lfp*double(n); + return result*double(ni); + } + + /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ + static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n) + { + if (n<=12) return n; + + size_t bestfac=2*n; + for (size_t f11=1; f11n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + /* returns the smallest composite of 2, 3, 5 which is >= n */ + static POCKETFFT_NOINLINE size_t good_size_real(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f5=1; f5n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + static size_t prod(const shape_t &shape) + { + size_t res=1; + for (auto sz: shape) + res*=sz; + return res; + } + + static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, bool inplace) + { + auto ndim = shape.size(); + if (ndim<1) throw std::runtime_error("ndim must be >= 1"); + if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim)) + throw std::runtime_error("stride dimension mismatch"); + if (inplace && (stride_in!=stride_out)) + throw std::runtime_error("stride mismatch"); + } + + static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, bool inplace, + const shape_t &axes) + { + sanity_check(shape, stride_in, stride_out, inplace); + auto ndim = shape.size(); + shape_t tmp(ndim,0); + for (auto ax : axes) + { + if (ax>=ndim) throw std::invalid_argument("bad axis number"); + if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly"); + } + } + + static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, bool inplace, + size_t axis) + { + sanity_check(shape, stride_in, stride_out, inplace); + if (axis>=shape.size()) throw std::invalid_argument("bad axis number"); + } + +#ifdef POCKETFFT_NO_MULTITHREADING + static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/, + size_t /*axis*/, size_t /*vlen*/) + { return 1; } +#else + static size_t thread_count (size_t nthreads, const shape_t &shape, + size_t axis, size_t vlen) + { + if (nthreads==1) return 1; + size_t size = prod(shape); + size_t parallel = size / (shape[axis] * vlen); + if (shape[axis] < 1000) + parallel /= 4; + size_t max_threads = nthreads == 0 ? 
+ std::thread::hardware_concurrency() : nthreads; + return std::max(size_t(1), std::min(parallel, max_threads)); + } +#endif + }; + +namespace threading { + +#ifdef POCKETFFT_NO_MULTITHREADING + +constexpr inline size_t thread_id() { return 0; } +constexpr inline size_t num_threads() { return 1; } + +template +void thread_map(size_t /* nthreads */, Func f) + { f(); } + +#else + +inline size_t &thread_id() + { + static thread_local size_t thread_id_=0; + return thread_id_; + } +inline size_t &num_threads() + { + static thread_local size_t num_threads_=1; + return num_threads_; + } +static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency()); + +class latch + { + std::atomic num_left_; + std::mutex mut_; + std::condition_variable completed_; + using lock_t = std::unique_lock; + + public: + latch(size_t n): num_left_(n) {} + + void count_down() + { + lock_t lock(mut_); + if (--num_left_) + return; + completed_.notify_all(); + } + + void wait() + { + lock_t lock(mut_); + completed_.wait(lock, [this]{ return is_ready(); }); + } + bool is_ready() { return num_left_ == 0; } + }; + +template class concurrent_queue + { + std::queue q_; + std::mutex mut_; + std::atomic size_; + using lock_t = std::lock_guard; + + public: + + void push(T val) + { + lock_t lock(mut_); + ++size_; + q_.push(std::move(val)); + } + + bool try_pop(T &val) + { + if (size_ == 0) return false; + lock_t lock(mut_); + // Queue might have been emptied while we acquired the lock + if (q_.empty()) return false; + + val = std::move(q_.front()); + --size_; + q_.pop(); + return true; + } + + bool empty() const { return size_==0; } + }; + +// C++ allocator with support for over-aligned types +template struct aligned_allocator + { + using value_type = T; + template + aligned_allocator(const aligned_allocator&) {} + aligned_allocator() = default; + + T *allocate(size_t n) + { + void* mem = aligned_alloc(alignof(T), n*sizeof(T)); + return static_cast(mem); + } + + void deallocate(T *p, size_t /*n*/) + { aligned_dealloc(p); } + }; + +class thread_pool + { + // A reasonable guess, probably close enough for most hardware + static constexpr size_t cache_line_size = 64; + struct alignas(cache_line_size) worker + { + std::thread thread; + std::condition_variable work_ready; + std::mutex mut; + std::atomic_flag busy_flag = ATOMIC_FLAG_INIT; + std::function work; + + void worker_main( + std::atomic &shutdown_flag, + std::atomic &unscheduled_tasks, + concurrent_queue> &overflow_work) + { + using lock_t = std::unique_lock; + bool expect_work = true; + while (!shutdown_flag || expect_work) + { + std::function local_work; + if (expect_work || unscheduled_tasks == 0) + { + lock_t lock(mut); + // Wait until there is work to be executed + work_ready.wait(lock, [&]{ return (work || shutdown_flag); }); + local_work.swap(work); + expect_work = false; + } + + bool marked_busy = false; + if (local_work) + { + marked_busy = true; + local_work(); + } + + if (!overflow_work.empty()) + { + if (!marked_busy && busy_flag.test_and_set()) + { + expect_work = true; + continue; + } + marked_busy = true; + + while (overflow_work.try_pop(local_work)) + { + --unscheduled_tasks; + local_work(); + } + } + + if (marked_busy) busy_flag.clear(); + } + } + }; + + concurrent_queue> overflow_work_; + std::mutex mut_; + std::vector> workers_; + std::atomic shutdown_; + std::atomic unscheduled_tasks_; + using lock_t = std::lock_guard; + + void create_threads() + { + lock_t lock(mut_); + size_t nthreads=workers_.size(); + for (size_t i=0; 
ibusy_flag.clear(); + worker->work = nullptr; + worker->thread = std::thread([worker, this] + { + worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_); + }); + } + catch (...) + { + shutdown_locked(); + throw; + } + } + } + + void shutdown_locked() + { + shutdown_ = true; + for (auto &worker : workers_) + worker.work_ready.notify_all(); + + for (auto &worker : workers_) + if (worker.thread.joinable()) + worker.thread.join(); + } + + public: + explicit thread_pool(size_t nthreads): + workers_(nthreads) + { create_threads(); } + + thread_pool(): thread_pool(max_threads) {} + + ~thread_pool() { shutdown(); } + + void submit(std::function work) + { + lock_t lock(mut_); + if (shutdown_) + throw std::runtime_error("Work item submitted after shutdown"); + + ++unscheduled_tasks_; + + // First check for any idle workers and wake those + for (auto &worker : workers_) + if (!worker.busy_flag.test_and_set()) + { + --unscheduled_tasks_; + { + lock_t lock(worker.mut); + worker.work = std::move(work); + } + worker.work_ready.notify_one(); + return; + } + + // If no workers were idle, push onto the overflow queue for later + overflow_work_.push(std::move(work)); + } + + void shutdown() + { + lock_t lock(mut_); + shutdown_locked(); + } + + void restart() + { + shutdown_ = false; + create_threads(); + } + }; + +inline thread_pool & get_pool() + { + static thread_pool pool; +#ifdef POCKETFFT_PTHREADS + static std::once_flag f; + std::call_once(f, + []{ + pthread_atfork( + +[]{ get_pool().shutdown(); }, // prepare + +[]{ get_pool().restart(); }, // parent + +[]{ get_pool().restart(); } // child + ); + }); +#endif + + return pool; + } + +/** Map a function f over nthreads */ +template +void thread_map(size_t nthreads, Func f) + { + if (nthreads == 0) + nthreads = max_threads; + + if (nthreads == 1) + { f(); return; } + + auto & pool = get_pool(); + latch counter(nthreads); + std::exception_ptr ex; + std::mutex ex_mut; + for (size_t i=0; i lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + } + counter.wait(); + if (ex) + std::rethrow_exception(ex); + } + +#endif + +} + +// +// complex FFTPACK transforms +// + +template class cfftp + { + private: + struct fctdata + { + size_t fct; + cmplx *tw, *tws; + }; + + size_t length; + arr> mem; + std::vector fact; + + void add_factor(size_t factor) + { fact.push_back({factor, nullptr, nullptr}); } + +template void pass2 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+2*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1)); + } + } + } + +#define POCKETFFT_PREP3(idx) \ + T t0 = CC(idx,0,k), t1, t2; \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + CH(idx,k,0)=t0+t1; +#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ + { \ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + } +#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ + { \ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } +template void pass3 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + constexpr 
T0 tw1r=-0.5, + tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L); + + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+3*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k void pass4 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+4*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k(t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); + } + else + for (size_t k=0; k(t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); + } + for (size_t i=1; i(t4); + CH(i,k,0) = t2+t3; + special_mul(t1+t4,WA(0,i),CH(i,k,1)); + special_mul(t2-t3,WA(1,i),CH(i,k,2)); + special_mul(t1-t4,WA(2,i),CH(i,k,3)); + } + } + } + +#define POCKETFFT_PREP5(idx) \ + T t0 = CC(idx,0,k), t1, t2, t3, t4; \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + CH(idx,k,0).r=t0.r+t1.r+t2.r; \ + CH(idx,k,0).i=t0.i+t1.i+t2.i; + +#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ + { \ + T ca,cb; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + } + +#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ + { \ + T ca,cb,da,db; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } +template void pass5 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), + tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L), + tw2r= T0(-0.8090169943749474241022934171828191L), + tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L); + + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+5*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + +template void pass7(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), + tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L), + tw2r= T0(-0.2225209339563144042889025644967948L), + tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L), + tw3r= T0(-0.9009688679024191262361023195074451L), + tw3i= (fwd ? 
-1 : 1) * T0(0.433883739117558120475768332848359L); + + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+7*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k void ROTX45(T &a) const + { + constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); + if (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } + else + { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } + } +template void ROTX135(T &a) const + { + constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); + if (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } + else + { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } + } + +template void pass8 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+8*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); + } + else + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); + } + for (size_t i=1; i(a7); + PMINPLACE(a1,a3); + ROTX90(a3); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + PM(a0,a4,CC(i,0,k),CC(i,4,k)); + PM(a2,a6,CC(i,2,k),CC(i,6,k)); + PMINPLACE(a0,a2); + CH(i,k,0) = a0+a1; + special_mul(a0-a1,WA(3,i),CH(i,k,4)); + special_mul(a2+a3,WA(1,i),CH(i,k,2)); + special_mul(a2-a3,WA(5,i),CH(i,k,6)); + ROTX90(a6); + PMINPLACE(a4,a6); + special_mul(a4+a5,WA(0,i),CH(i,k,1)); + special_mul(a4-a5,WA(4,i),CH(i,k,5)); + special_mul(a6+a7,WA(2,i),CH(i,k,3)); + special_mul(a6-a7,WA(6,i),CH(i,k,7)); + } + } + } + + +#define POCKETFFT_PREP11(idx) \ + T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; + +#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ + { \ + T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ + cb; \ + cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ + cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + { \ + T da,db; \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + 
special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + +template void pass11 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa) const + { + constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), + tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L), + tw2r= T0(0.4154150130018864255292741492296232L), + tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L), + tw3r= T0(-0.1423148382732851404437926686163697L), + tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L), + tw4r= T0(-0.6548607339452850640569250724662936L), + tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L), + tw5r= T0(-0.9594929736144973898903680570663277L), + tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L); + + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+11*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + + if (ido==1) + for (size_t k=0; k void passg (size_t ido, size_t ip, + size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const cmplx * POCKETFFT_RESTRICT wa, + const cmplx * POCKETFFT_RESTRICT csarr) const + { + const size_t cdim=ip; + size_t ipph = (ip+1)/2; + size_t idl1 = ido*l1; + + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+l1*c)]; }; + auto CX2 = [cc, idl1](size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch, idl1](size_t a, size_t b) -> const T& + { return ch[a+idl1*b]; }; + + arr> wal(ip); + wal[0] = cmplx(1., 0.); + for (size_t i=1; i(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i); + + for (size_t k=0; kip) iwal-=ip; + cmplx xwal=wal[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + cmplx xwal2=wal[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + cmplx xwal=wal[iwal]; + for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); + idij=(jc-1)*(ido-1)+i-1; + special_mul(x2,wa[idij],CX(i,k,jc)); + } + } + } + } + +template void pass_all(T c[], T0 fct) const + { + if (length==1) { c[0]*=fct; return; } + size_t l1=1; + arr ch(length); + T *p1=c, *p2=ch.data(); + + for(size_t k1=0; k1 (ido, l1, p1, p2, fact[k1].tw); + else if(ip==8) + pass8(ido, l1, p1, p2, fact[k1].tw); + else if(ip==2) + pass2(ido, l1, p1, p2, fact[k1].tw); + else if(ip==3) + pass3 (ido, l1, p1, p2, fact[k1].tw); + else if(ip==5) + pass5 (ido, l1, p1, p2, fact[k1].tw); + else if(ip==7) + pass7 (ido, l1, p1, p2, fact[k1].tw); + else if(ip==11) + pass11 (ido, l1, p1, p2, fact[k1].tw); + else + { + passg(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); + std::swap(p1,p2); + } + std::swap(p1,p2); + l1=l2; + } + if (p1!=c) + { + if (fct!=1.) + for (size_t i=0; i void exec(T c[], T0 fct, bool fwd) const + { fwd ? 
pass_all(c, fct) : pass_all(c, fct); } + + private: + POCKETFFT_NOINLINE void factorize() + { + size_t len=length; + while ((len&7)==0) + { add_factor(8); len>>=3; } + while ((len&3)==0) + { add_factor(4); len>>=2; } + if ((len&1)==0) + { + len>>=1; + // factor 2 should be at the front of the factor list + add_factor(2); + std::swap(fact[0].fct, fact.back().fct); + } + for (size_t divisor=3; divisor*divisor<=len; divisor+=2) + while ((len%divisor)==0) + { + add_factor(divisor); + len/=divisor; + } + if (len>1) add_factor(len); + } + + size_t twsize() const + { + size_t twsize=0, l1=1; + for (size_t k=0; k11) + twsize+=ip; + l1*=ip; + } + return twsize; + } + + void comp_twiddle() + { + sincos_2pibyn twiddle(length); + size_t l1=1; + size_t memofs=0; + for (size_t k=0; k11) + { + fact[k].tws=mem.data()+memofs; + memofs+=ip; + for (size_t j=0; j class rfftp + { + private: + struct fctdata + { + size_t fct; + T0 *tw, *tws; + }; + + size_t length; + arr mem; + std::vector fact; + + void add_factor(size_t factor) + { fact.push_back({factor, nullptr, nullptr}); } + +/* (a+ib) = conj(c+id) * (e+if) */ +template inline void MULPM + (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const + { a=c*e+d*f; b=c*f-d*e; } + +template void radf2 (size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+2*c)]; }; + + for (size_t k=0; k void radf3(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+3*c)]; }; + + for (size_t k=0; k void radf4(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+4*c)]; }; + + for (size_t k=0; k void radf5(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), + ti11= T0(0.9510565162951535721164393333793821L), + tr12= T0(-0.8090169943749474241022934171828191L), + ti12= T0(0.5877852522924731291687059546390728L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+5*c)]; }; + + for (size_t k=0; k void radfg(size_t ido, size_t ip, size_t l1, + T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const + { + const size_t cdim=ip; + size_t ipph=(ip+1)/2; + size_t idl1 = ido*l1; + + auto 
CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1] (size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1] (size_t a, size_t b) -> T& + { return ch[a+idl1*b]; }; + + if (ido>1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + T0 ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ik void radb2(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+2*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k void radb3(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+3*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k void radb4(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+4*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k void radb5(size_t ido, size_t l1, + const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa) const + { + constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), + ti11= T0(0.9510565162951535721164393333793821L), + tr12= T0(-0.8090169943749474241022934171828191L), + ti12= T0(0.5877852522924731291687059546390728L); + + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+5*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k void radbg(size_t ido, size_t ip, size_t l1, + T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, + const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const + { + const size_t cdim=ip; + size_t ipph=(ip+1)/ 2; + size_t idl1 = ido*l1; + + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return 
ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1](size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1](size_t a, size_t b) -> T& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iang-=ip; + T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + T0 war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; ik void copy_and_norm(T *c, T *p1, T0 fct) const + { + if (p1!=c) + { + if (fct!=1.) + for (size_t i=0; i void exec(T c[], T0 fct, bool r2hc) const + { + if (length==1) { c[0]*=fct; return; } + size_t nf=fact.size(); + arr ch(length); + T *p1=c, *p2=ch.data(); + + if (r2hc) + for(size_t k1=0, l1=length; k1>=2; } + if ((len%2)==0) + { + len>>=1; + // factor 2 should be at the front of the factor list + add_factor(2); + std::swap(fact[0].fct, fact.back().fct); + } + for (size_t divisor=3; divisor*divisor<=len; divisor+=2) + while ((len%divisor)==0) + { + add_factor(divisor); + len/=divisor; + } + if (len>1) add_factor(len); + } + + size_t twsize() const + { + size_t twsz=0, l1=1; + for (size_t k=0; k5) twsz+=2*ip; + l1*=ip; + } + return twsz; + } + + void comp_twiddle() + { + sincos_2pibyn twid(length); + size_t l1=1; + T0 *ptr=mem.data(); + for (size_t k=0; k5) // special factors required by *g functions + { + fact[k].tws=ptr; ptr+=2*ip; + fact[k].tws[0] = 1.; + fact[k].tws[1] = 0.; + for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) + { + fact[k].tws[i ] = twid[i/2*(length/ip)].r; + fact[k].tws[i+1] = twid[i/2*(length/ip)].i; + fact[k].tws[ic] = twid[i/2*(length/ip)].r; + fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i; + } + } + l1*=ip; + } + } + + public: + POCKETFFT_NOINLINE rfftp(size_t length_) + : length(length_) + { + if (length==0) throw std::runtime_error("zero-length FFT requested"); + if (length==1) return; + factorize(); + mem.resize(twsize()); + comp_twiddle(); + } +}; + +// +// complex Bluestein transforms +// + +template class fftblue + { + private: + size_t n, n2; + cfftp plan; + arr> mem; + cmplx *bk, *bkf; + + template void fft(cmplx c[], T0 fct) const + { + arr> akf(n2); + + /* initialize a_k and FFT it */ + for (size_t m=0; m(c[m],bk[m],akf[m]); + auto zero = akf[0]*T0(0); + for (size_t m=n; m(bkf[0]); + for (size_t m=1; m<(n2+1)/2; ++m) + { + akf[m] = akf[m].template special_mul(bkf[m]); + akf[n2-m] = akf[n2-m].template special_mul(bkf[m]); + } + if ((n2&1)==0) + akf[n2/2] = akf[n2/2].template special_mul(bkf[n2/2]); + + /* inverse FFT */ + plan.exec (akf.data(),1.,false); + + /* multiply by b_k */ + for (size_t m=0; m(bk[m])*fct; + } + + public: + POCKETFFT_NOINLINE fftblue(size_t length) + : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1), + bk(mem.data()), bkf(mem.data()+n) + { + /* initialize b_k */ + sincos_2pibyn tmp(2*n); + bk[0].Set(1, 0); + + size_t coeff=0; + for (size_t m=1; m=2*n) coeff-=2*n; + bk[m] = tmp[coeff]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
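+     (This is the core of Bluestein's algorithm: the arbitrary-length DFT is
+     recast as a convolution with the chirp sequence b_k, and that convolution
+     is evaluated with FFTs of the highly composite length n2 >= 2*n-1 chosen
+     in the constructor above.)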
*/ + arr> tbkf(n2); + T0 xn2 = T0(1)/T0(n2); + tbkf[0] = bk[0]*xn2; + for (size_t m=1; m void exec(cmplx c[], T0 fct, bool fwd) const + { fwd ? fft(c,fct) : fft(c,fct); } + + template void exec_r(T c[], T0 fct, bool fwd) + { + arr> tmp(n); + if (fwd) + { + auto zero = T0(0)*c[0]; + for (size_t m=0; m(tmp.data(),fct); + c[0] = tmp[0].r; + std::copy_n (&tmp[1].r, n-1, &c[1]); + } + else + { + tmp[0].Set(c[0],c[0]*0); + std::copy_n (c+1, n-1, &tmp[1].r); + if ((n&1)==0) tmp[n/2].i=T0(0)*c[0]; + for (size_t m=1; 2*m(tmp.data(),fct); + for (size_t m=0; m class pocketfft_c + { + private: + std::unique_ptr> packplan; + std::unique_ptr> blueplan; + size_t len; + + public: + POCKETFFT_NOINLINE pocketfft_c(size_t length) + : len(length) + { + if (length==0) throw std::runtime_error("zero-length FFT requested"); + size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); + if (tmp*tmp <= length) + { + packplan=std::unique_ptr>(new cfftp(length)); + return; + } + double comp1 = util::cost_guess(length); + double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2>(new fftblue(length)); + else + packplan=std::unique_ptr>(new cfftp(length)); + } + + template POCKETFFT_NOINLINE void exec(cmplx c[], T0 fct, bool fwd) const + { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); } + + size_t length() const { return len; } + }; + +// +// flexible (FFTPACK/Bluestein) real-valued 1D transform +// + +template class pocketfft_r + { + private: + std::unique_ptr> packplan; + std::unique_ptr> blueplan; + size_t len; + + public: + POCKETFFT_NOINLINE pocketfft_r(size_t length) + : len(length) + { + if (length==0) throw std::runtime_error("zero-length FFT requested"); + size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); + if (tmp*tmp <= length) + { + packplan=std::unique_ptr>(new rfftp(length)); + return; + } + double comp1 = 0.5*util::cost_guess(length); + double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2>(new fftblue(length)); + else + packplan=std::unique_ptr>(new rfftp(length)); + } + + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const + { packplan ? 
packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); } + + size_t length() const { return len; } + }; + + +// +// sine/cosine transforms +// + +template class T_dct1 + { + private: + pocketfft_r fftplan; + + public: + POCKETFFT_NOINLINE T_dct1(size_t length) + : fftplan(2*(length-1)) {} + + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=fftplan.length(), n=N/2+1; + if (ortho) + { c[0]*=sqrt2; c[n-1]*=sqrt2; } + arr tmp(N); + tmp[0] = c[0]; + for (size_t i=1; i class T_dst1 + { + private: + pocketfft_r fftplan; + + public: + POCKETFFT_NOINLINE T_dst1(size_t length) + : fftplan(2*(length+1)) {} + + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/) const + { + size_t N=fftplan.length(), n=N/2-1; + arr tmp(N); + tmp[0] = tmp[n+1] = c[0]*0; + for (size_t i=0; i class T_dcst23 + { + private: + pocketfft_r fftplan; + std::vector twiddle; + + public: + POCKETFFT_NOINLINE T_dcst23(size_t length) + : fftplan(length), twiddle(length) + { + sincos_2pibyn tw(4*length); + for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, + int type, bool cosine) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + size_t NS2 = (N+1)/2; + if (type==2) + { + if (!cosine) + for (size_t k=1; k class T_dcst4 + { + private: + size_t N; + std::unique_ptr> fft; + std::unique_ptr> rfft; + arr> C2; + + public: + POCKETFFT_NOINLINE T_dcst4(size_t length) + : N(length), + fft((N&1) ? nullptr : new pocketfft_c(N/2)), + rfft((N&1)? new pocketfft_r(N) : nullptr), + C2((N&1) ? 0 : N/2) + { + if ((N&1)==0) + { + sincos_2pibyn tw(16*N); + for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine) const + { + size_t n2 = N/2; + if (!cosine) + for (size_t k=0, kc=N-1; k y(N); + { + size_t i=0, m=n2; + for (; mexec(y.data(), fct, true); + { + auto SGN = [](size_t i) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + return (i&2) ? 
-sqrt2 : sqrt2; + }; + c[n2] = y[0]*SGN(n2+1); + size_t i=0, i1=1, k=1; + for (; k> y(n2); + for(size_t i=0; iexec(y.data(), fct, true); + for(size_t i=0, ic=n2-1; i std::shared_ptr get_plan(size_t length) + { +#if POCKETFFT_CACHE_SIZE==0 + return std::make_shared(length); +#else + constexpr size_t nmax=POCKETFFT_CACHE_SIZE; + static std::array, nmax> cache; + static std::array last_access{{0}}; + static size_t access_counter = 0; + static std::mutex mut; + + auto find_in_cache = [&]() -> std::shared_ptr + { + for (size_t i=0; ilength()==length)) + { + // no need to update if this is already the most recent entry + if (last_access[i]!=access_counter) + { + last_access[i] = ++access_counter; + // Guard against overflow + if (access_counter == 0) + last_access.fill(0); + } + return cache[i]; + } + + return nullptr; + }; + + { + std::lock_guard lock(mut); + auto p = find_in_cache(); + if (p) return p; + } + auto plan = std::make_shared(length); + { + std::lock_guard lock(mut); + auto p = find_in_cache(); + if (p) return p; + + size_t lru = 0; + for (size_t i=1; i class cndarr: public arr_info + { + protected: + const char *d; + + public: + cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_) + : arr_info(shape_, stride_), + d(reinterpret_cast(data_)) {} + const T &operator[](ptrdiff_t ofs) const + { return *reinterpret_cast(d+ofs); } + }; + +template class ndarr: public cndarr + { + public: + ndarr(void *data_, const shape_t &shape_, const stride_t &stride_) + : cndarr::cndarr(const_cast(data_), shape_, stride_) + {} + T &operator[](ptrdiff_t ofs) + { return *reinterpret_cast(const_cast(cndarr::d+ofs)); } + }; + +template class multi_iter + { + private: + shape_t pos; + const arr_info &iarr, &oarr; + ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o; + size_t idim, rem; + + void advance_i() + { + for (int i_=int(pos.size())-1; i_>=0; --i_) + { + auto i = size_t(i_); + if (i==idim) continue; + p_ii += iarr.stride(i); + p_oi += oarr.stride(i); + if (++pos[i] < iarr.shape(i)) + return; + pos[i] = 0; + p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i); + p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i); + } + } + + public: + multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_) + : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0), + str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)), + idim(idim_), rem(iarr.size()/iarr.shape(idim)) + { + auto nshares = threading::num_threads(); + if (nshares==1) return; + if (nshares==0) throw std::runtime_error("can't run with zero threads"); + auto myshare = threading::thread_id(); + if (myshare>=nshares) throw std::runtime_error("impossible share requested"); + size_t nbase = rem/nshares; + size_t additional = rem%nshares; + size_t lo = myshare*nbase + ((myshare=0; --i_) + { + auto i = size_t(i_); + p += arr.stride(i); + if (++pos[i] < arr.shape(i)) + return; + pos[i] = 0; + p -= ptrdiff_t(arr.shape(i))*arr.stride(i); + } + } + ptrdiff_t ofs() const { return p; } + size_t remaining() const { return rem; } + }; + +class rev_iter + { + private: + shape_t pos; + const arr_info &arr; + std::vector rev_axis; + std::vector rev_jump; + size_t last_axis, last_size; + shape_t shp; + ptrdiff_t p, rp; + size_t rem; + + public: + rev_iter(const arr_info &arr_, const shape_t &axes) + : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0), + rev_jump(arr_.ndim(), 1), p(0), rp(0) + { + for (auto ax: axes) + rev_axis[ax]=1; + last_axis = axes.back(); + last_size = arr.shape(last_axis)/2 + 1; + shp = arr.shape(); + 
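+      // Note: only the first shape(last_axis)/2+1 entries of the last
+      // transformed axis are visited; ofs() addresses the element itself and
+      // rev_ofs() its index-mirrored counterpart along the chosen axes
+      // (useful for exploiting the Hermitian symmetry of real-valued
+      // transforms).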
shp[last_axis] = last_size; + rem=1; + for (auto i: shp) + rem *= i; + } + void advance() + { + --rem; + for (int i_=int(pos.size())-1; i_>=0; --i_) + { + auto i = size_t(i_); + p += arr.stride(i); + if (!rev_axis[i]) + rp += arr.stride(i); + else + { + rp -= arr.stride(i); + if (rev_jump[i]) + { + rp += ptrdiff_t(arr.shape(i))*arr.stride(i); + rev_jump[i] = 0; + } + } + if (++pos[i] < shp[i]) + return; + pos[i] = 0; + p -= ptrdiff_t(shp[i])*arr.stride(i); + if (rev_axis[i]) + { + rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i); + rev_jump[i] = 1; + } + else + rp -= ptrdiff_t(shp[i])*arr.stride(i); + } + } + ptrdiff_t ofs() const { return p; } + ptrdiff_t rev_ofs() const { return rp; } + size_t remaining() const { return rem; } + }; + +template struct VTYPE {}; +template using vtype_t = typename VTYPE::type; + +#ifndef POCKETFFT_NO_VECTORS +template<> struct VTYPE + { + using type = float __attribute__ ((vector_size (VLEN::val*sizeof(float)))); + }; +template<> struct VTYPE + { + using type = double __attribute__ ((vector_size (VLEN::val*sizeof(double)))); + }; +template<> struct VTYPE + { + using type = long double __attribute__ ((vector_size (VLEN::val*sizeof(long double)))); + }; +#endif + +template arr alloc_tmp(const shape_t &shape, + size_t axsize, size_t elemsize) + { + auto othersize = util::prod(shape)/axsize; + auto tmpsize = axsize*((othersize>=VLEN::val) ? VLEN::val : 1); + return arr(tmpsize*elemsize); + } +template arr alloc_tmp(const shape_t &shape, + const shape_t &axes, size_t elemsize) + { + size_t fullsize=util::prod(shape); + size_t tmpsize=0; + for (size_t i=0; i=VLEN::val) ? VLEN::val : 1); + if (sz>tmpsize) tmpsize=sz; + } + return arr(tmpsize*elemsize); + } + +template void copy_input(const multi_iter &it, + const cndarr> &src, cmplx> *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, vtype_t *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, T *POCKETFFT_RESTRICT dst) + { + if (dst == &src[it.iofs(0)]) return; // in-place + for (size_t i=0; i void copy_output(const multi_iter &it, + const cmplx> *POCKETFFT_RESTRICT src, ndarr> &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + if (src == &dst[it.oofs(0)]) return; // in-place + for (size_t i=0; i struct add_vec { using type = vtype_t; }; +template struct add_vec> + { using type = cmplx>; }; +template using add_vec_t = typename add_vec::type; + +template +POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, + const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec, + const bool allow_inplace=true) + { + std::shared_ptr plan; + + for (size_t iax=0; iaxlength())) + plan = get_plan(len); + + threading::thread_map( + util::thread_count(nthreads, in.shape(), axes[iax], VLEN::val), + [&] { + constexpr auto vlen = VLEN::val; + auto storage = alloc_tmp(in.shape(), len, sizeof(T)); + const auto &tin(iax==0? in : out); + multi_iter it(tin, out, axes[iax]); +#ifndef POCKETFFT_NO_VECTORS + if (vlen>1) + while (it.remaining()>=vlen) + { + it.advance(vlen); + auto tdatav = reinterpret_cast *>(storage.data()); + exec(it, tin, out, tdatav, *plan, fct); + } +#endif + while (it.remaining()>0) + { + it.advance(1); + auto buf = allow_inplace && it.stride_out() == sizeof(T) ? 
+ &out[it.oofs(0)] : reinterpret_cast(storage.data()); + exec(it, tin, out, buf, *plan, fct); + } + }); // end of parallel region + fct = T0(1); // factor has been applied, use 1 for remaining axes + } + } + +struct ExecC2C + { + bool forward; + + template void operator () ( + const multi_iter &it, const cndarr> &in, + ndarr> &out, T * buf, const pocketfft_c &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, forward); + copy_output(it, buf, out); + } + }; + +template void copy_hartley(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t j=0; j void copy_hartley(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + dst[it.oofs(0)] = src[0]; + size_t i=1, i1=1, i2=it.length_out()-1; + for (i=1; i void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, + T * buf, const pocketfft_r &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, true); + copy_hartley(it, buf, out); + } + }; + +struct ExecDcst + { + bool ortho; + int type; + bool cosine; + + template + void operator () (const multi_iter &it, const cndarr &in, + ndarr &out, T * buf, const Tplan &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, ortho, type, cosine); + copy_output(it, buf, out); + } + }; + +template POCKETFFT_NOINLINE void general_r2c( + const cndarr &in, ndarr> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + auto plan = get_plan>(in.shape(axis)); + size_t len=in.shape(axis); + threading::thread_map( + util::thread_count(nthreads, in.shape(), axis, VLEN::val), + [&] { + constexpr auto vlen = VLEN::val; + auto storage = alloc_tmp(in.shape(), len, sizeof(T)); + multi_iter it(in, out, axis); +#ifndef POCKETFFT_NO_VECTORS + if (vlen>1) + while (it.remaining()>=vlen) + { + it.advance(vlen); + auto tdatav = reinterpret_cast *>(storage.data()); + copy_input(it, in, tdatav); + plan->exec(tdatav, fct, true); + for (size_t j=0; j0) + { + it.advance(1); + auto tdata = reinterpret_cast(storage.data()); + copy_input(it, in, tdata); + plan->exec(tdata, fct, true); + out[it.oofs(0)].Set(tdata[0]); + size_t i=1, ii=1; + if (forward) + for (; i POCKETFFT_NOINLINE void general_c2r( + const cndarr> &in, ndarr &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + auto plan = get_plan>(out.shape(axis)); + size_t len=out.shape(axis); + threading::thread_map( + util::thread_count(nthreads, in.shape(), axis, VLEN::val), + [&] { + constexpr auto vlen = VLEN::val; + auto storage = alloc_tmp(out.shape(), len, sizeof(T)); + multi_iter it(in, out, axis); +#ifndef POCKETFFT_NO_VECTORS + if (vlen>1) + while (it.remaining()>=vlen) + { + it.advance(vlen); + auto tdatav = reinterpret_cast *>(storage.data()); + for (size_t j=0; jexec(tdatav, fct, false); + copy_output(it, tdatav, out); + } +#endif + while (it.remaining()>0) + { + it.advance(1); + auto tdata = reinterpret_cast(storage.data()); + tdata[0]=in[it.iofs(0)].r; + { + size_t i=1, ii=1; + if (forward) + for (; iexec(tdata, fct, false); + copy_output(it, tdata, out); + } + }); // end of parallel region + } + +struct ExecR2R + { + bool r2h, forward; + + template void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, T * buf, + const pocketfft_r &plan, T0 fct) const + { + copy_input(it, in, buf); + if ((!r2h) && forward) + for (size_t i=2; i void c2c(const shape_t &shape, const stride_t &stride_in, + const stride_t &stride_out, const shape_t &axes, bool forward, + const std::complex *data_in, std::complex *data_out, 
T fct, + size_t nthreads=1) + { + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr> ain(data_in, shape, stride_in); + ndarr> aout(data_out, shape, stride_out); + general_nd>(ain, aout, axes, fct, nthreads, ExecC2C{forward}); + } + +template void dct(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type"); + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr ain(data_in, shape, stride_in); + ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, true}; + if (type==1) + general_nd>(ain, aout, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(ain, aout, axes, fct, nthreads, exec); + else + general_nd>(ain, aout, axes, fct, nthreads, exec); + } + +template void dst(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type"); + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr ain(data_in, shape, stride_in); + ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, false}; + if (type==1) + general_nd>(ain, aout, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(ain, aout, axes, fct, nthreads, exec); + else + general_nd>(ain, aout, axes, fct, nthreads, exec); + } + +template void r2c(const shape_t &shape_in, + const stride_t &stride_in, const stride_t &stride_out, size_t axis, + bool forward, const T *data_in, std::complex *data_out, T fct, + size_t nthreads=1) + { + if (util::prod(shape_in)==0) return; + util::sanity_check(shape_in, stride_in, stride_out, false, axis); + cndarr ain(data_in, shape_in, stride_in); + shape_t shape_out(shape_in); + shape_out[axis] = shape_in[axis]/2 + 1; + ndarr> aout(data_out, shape_out, stride_out); + general_r2c(ain, aout, axis, forward, fct, nthreads); + } + +template void r2c(const shape_t &shape_in, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + bool forward, const T *data_in, std::complex *data_out, T fct, + size_t nthreads=1) + { + if (util::prod(shape_in)==0) return; + util::sanity_check(shape_in, stride_in, stride_out, false, axes); + r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out, + fct, nthreads); + if (axes.size()==1) return; + + shape_t shape_out(shape_in); + shape_out[axes.back()] = shape_in[axes.back()]/2 + 1; + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out, + T(1), nthreads); + } + +template void c2r(const shape_t &shape_out, + const stride_t &stride_in, const stride_t &stride_out, size_t axis, + bool forward, const std::complex *data_in, T *data_out, T fct, + size_t nthreads=1) + { + if (util::prod(shape_out)==0) return; + util::sanity_check(shape_out, stride_in, stride_out, false, axis); + shape_t shape_in(shape_out); + shape_in[axis] = shape_out[axis]/2 + 1; + cndarr> ain(data_in, shape_in, stride_in); + ndarr aout(data_out, shape_out, stride_out); + general_c2r(ain, aout, axis, forward, fct, nthreads); + } + +template void c2r(const 
shape_t &shape_out, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + bool forward, const std::complex *data_in, T *data_out, T fct, + size_t nthreads=1) + { + if (util::prod(shape_out)==0) return; + if (axes.size()==1) + return c2r(shape_out, stride_in, stride_out, axes[0], forward, + data_in, data_out, fct, nthreads); + util::sanity_check(shape_out, stride_in, stride_out, false, axes); + auto shape_in = shape_out; + shape_in[axes.back()] = shape_out[axes.back()]/2 + 1; + auto nval = util::prod(shape_in); + stride_t stride_inter(shape_in.size()); + stride_inter.back() = sizeof(cmplx); + for (int i=int(shape_in.size())-2; i>=0; --i) + stride_inter[size_t(i)] = + stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]); + arr> tmp(nval); + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(), + T(1), nthreads); + c2r(shape_out, stride_inter, stride_out, axes.back(), forward, + tmp.data(), data_out, fct, nthreads); + } + +template void r2r_fftpack(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct, + size_t nthreads=1) + { + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr ain(data_in, shape, stride_in); + ndarr aout(data_out, shape, stride_out); + general_nd>(ain, aout, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); + } + +template void r2r_separable_hartley(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + const T *data_in, T *data_out, T fct, size_t nthreads=1) + { + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr ain(data_in, shape, stride_in); + ndarr aout(data_out, shape, stride_out); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}, + false); + } + +template void r2r_genuine_hartley(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + const T *data_in, T *data_out, T fct, size_t nthreads=1) + { + if (util::prod(shape)==0) return; + if (axes.size()==1) + return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in, + data_out, fct, nthreads); + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + shape_t tshp(shape); + tshp[axes.back()] = tshp[axes.back()]/2+1; + arr> tdata(util::prod(tshp)); + stride_t tstride(shape.size()); + tstride.back()=sizeof(std::complex); + for (size_t i=tstride.size()-1; i>0; --i) + tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]); + r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads); + cndarr> atmp(tdata.data(), tshp, tstride); + ndarr aout(data_out, shape, stride_out); + simple_iter iin(atmp); + rev_iter iout(aout, axes); + while(iin.remaining()>0) + { + auto v = atmp[iin.ofs()]; + aout[iout.ofs()] = v.r+v.i; + aout[iout.rev_ofs()] = v.r-v.i; + iin.advance(); iout.advance(); + } + } + +} // namespace detail + +using detail::FORWARD; +using detail::BACKWARD; +using detail::shape_t; +using detail::stride_t; +using detail::c2c; +using detail::c2r; +using detail::r2c; +using detail::r2r_fftpack; +using detail::r2r_separable_hartley; +using detail::r2r_genuine_hartley; +using detail::dct; +using detail::dst; + +} // namespace pocketfft + +#undef POCKETFFT_NOINLINE +#undef POCKETFFT_RESTRICT + +#endif // 
POCKETFFT_HDRONLY_H diff --git a/configure.ac b/configure.ac index 89ea77f..f6bc2c7 100644 --- a/configure.ac +++ b/configure.ac @@ -308,7 +308,8 @@ AC_MSG_RESULT(${ok}) dnl check for optimization options dnl This macro sets ANSI mode for certain compilers, and must dnl thus come before the subsequenct checks -AX_CC_MAXOPT +dnl TEMPORARY: disabling this, since I didn't find a way of overriding this from the command line. +dnl AX_CC_MAXOPT AX_GCC_ARCHFLAG(no, [GCC_ARCH=`echo $ax_cv_gcc_archflag | cut -d= -f2`]) AC_SUBST(GCC_ARCH) @@ -405,6 +406,7 @@ benchees/cross/Makefile benchees/cwplib/Makefile benchees/dfftpack/Makefile benchees/dsp/Makefile +benchees/duccfft/Makefile benchees/dxml/Makefile benchees/emayer/Makefile benchees/esrfft/Makefile @@ -444,6 +446,7 @@ benchees/nr/Makefile benchees/numutils/Makefile benchees/ooura/Makefile benchees/pocketfft/Makefile +benchees/pocketfft_cxx/Makefile benchees/qft/Makefile benchees/ransom/Makefile benchees/rmayer/Makefile From 2239d5d734da4fad99ffb54305ec25ed98bf9a9f Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 16 Oct 2021 13:45:07 +0200 Subject: [PATCH 2/5] remove customization --- benchees/Makefile.am | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchees/Makefile.am b/benchees/Makefile.am index 4b473a5..c612450 100644 --- a/benchees/Makefile.am +++ b/benchees/Makefile.am @@ -6,9 +6,6 @@ mpfun77 mpfun90 nag napack newsplit nr numutils ooura pocketfft pocketfft_cxx qf ransom rmayer scimark2c sciport sgimath singleton sorensen spiral-fft \ statlib sunperf temperton teneyck valkenburg vbigdsp vdsp -SUBDIRS = duccfft pocketfft_cxx pocketfft dfftpack - #fftw3 - EXTRA_DIST = Makefile.common distclean-local: From cd96298dcae9458e7e686055014319285fa9b256 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 16 Oct 2021 19:25:49 +0200 Subject: [PATCH 3/5] revert debugging code --- benchees/duccfft/ducc0/infra/aligned_array.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchees/duccfft/ducc0/infra/aligned_array.h b/benchees/duccfft/ducc0/infra/aligned_array.h index c72a984..c12123b 100644 --- a/benchees/duccfft/ducc0/infra/aligned_array.h +++ b/benchees/duccfft/ducc0/infra/aligned_array.h @@ -57,7 +57,7 @@ template class array_base if (num==0) return nullptr; // FIXME: let's not use aligned_alloc on Apple for the moment, // it's only supported from 10.15 on... 
-#if 0//((__cplusplus >= 201703L) && (!defined(__APPLE__))) +#if ((__cplusplus >= 201703L) && (!defined(__APPLE__))) // aligned_alloc requires the allocated size to be a multiple of the // requested alignment, so increase size if necessary void *res = aligned_alloc(alignment,((num*sizeof(T)+alignment-1)/alignment)*alignment); @@ -76,7 +76,7 @@ template class array_base if constexpr(alignment<=alignof(max_align_t)) free(ptr); else -#if 0//((__cplusplus >= 201703L) && (!defined(__APPLE__))) +#if ((__cplusplus >= 201703L) && (!defined(__APPLE__))) free(ptr); #else if (ptr) free((reinterpret_cast(ptr))[-1]); From e6fa94289604ecc0c7728de6662ecc0ccf69ca94 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 16 Oct 2021 20:03:12 +0200 Subject: [PATCH 4/5] implement suggestions --- benchees/duccfft/doit.cc | 41 +++++++++++++++++------------- benchees/duccfft/ducc0/infra/mav.h | 4 ++- benchees/pocketfft_cxx/doit.cc | 16 +++++------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/benchees/duccfft/doit.cc b/benchees/duccfft/doit.cc index 7dc9c22..ceeced9 100644 --- a/benchees/duccfft/doit.cc +++ b/benchees/duccfft/doit.cc @@ -36,21 +36,19 @@ void copy_c2h(struct problem *p, bench_complex *in) copy_c2h_1d_fftpack(p, in, -1.0); } +static fmav_info::shape_t axes; +static cfmav> in_c; +static vfmav> out_c; +static cfmav in_r; +static vfmav out_r; void setup(struct problem *p) { BENCH_ASSERT(can_do(p)); - // populate the transform cache - doit(1,p); -} -void doit(int iter, struct problem *p) -{ - static fmav_info::shape_t shape(p->rank); - static fmav_info::shape_t axes(p->rank); - shape.resize(p->rank); - axes.resize(p->rank); - for (int i=0; irank; ++i) { + fmav_info::shape_t shape(p->rank); + axes.resize(p->rank); + for (int i=0; irank; ++i) { shape[i] = p->n[i]; axes[i] = i; } @@ -58,18 +56,25 @@ void doit(int iter, struct problem *p) if (p->kind == PROBLEM_COMPLEX) { auto in = reinterpret_cast *>(p->in); auto out = reinterpret_cast *>(p->out); - cfmav> min(in, shape); - vfmav> mout(out, shape); - for (int i = 0; i < iter; ++i) { - c2c(min,mout,axes,p->sign==-1,bench_real(1)); - } + in_c.assign(cfmav>(in, shape)); + out_c.assign(vfmav>(out, shape)); } else { auto in = reinterpret_cast(p->in); auto out = reinterpret_cast(p->out); - cfmav min(in, shape); - vfmav mout(out, shape); + in_r.assign(cfmav(in, shape)); + out_r.assign(vfmav(out, shape)); + } +} + +void doit(int iter, struct problem *p) +{ + if (p->kind == PROBLEM_COMPLEX) { + for (int i = 0; i < iter; ++i) { + c2c(in_c,out_c,axes,p->sign==-1,bench_real(1)); + } + } else { for (int i = 0; i < iter; ++i) { - r2r_fftpack(min,mout,axes,p->sign==-1,p->sign==-1,bench_real(1)); + r2r_fftpack(in_r,out_r,axes,p->sign==-1,p->sign==-1,bench_real(1)); } } } diff --git a/benchees/duccfft/ducc0/infra/mav.h b/benchees/duccfft/ducc0/infra/mav.h index 9987048..a12bc25 100644 --- a/benchees/duccfft/ducc0/infra/mav.h +++ b/benchees/duccfft/ducc0/infra/mav.h @@ -410,6 +410,7 @@ template class cfmav: public fmav_info, public cmembuf : tinfo(info), tbuf(d_, buf) {} public: + cfmav() {} cfmav(const T *d_, const shape_t &shp_, const stride_t &str_) : tinfo(shp_, str_), tbuf(d_) {} cfmav(const T *d_, const shape_t &shp_) @@ -459,6 +460,7 @@ template class vfmav: public cfmav public: using tbuf::raw, tbuf::data, tinfo::ndim; + vfmav() {} vfmav(T *d_, const fmav_info &info) : cfmav(d_, info) {} vfmav(T *d_, const shape_t &shp_, const stride_t &str_) @@ -487,7 +489,7 @@ template class vfmav: public cfmav template T &raw(I i) { return data()[i]; } - 
void assign(vfmav &other) + void assign(const vfmav &other) { fmav_info::assign(other); cmembuf::assign(other); diff --git a/benchees/pocketfft_cxx/doit.cc b/benchees/pocketfft_cxx/doit.cc index 3bda34b..f2ac811 100644 --- a/benchees/pocketfft_cxx/doit.cc +++ b/benchees/pocketfft_cxx/doit.cc @@ -19,7 +19,7 @@ BENCH_DOC("url-was-valid-on", "Fri Jul 23 23:06:24 ACST 2020") BENCH_DOC("copyright", "3 clause BSDL") END_BENCH_DOC -int can_do(struct problem *p) +int can_do(struct problem * /*p*/) { return true; } @@ -34,18 +34,13 @@ void copy_c2h(struct problem *p, bench_complex *in) copy_c2h_1d_fftpack(p, in, -1.0); } +static shape_t shape, axes; +static stride_t strides; void setup(struct problem *p) { BENCH_ASSERT(can_do(p)); - // populate the transform cache - doit(1,p); -} - -void doit(int iter, struct problem *p) -{ - static shape_t shape, axes; - static stride_t strides; + shape.resize(p->rank); strides.resize(p->rank); axes.resize(p->rank); @@ -59,7 +54,10 @@ void doit(int iter, struct problem *p) strides[i] = str; str *= shape[i]; } +} +void doit(int iter, struct problem *p) +{ if (p->kind == PROBLEM_COMPLEX) { auto in = reinterpret_cast *>(p->in); auto out = reinterpret_cast *>(p->out); From 1c91a1d3ad96691d1952f74717dbd64406b7ad24 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 16 Oct 2021 20:05:54 +0200 Subject: [PATCH 5/5] cosmetics --- benchees/duccfft/doit.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchees/duccfft/doit.cc b/benchees/duccfft/doit.cc index ceeced9..63daa95 100644 --- a/benchees/duccfft/doit.cc +++ b/benchees/duccfft/doit.cc @@ -21,7 +21,7 @@ BENCH_DOC("url-was-valid-on", "Fri Jul 23 23:06:24 ACST 2020") BENCH_DOC("copyright", "GPLv2+") END_BENCH_DOC -int can_do(struct problem *p) +int can_do(struct problem * /*p*/) { return true; }
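For reference, the ducc0 call pattern that the duccfft adapter converges on after the setup()/doit() split above can be exercised in isolation as follows. This is a minimal sketch, not part of the patch series: it assumes only that the bundled ducc0 headers are on the include path and that ducc0/infra/threading.cc is compiled into the same program; the benchFFT harness (bench-user.h, struct problem, bench_real) is deliberately left out and plain double buffers are used instead.

/* standalone sketch -- hypothetical usage, not part of the patches above */
#include <complex>
#include <vector>
#include "ducc0/fft/fft.h"

int main()
  {
  using namespace ducc0;
  const size_t n = 1024;
  std::vector<std::complex<double>> in(n, {1.0, 0.0}), out(n);

  /* one-dimensional transform over axis 0 */
  fmav_info::shape_t shape{n}, axes{0};

  /* build the const input view and the writable output view once,
     analogous to what setup() does after patch 4 */
  cfmav<std::complex<double>> min(in.data(), shape);
  vfmav<std::complex<double>> mout(out.data(), shape);

  /* forward c2c transform, scaling factor 1; this single call is what
     remains inside the timed loop of doit() */
  c2c(min, mout, axes, true, 1.0);
  return 0;
  }

The same pre-built-view pattern carries over to the real-valued path, where doit() calls r2r_fftpack on cfmav<double>/vfmav<double> views instead.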