1/*
2This file is part of pocketfft.
3
4Copyright (C) 2010-2021 Max-Planck-Society
5Copyright (C) 2019-2020 Peter Bell
6
7For the odd-sized DCT-IV transforms:
8 Copyright (C) 2003, 2007-14 Matteo Frigo
9 Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
10
11Authors: Martin Reinecke, Peter Bell
12
13All rights reserved.
14
15Redistribution and use in source and binary forms, with or without modification,
16are permitted provided that the following conditions are met:
17
18* Redistributions of source code must retain the above copyright notice, this
19 list of conditions and the following disclaimer.
20* Redistributions in binary form must reproduce the above copyright notice, this
21 list of conditions and the following disclaimer in the documentation and/or
22 other materials provided with the distribution.
23* Neither the name of the copyright holder nor the names of its contributors may
24 be used to endorse or promote products derived from this software without
25 specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
28ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
29WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
31ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
32(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
33LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
34ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37*/
38
39#ifndef POCKETFFT_HDRONLY_H
40#define POCKETFFT_HDRONLY_H
41
42#ifndef __cplusplus
43#error This file is C++ and requires a C++ compiler.
44#endif
45
46#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47#error This file requires at least C++11 support.
48#endif
49
50#ifndef POCKETFFT_CACHE_SIZE
51#define POCKETFFT_CACHE_SIZE 0
52#endif
53
54#include <cmath>
55#include <cstdlib>
56#include <stdexcept>
57#include <memory>
58#include <vector>
59#include <complex>
60#include <algorithm>
61#if POCKETFFT_CACHE_SIZE!=0
62#include <array>
63#include <mutex>
64#endif
65
66#ifndef POCKETFFT_NO_MULTITHREADING
67#include <mutex>
68#include <condition_variable>
69#include <thread>
70#include <queue>
71#include <atomic>
72#include <functional>
73#include <new>
74
75#ifdef POCKETFFT_PTHREADS
76# include <pthread.h>
77#endif
78#endif
79
80#if defined(__GNUC__)
81#define POCKETFFT_NOINLINE __attribute__((noinline))
82#define POCKETFFT_RESTRICT __restrict__
83#elif defined(_MSC_VER)
84#define POCKETFFT_NOINLINE __declspec(noinline)
85#define POCKETFFT_RESTRICT __restrict
86#else
87#define POCKETFFT_NOINLINE
88#define POCKETFFT_RESTRICT
89#endif
90
91namespace pocketfft {
92
93namespace detail {
94using std::size_t;
95using std::ptrdiff_t;
96
97// Always use std:: for <cmath> functions
98template <typename T> T cos(T) = delete;
99template <typename T> T sin(T) = delete;
100template <typename T> T sqrt(T) = delete;
101
102using shape_t = std::vector<size_t>;
103using stride_t = std::vector<ptrdiff_t>;
104
105constexpr bool FORWARD = true,
106 BACKWARD = false;
107
108// only enable vector support for gcc>=5.0 and clang>=5.0
109#ifndef POCKETFFT_NO_VECTORS
110#define POCKETFFT_NO_VECTORS
111#if defined(__INTEL_COMPILER)
112// do nothing. This is necessary because this compiler also sets __GNUC__.
113#elif defined(__clang__)
114// AppleClang has their own version numbering
115#ifdef __apple_build_version__
116# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
117# undef POCKETFFT_NO_VECTORS
118# endif
119#elif __clang_major__ >= 5
120# undef POCKETFFT_NO_VECTORS
121#endif
122#elif defined(__GNUC__)
123#if __GNUC__>=5
124#undef POCKETFFT_NO_VECTORS
125#endif
126#endif
127#endif
128
129template<typename T> struct VLEN { static constexpr size_t val=1; };
130
131#ifndef POCKETFFT_NO_VECTORS
132#if (defined(__AVX512F__))
133template<> struct VLEN<float> { static constexpr size_t val=16; };
134template<> struct VLEN<double> { static constexpr size_t val=8; };
135#elif (defined(__AVX__))
136template<> struct VLEN<float> { static constexpr size_t val=8; };
137template<> struct VLEN<double> { static constexpr size_t val=4; };
138#elif (defined(__SSE2__))
139template<> struct VLEN<float> { static constexpr size_t val=4; };
140template<> struct VLEN<double> { static constexpr size_t val=2; };
141#elif (defined(__VSX__))
142template<> struct VLEN<float> { static constexpr size_t val=4; };
143template<> struct VLEN<double> { static constexpr size_t val=2; };
144#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
145template<> struct VLEN<float> { static constexpr size_t val=4; };
146template<> struct VLEN<double> { static constexpr size_t val=2; };
147#else
148#define POCKETFFT_NO_VECTORS
149#endif
150#endif
151
152#if __cplusplus >= 201703L
153inline void *aligned_alloc(size_t align, size_t size)
154 {
155 // aligned_alloc() requires that the requested size is a multiple of "align"
156 void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1)));
157 if (!ptr) throw std::bad_alloc();
158 return ptr;
159 }
160inline void aligned_dealloc(void *ptr)
161 { free(ptr); }
162#else // portable emulation
163inline void *aligned_alloc(size_t align, size_t size)
164 {
165 align = std::max(align, alignof(max_align_t));
166 void *ptr = malloc(size+align);
167 if (!ptr) throw std::bad_alloc();
168 void *res = reinterpret_cast<void *>
169 ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
170 (reinterpret_cast<void**>(res))[-1] = ptr;
171 return res;
172 }
173inline void aligned_dealloc(void *ptr)
174 { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
175#endif
176
177template<typename T> class arr
178 {
179 private:
180 T *p;
181 size_t sz;
182
183#if defined(POCKETFFT_NO_VECTORS)
184 static T *ralloc(size_t num)
185 {
186 if (num==0) return nullptr;
187 void *res = malloc(num*sizeof(T));
188 if (!res) throw std::bad_alloc();
189 return reinterpret_cast<T *>(res);
190 }
191 static void dealloc(T *ptr)
192 { free(ptr); }
193#else
194 static T *ralloc(size_t num)
195 {
196 if (num==0) return nullptr;
197 void *ptr = aligned_alloc(64, num*sizeof(T));
198 return static_cast<T*>(ptr);
199 }
200 static void dealloc(T *ptr)
201 { aligned_dealloc(ptr); }
202#endif
203
204 public:
205 arr() : p(0), sz(0) {}
206 arr(size_t n) : p(ralloc(n)), sz(n) {}
207 arr(arr &&other)
208 : p(other.p), sz(other.sz)
209 { other.p=nullptr; other.sz=0; }
210 ~arr() { dealloc(p); }
211
212 void resize(size_t n)
213 {
214 if (n==sz) return;
215 dealloc(p);
216 p = ralloc(n);
217 sz = n;
218 }
219
220 T &operator[](size_t idx) { return p[idx]; }
221 const T &operator[](size_t idx) const { return p[idx]; }
222
223 T *data() { return p; }
224 const T *data() const { return p; }
225
226 size_t size() const { return sz; }
227 };
228
229template<typename T> struct cmplx {
230 T r, i;
231 cmplx() {}
232 cmplx(T r_, T i_) : r(r_), i(i_) {}
233 void Set(T r_, T i_) { r=r_; i=i_; }
234 void Set(T r_) { r=r_; i=T(0); }
235 cmplx &operator+= (const cmplx &other)
236 { r+=other.r; i+=other.i; return *this; }
237 template<typename T2>cmplx &operator*= (T2 other)
238 { r*=other; i*=other; return *this; }
239 template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
240 {
241 T tmp = r*other.r - i*other.i;
242 i = r*other.i + i*other.r;
243 r = tmp;
244 return *this;
245 }
246 template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
247 { r+=other.r; i+=other.i; return *this; }
248 template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
249 { r-=other.r; i-=other.i; return *this; }
250 template<typename T2> auto operator* (const T2 &other) const
251 -> cmplx<decltype(r*other)>
252 { return {r*other, i*other}; }
253 template<typename T2> auto operator+ (const cmplx<T2> &other) const
254 -> cmplx<decltype(r+other.r)>
255 { return {r+other.r, i+other.i}; }
256 template<typename T2> auto operator- (const cmplx<T2> &other) const
257 -> cmplx<decltype(r+other.r)>
258 { return {r-other.r, i-other.i}; }
259 template<typename T2> auto operator* (const cmplx<T2> &other) const
260 -> cmplx<decltype(r+other.r)>
261 { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
262 template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
263 -> cmplx<decltype(r+other.r)>
264 {
265 using Tres = cmplx<decltype(r+other.r)>;
266 return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
267 : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
268 }
269};
270template<typename T> inline void PM(T &a, T &b, T c, T d)
271 { a=c+d; b=c-d; }
272template<typename T> inline void PMINPLACE(T &a, T &b)
273 { T t = a; a+=b; b=t-b; }
274template<typename T> inline void MPINPLACE(T &a, T &b)
275 { T t = a; a-=b; b=t+b; }
276template<typename T> cmplx<T> conj(const cmplx<T> &a)
277 { return {a.r, -a.i}; }
278template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
279 {
280 res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
281 : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
282 }
283
284template<typename T> void ROT90(cmplx<T> &a)
285 { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
286template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
287 { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
288
289//
290// twiddle factor section
291//
292template<typename T> class sincos_2pibyn
293 {
294 private:
295 using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
296 size_t N, mask, shift;
297 arr<cmplx<Thigh>> v1, v2;
298
299 static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
300 {
301 x<<=3;
302 if (x<4*n) // first half
303 {
304 if (x<2*n) // first quadrant
305 {
306 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
307 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
308 }
309 else // second quadrant
310 {
311 x-=2*n;
312 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
313 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
314 }
315 }
316 else
317 {
318 x=8*n-x;
319 if (x<2*n) // third quadrant
320 {
321 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
322 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
323 }
324 else // fourth quadrant
325 {
326 x-=2*n;
327 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
328 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
329 }
330 }
331 }
332
333 public:
334 POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
335 : N(n)
336 {
337 constexpr auto pi = 3.141592653589793238462643383279502884197L;
338 Thigh ang = Thigh(0.25L*pi/n);
339 size_t nval = (n+2)/2;
340 shift = 1;
341 while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
342 mask = (size_t(1)<<shift)-1;
343 v1.resize(mask+1);
344 v1[0].Set(Thigh(1), Thigh(0));
345 for (size_t i=1; i<v1.size(); ++i)
346 v1[i]=calc(i,n,ang);
347 v2.resize((nval+mask)/(mask+1));
348 v2[0].Set(Thigh(1), Thigh(0));
349 for (size_t i=1; i<v2.size(); ++i)
350 v2[i]=calc(i*(mask+1),n,ang);
351 }
352
353 cmplx<T> operator[](size_t idx) const
354 {
355 if (2*idx<=N)
356 {
357 auto x1=v1[idx&mask], x2=v2[idx>>shift];
358 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
359 }
360 idx = N-idx;
361 auto x1=v1[idx&mask], x2=v2[idx>>shift];
362 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
363 }
364 };
365
366struct util // hack to avoid duplicate symbols
367 {
368 static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
369 {
370 size_t res=1;
371 while ((n&1)==0)
372 { res=2; n>>=1; }
373 for (size_t x=3; x*x<=n; x+=2)
374 while ((n%x)==0)
375 { res=x; n/=x; }
376 if (n>1) res=n;
377 return res;
378 }
379
380 static POCKETFFT_NOINLINE double cost_guess (size_t n)
381 {
382 constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
383 size_t ni=n;
384 double result=0.;
385 while ((n&1)==0)
386 { result+=2; n>>=1; }
387 for (size_t x=3; x*x<=n; x+=2)
388 while ((n%x)==0)
389 {
390 result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
391 n/=x;
392 }
393 if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
394 return result*double(ni);
395 }
396
397 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
398 static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
399 {
400 if (n<=12) return n;
401
402 size_t bestfac=2*n;
403 for (size_t f11=1; f11<bestfac; f11*=11)
404 for (size_t f117=f11; f117<bestfac; f117*=7)
405 for (size_t f1175=f117; f1175<bestfac; f1175*=5)
406 {
407 size_t x=f1175;
408 while (x<n) x*=2;
409 for (;;)
410 {
411 if (x<n)
412 x*=3;
413 else if (x>n)
414 {
415 if (x<bestfac) bestfac=x;
416 if (x&1) break;
417 x>>=1;
418 }
419 else
420 return n;
421 }
422 }
423 return bestfac;
424 }
425
426 /* returns the smallest composite of 2, 3, 5 which is >= n */
427 static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
428 {
429 if (n<=6) return n;
430
431 size_t bestfac=2*n;
432 for (size_t f5=1; f5<bestfac; f5*=5)
433 {
434 size_t x = f5;
435 while (x<n) x *= 2;
436 for (;;)
437 {
438 if (x<n)
439 x*=3;
440 else if (x>n)
441 {
442 if (x<bestfac) bestfac=x;
443 if (x&1) break;
444 x>>=1;
445 }
446 else
447 return n;
448 }
449 }
450 return bestfac;
451 }
452
453 static size_t prod(const shape_t &shape)
454 {
455 size_t res=1;
456 for (auto sz: shape)
457 res*=sz;
458 return res;
459 }
460
461 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
462 const stride_t &stride_in, const stride_t &stride_out, bool inplace)
463 {
464 auto ndim = shape.size();
465 if (ndim<1) throw std::runtime_error("ndim must be >= 1");
466 if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
467 throw std::runtime_error("stride dimension mismatch");
468 if (inplace && (stride_in!=stride_out))
469 throw std::runtime_error("stride mismatch");
470 }
471
472 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
473 const stride_t &stride_in, const stride_t &stride_out, bool inplace,
474 const shape_t &axes)
475 {
476 sanity_check(shape, stride_in, stride_out, inplace);
477 auto ndim = shape.size();
478 shape_t tmp(ndim,0);
479 for (auto ax : axes)
480 {
481 if (ax>=ndim) throw std::invalid_argument("bad axis number");
482 if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
483 }
484 }
485
486 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
487 const stride_t &stride_in, const stride_t &stride_out, bool inplace,
488 size_t axis)
489 {
490 sanity_check(shape, stride_in, stride_out, inplace);
491 if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
492 }
493
494#ifdef POCKETFFT_NO_MULTITHREADING
495 static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
496 size_t /*axis*/, size_t /*vlen*/)
497 { return 1; }
498#else
499 static size_t thread_count (size_t nthreads, const shape_t &shape,
500 size_t axis, size_t vlen)
501 {
502 if (nthreads==1) return 1;
503 size_t size = prod(shape);
504 size_t parallel = size / (shape[axis] * vlen);
505 if (shape[axis] < 1000)
506 parallel /= 4;
507 size_t max_threads = nthreads == 0 ?
508 std::thread::hardware_concurrency() : nthreads;
509 return std::max(size_t(1), std::min(parallel, max_threads));
510 }
511#endif
512 };
513
514namespace threading {
515
516#ifdef POCKETFFT_NO_MULTITHREADING
517
518constexpr inline size_t thread_id() { return 0; }
519constexpr inline size_t num_threads() { return 1; }
520
521template <typename Func>
522void thread_map(size_t /* nthreads */, Func f)
523 { f(); }
524
525#else
526
527inline size_t &thread_id()
528 {
529 static thread_local size_t thread_id_=0;
530 return thread_id_;
531 }
532inline size_t &num_threads()
533 {
534 static thread_local size_t num_threads_=1;
535 return num_threads_;
536 }
537static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
538
539class latch
540 {
541 std::atomic<size_t> num_left_;
542 std::mutex mut_;
543 std::condition_variable completed_;
544 using lock_t = std::unique_lock<std::mutex>;
545
546 public:
547 latch(size_t n): num_left_(n) {}
548
549 void count_down()
550 {
551 lock_t lock(mut_);
552 if (--num_left_)
553 return;
554 completed_.notify_all();
555 }
556
557 void wait()
558 {
559 lock_t lock(mut_);
560 completed_.wait(lock, [this]{ return is_ready(); });
561 }
562 bool is_ready() { return num_left_ == 0; }
563 };
564
565template <typename T> class concurrent_queue
566 {
567 std::queue<T> q_;
568 std::mutex mut_;
569 std::atomic<size_t> size_;
570 using lock_t = std::lock_guard<std::mutex>;
571
572 public:
573
574 void push(T val)
575 {
576 lock_t lock(mut_);
577 ++size_;
578 q_.push(std::move(val));
579 }
580
581 bool try_pop(T &val)
582 {
583 if (size_ == 0) return false;
584 lock_t lock(mut_);
585 // Queue might have been emptied while we acquired the lock
586 if (q_.empty()) return false;
587
588 val = std::move(q_.front());
589 --size_;
590 q_.pop();
591 return true;
592 }
593
594 bool empty() const { return size_==0; }
595 };
596
597// C++ allocator with support for over-aligned types
598template <typename T> struct aligned_allocator
599 {
600 using value_type = T;
601 template <class U>
602 aligned_allocator(const aligned_allocator<U>&) {}
603 aligned_allocator() = default;
604
605 T *allocate(size_t n)
606 {
607 void* mem = aligned_alloc(alignof(T), n*sizeof(T));
608 return static_cast<T*>(mem);
609 }
610
611 void deallocate(T *p, size_t /*n*/)
612 { aligned_dealloc(p); }
613 };
614
615class thread_pool
616 {
617 // A reasonable guess, probably close enough for most hardware
618 static constexpr size_t cache_line_size = 64;
619 struct alignas(cache_line_size) worker
620 {
621 std::thread thread;
622 std::condition_variable work_ready;
623 std::mutex mut;
624 std::atomic_flag busy_flag = ATOMIC_FLAG_INIT;
625 std::function<void()> work;
626
627 void worker_main(
628 std::atomic<bool> &shutdown_flag,
629 std::atomic<size_t> &unscheduled_tasks,
630 concurrent_queue<std::function<void()>> &overflow_work)
631 {
632 using lock_t = std::unique_lock<std::mutex>;
633 bool expect_work = true;
634 while (!shutdown_flag || expect_work)
635 {
636 std::function<void()> local_work;
637 if (expect_work || unscheduled_tasks == 0)
638 {
639 lock_t lock(mut);
640 // Wait until there is work to be executed
641 work_ready.wait(lock, [&]{ return (work || shutdown_flag); });
642 local_work.swap(work);
643 expect_work = false;
644 }
645
646 bool marked_busy = false;
647 if (local_work)
648 {
649 marked_busy = true;
650 local_work();
651 }
652
653 if (!overflow_work.empty())
654 {
655 if (!marked_busy && busy_flag.test_and_set())
656 {
657 expect_work = true;
658 continue;
659 }
660 marked_busy = true;
661
662 while (overflow_work.try_pop(local_work))
663 {
664 --unscheduled_tasks;
665 local_work();
666 }
667 }
668
669 if (marked_busy) busy_flag.clear();
670 }
671 }
672 };
673
674 concurrent_queue<std::function<void()>> overflow_work_;
675 std::mutex mut_;
676 std::vector<worker, aligned_allocator<worker>> workers_;
677 std::atomic<bool> shutdown_;
678 std::atomic<size_t> unscheduled_tasks_;
679 using lock_t = std::lock_guard<std::mutex>;
680
681 void create_threads()
682 {
683 lock_t lock(mut_);
684 size_t nthreads=workers_.size();
685 for (size_t i=0; i<nthreads; ++i)
686 {
687 try
688 {
689 auto *worker = &workers_[i];
690 worker->busy_flag.clear();
691 worker->work = nullptr;
692 worker->thread = std::thread([worker, this]
693 {
694 worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_);
695 });
696 }
697 catch (...)
698 {
699 shutdown_locked();
700 throw;
701 }
702 }
703 }
704
705 void shutdown_locked()
706 {
707 shutdown_ = true;
708 for (auto &worker : workers_)
709 worker.work_ready.notify_all();
710
711 for (auto &worker : workers_)
712 if (worker.thread.joinable())
713 worker.thread.join();
714 }
715
716 public:
717 explicit thread_pool(size_t nthreads):
718 workers_(nthreads)
719 { create_threads(); }
720
721 thread_pool(): thread_pool(max_threads) {}
722
723 ~thread_pool() { shutdown(); }
724
725 void submit(std::function<void()> work)
726 {
727 lock_t lock(mut_);
728 if (shutdown_)
729 throw std::runtime_error("Work item submitted after shutdown");
730
731 ++unscheduled_tasks_;
732
733 // First check for any idle workers and wake those
734 for (auto &worker : workers_)
735 if (!worker.busy_flag.test_and_set())
736 {
737 --unscheduled_tasks_;
738 {
739 lock_t lock(worker.mut);
740 worker.work = std::move(work);
741 }
742 worker.work_ready.notify_one();
743 return;
744 }
745
746 // If no workers were idle, push onto the overflow queue for later
747 overflow_work_.push(std::move(work));
748 }
749
750 void shutdown()
751 {
752 lock_t lock(mut_);
753 shutdown_locked();
754 }
755
756 void restart()
757 {
758 shutdown_ = false;
759 create_threads();
760 }
761 };
762
763inline thread_pool & get_pool()
764 {
765 static thread_pool pool;
766#ifdef POCKETFFT_PTHREADS
767 static std::once_flag f;
768 std::call_once(f,
769 []{
770 pthread_atfork(
771 +[]{ get_pool().shutdown(); }, // prepare
772 +[]{ get_pool().restart(); }, // parent
773 +[]{ get_pool().restart(); } // child
774 );
775 });
776#endif
777
778 return pool;
779 }
780
781/** Map a function f over nthreads */
782template <typename Func>
783void thread_map(size_t nthreads, Func f)
784 {
785 if (nthreads == 0)
786 nthreads = max_threads;
787
788 if (nthreads == 1)
789 { f(); return; }
790
791 auto & pool = get_pool();
792 latch counter(nthreads);
793 std::exception_ptr ex;
794 std::mutex ex_mut;
795 for (size_t i=0; i<nthreads; ++i)
796 {
797 pool.submit(
798 [&f, &counter, &ex, &ex_mut, i, nthreads] {
799 thread_id() = i;
800 num_threads() = nthreads;
801 try { f(); }
802 catch (...)
803 {
804 std::lock_guard<std::mutex> lock(ex_mut);
805 ex = std::current_exception();
806 }
807 counter.count_down();
808 });
809 }
810 counter.wait();
811 if (ex)
812 std::rethrow_exception(ex);
813 }
814
815#endif
816
817}
818
819//
820// complex FFTPACK transforms
821//
822
823template<typename T0> class cfftp
824 {
825 private:
826 struct fctdata
827 {
828 size_t fct;
829 cmplx<T0> *tw, *tws;
830 };
831
832 size_t length;
833 arr<cmplx<T0>> mem;
834 std::vector<fctdata> fact;
835
836 void add_factor(size_t factor)
837 { fact.push_back({factor, nullptr, nullptr}); }
838
839template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
840 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
841 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
842 {
843 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
844 { return ch[a+ido*(b+l1*c)]; };
845 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
846 { return cc[a+ido*(b+2*c)]; };
847 auto WA = [wa, ido](size_t x, size_t i)
848 { return wa[i-1+x*(ido-1)]; };
849
850 if (ido==1)
851 for (size_t k=0; k<l1; ++k)
852 {
853 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
854 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
855 }
856 else
857 for (size_t k=0; k<l1; ++k)
858 {
859 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
860 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
861 for (size_t i=1; i<ido; ++i)
862 {
863 CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
864 special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
865 }
866 }
867 }
868
869#define POCKETFFT_PREP3(idx) \
870 T t0 = CC(idx,0,k), t1, t2; \
871 PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
872 CH(idx,k,0)=t0+t1;
873#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
874 { \
875 T ca=t0+t1*twr; \
876 T cb{-t2.i*twi, t2.r*twi}; \
877 PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
878 }
879#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
880 { \
881 T ca=t0+t1*twr; \
882 T cb{-t2.i*twi, t2.r*twi}; \
883 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
884 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
885 }
886template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
887 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
888 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
889 {
890 constexpr T0 tw1r=-0.5,
891 tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
892
893 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
894 { return ch[a+ido*(b+l1*c)]; };
895 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
896 { return cc[a+ido*(b+3*c)]; };
897 auto WA = [wa, ido](size_t x, size_t i)
898 { return wa[i-1+x*(ido-1)]; };
899
900 if (ido==1)
901 for (size_t k=0; k<l1; ++k)
902 {
903 POCKETFFT_PREP3(0)
904 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
905 }
906 else
907 for (size_t k=0; k<l1; ++k)
908 {
909 {
910 POCKETFFT_PREP3(0)
911 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
912 }
913 for (size_t i=1; i<ido; ++i)
914 {
915 POCKETFFT_PREP3(i)
916 POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
917 }
918 }
919 }
920
921#undef POCKETFFT_PARTSTEP3b
922#undef POCKETFFT_PARTSTEP3a
923#undef POCKETFFT_PREP3
924
925template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
926 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
927 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
928 {
929 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
930 { return ch[a+ido*(b+l1*c)]; };
931 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
932 { return cc[a+ido*(b+4*c)]; };
933 auto WA = [wa, ido](size_t x, size_t i)
934 { return wa[i-1+x*(ido-1)]; };
935
936 if (ido==1)
937 for (size_t k=0; k<l1; ++k)
938 {
939 T t1, t2, t3, t4;
940 PM(t2,t1,CC(0,0,k),CC(0,2,k));
941 PM(t3,t4,CC(0,1,k),CC(0,3,k));
942 ROTX90<fwd>(t4);
943 PM(CH(0,k,0),CH(0,k,2),t2,t3);
944 PM(CH(0,k,1),CH(0,k,3),t1,t4);
945 }
946 else
947 for (size_t k=0; k<l1; ++k)
948 {
949 {
950 T t1, t2, t3, t4;
951 PM(t2,t1,CC(0,0,k),CC(0,2,k));
952 PM(t3,t4,CC(0,1,k),CC(0,3,k));
953 ROTX90<fwd>(t4);
954 PM(CH(0,k,0),CH(0,k,2),t2,t3);
955 PM(CH(0,k,1),CH(0,k,3),t1,t4);
956 }
957 for (size_t i=1; i<ido; ++i)
958 {
959 T t1, t2, t3, t4;
960 T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
961 PM(t2,t1,cc0,cc2);
962 PM(t3,t4,cc1,cc3);
963 ROTX90<fwd>(t4);
964 CH(i,k,0) = t2+t3;
965 special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
966 special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
967 special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
968 }
969 }
970 }
971
972#define POCKETFFT_PREP5(idx) \
973 T t0 = CC(idx,0,k), t1, t2, t3, t4; \
974 PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
975 PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
976 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
977 CH(idx,k,0).i=t0.i+t1.i+t2.i;
978
979#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
980 { \
981 T ca,cb; \
982 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
983 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
984 cb.i=twai*t4.r twbi*t3.r; \
985 cb.r=-(twai*t4.i twbi*t3.i); \
986 PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
987 }
988
989#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
990 { \
991 T ca,cb,da,db; \
992 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
993 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
994 cb.i=twai*t4.r twbi*t3.r; \
995 cb.r=-(twai*t4.i twbi*t3.i); \
996 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
997 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
998 }
999template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
1000 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1001 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1002 {
1003 constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
1004 tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
1005 tw2r= T0(-0.8090169943749474241022934171828191L),
1006 tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
1007
1008 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1009 { return ch[a+ido*(b+l1*c)]; };
1010 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1011 { return cc[a+ido*(b+5*c)]; };
1012 auto WA = [wa, ido](size_t x, size_t i)
1013 { return wa[i-1+x*(ido-1)]; };
1014
1015 if (ido==1)
1016 for (size_t k=0; k<l1; ++k)
1017 {
1018 POCKETFFT_PREP5(0)
1019 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1020 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1021 }
1022 else
1023 for (size_t k=0; k<l1; ++k)
1024 {
1025 {
1026 POCKETFFT_PREP5(0)
1027 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1028 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1029 }
1030 for (size_t i=1; i<ido; ++i)
1031 {
1032 POCKETFFT_PREP5(i)
1033 POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
1034 POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
1035 }
1036 }
1037 }
1038
1039#undef POCKETFFT_PARTSTEP5b
1040#undef POCKETFFT_PARTSTEP5a
1041#undef POCKETFFT_PREP5
1042
1043#define POCKETFFT_PREP7(idx) \
1044 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
1045 PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
1046 PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
1047 PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
1048 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
1049 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
1050
1051#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
1052 { \
1053 T ca,cb; \
1054 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
1055 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
1056 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
1057 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
1058 PM(out1,out2,ca,cb); \
1059 }
1060#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
1061 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
1062#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
1063 { \
1064 T da,db; \
1065 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
1066 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1067 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1068 }
1069
1070template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
1071 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1072 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1073 {
1074 constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
1075 tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
1076 tw2r= T0(-0.2225209339563144042889025644967948L),
1077 tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
1078 tw3r= T0(-0.9009688679024191262361023195074451L),
1079 tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
1080
1081 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1082 { return ch[a+ido*(b+l1*c)]; };
1083 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1084 { return cc[a+ido*(b+7*c)]; };
1085 auto WA = [wa, ido](size_t x, size_t i)
1086 { return wa[i-1+x*(ido-1)]; };
1087
1088 if (ido==1)
1089 for (size_t k=0; k<l1; ++k)
1090 {
1091 POCKETFFT_PREP7(0)
1092 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1093 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1094 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1095 }
1096 else
1097 for (size_t k=0; k<l1; ++k)
1098 {
1099 {
1100 POCKETFFT_PREP7(0)
1101 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1102 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1103 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1104 }
1105 for (size_t i=1; i<ido; ++i)
1106 {
1107 POCKETFFT_PREP7(i)
1108 POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1109 POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1110 POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1111 }
1112 }
1113 }
1114
1115#undef POCKETFFT_PARTSTEP7
1116#undef POCKETFFT_PARTSTEP7a0
1117#undef POCKETFFT_PARTSTEP7a
1118#undef POCKETFFT_PREP7
1119
1120template <bool fwd, typename T> void ROTX45(T &a) const
1121 {
1122 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1123 if (fwd)
1124 { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
1125 else
1126 { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
1127 }
1128template <bool fwd, typename T> void ROTX135(T &a) const
1129 {
1130 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1131 if (fwd)
1132 { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
1133 else
1134 { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
1135 }
1136
1137template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1138 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1139 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1140 {
1141 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1142 { return ch[a+ido*(b+l1*c)]; };
1143 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1144 { return cc[a+ido*(b+8*c)]; };
1145 auto WA = [wa, ido](size_t x, size_t i)
1146 { return wa[i-1+x*(ido-1)]; };
1147
1148 if (ido==1)
1149 for (size_t k=0; k<l1; ++k)
1150 {
1151 T a0, a1, a2, a3, a4, a5, a6, a7;
1152 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1153 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1154 PMINPLACE(a1,a3);
1155 ROTX90<fwd>(a3);
1156
1157 ROTX90<fwd>(a7);
1158 PMINPLACE(a5,a7);
1159 ROTX45<fwd>(a5);
1160 ROTX135<fwd>(a7);
1161
1162 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1163 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1164 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1165 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1166 ROTX90<fwd>(a6);
1167 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1168 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1169 }
1170 else
1171 for (size_t k=0; k<l1; ++k)
1172 {
1173 {
1174 T a0, a1, a2, a3, a4, a5, a6, a7;
1175 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1176 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1177 PMINPLACE(a1,a3);
1178 ROTX90<fwd>(a3);
1179
1180 ROTX90<fwd>(a7);
1181 PMINPLACE(a5,a7);
1182 ROTX45<fwd>(a5);
1183 ROTX135<fwd>(a7);
1184
1185 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1186 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1187 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1188 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1189 ROTX90<fwd>(a6);
1190 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1191 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1192 }
1193 for (size_t i=1; i<ido; ++i)
1194 {
1195 T a0, a1, a2, a3, a4, a5, a6, a7;
1196 PM(a1,a5,CC(i,1,k),CC(i,5,k));
1197 PM(a3,a7,CC(i,3,k),CC(i,7,k));
1198 ROTX90<fwd>(a7);
1199 PMINPLACE(a1,a3);
1200 ROTX90<fwd>(a3);
1201 PMINPLACE(a5,a7);
1202 ROTX45<fwd>(a5);
1203 ROTX135<fwd>(a7);
1204 PM(a0,a4,CC(i,0,k),CC(i,4,k));
1205 PM(a2,a6,CC(i,2,k),CC(i,6,k));
1206 PMINPLACE(a0,a2);
1207 CH(i,k,0) = a0+a1;
1208 special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
1209 special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
1210 special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
1211 ROTX90<fwd>(a6);
1212 PMINPLACE(a4,a6);
1213 special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
1214 special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
1215 special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
1216 special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
1217 }
1218 }
1219 }
1220
1221
1222#define POCKETFFT_PREP11(idx) \
1223 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
1224 PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
1225 PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
1226 PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
1227 PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
1228 PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
1229 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
1230 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
1231
1232#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
1233 { \
1234 T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
1235 cb; \
1236 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
1237 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
1238 PM(out1,out2,ca,cb); \
1239 }
1240#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1241 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
1242#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1243 { \
1244 T da,db; \
1245 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
1246 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1247 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1248 }
1249
1250template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
1251 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1252 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1253 {
1254 constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
1255 tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
1256 tw2r= T0(0.4154150130018864255292741492296232L),
1257 tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
1258 tw3r= T0(-0.1423148382732851404437926686163697L),
1259 tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
1260 tw4r= T0(-0.6548607339452850640569250724662936L),
1261 tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
1262 tw5r= T0(-0.9594929736144973898903680570663277L),
1263 tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
1264
1265 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1266 { return ch[a+ido*(b+l1*c)]; };
1267 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1268 { return cc[a+ido*(b+11*c)]; };
1269 auto WA = [wa, ido](size_t x, size_t i)
1270 { return wa[i-1+x*(ido-1)]; };
1271
1272 if (ido==1)
1273 for (size_t k=0; k<l1; ++k)
1274 {
1275 POCKETFFT_PREP11(0)
1276 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1277 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1278 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1279 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1280 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1281 }
1282 else
1283 for (size_t k=0; k<l1; ++k)
1284 {
1285 {
1286 POCKETFFT_PREP11(0)
1287 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1288 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1289 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1290 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1291 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1292 }
1293 for (size_t i=1; i<ido; ++i)
1294 {
1295 POCKETFFT_PREP11(i)
1296 POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1297 POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1298 POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1299 POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1300 POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1301 }
1302 }
1303 }
1304
1305#undef POCKETFFT_PARTSTEP11
1306#undef POCKETFFT_PARTSTEP11a0
1307#undef POCKETFFT_PARTSTEP11a
1308#undef POCKETFFT_PREP11
1309
1310template<bool fwd, typename T> void passg (size_t ido, size_t ip,
1311 size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1312 const cmplx<T0> * POCKETFFT_RESTRICT wa,
1313 const cmplx<T0> * POCKETFFT_RESTRICT csarr) const
1314 {
1315 const size_t cdim=ip;
1316 size_t ipph = (ip+1)/2;
1317 size_t idl1 = ido*l1;
1318
1319 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1320 { return ch[a+ido*(b+l1*c)]; };
1321 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
1322 { return cc[a+ido*(b+cdim*c)]; };
1323 auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T&
1324 { return cc[a+ido*(b+l1*c)]; };
1325 auto CX2 = [cc, idl1](size_t a, size_t b) -> T&
1326 { return cc[a+idl1*b]; };
1327 auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
1328 { return ch[a+idl1*b]; };
1329
1330 arr<cmplx<T0>> wal(ip);
1331 wal[0] = cmplx<T0>(1., 0.);
1332 for (size_t i=1; i<ip; ++i)
1333 wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
1334
1335 for (size_t k=0; k<l1; ++k)
1336 for (size_t i=0; i<ido; ++i)
1337 CH(i,k,0) = CC(i,0,k);
1338 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1339 for (size_t k=0; k<l1; ++k)
1340 for (size_t i=0; i<ido; ++i)
1341 PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
1342 for (size_t k=0; k<l1; ++k)
1343 for (size_t i=0; i<ido; ++i)
1344 {
1345 T tmp = CH(i,k,0);
1346 for (size_t j=1; j<ipph; ++j)
1347 tmp+=CH(i,k,j);
1348 CX(i,k,0) = tmp;
1349 }
1350 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
1351 {
1352 // j=0
1353 for (size_t ik=0; ik<idl1; ++ik)
1354 {
1355 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
1356 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
1357 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
1358 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
1359 }
1360
1361 size_t iwal=2*l;
1362 size_t j=3, jc=ip-3;
1363 for (; j<ipph-1; j+=2, jc-=2)
1364 {
1365 iwal+=l; if (iwal>ip) iwal-=ip;
1366 cmplx<T0> xwal=wal[iwal];
1367 iwal+=l; if (iwal>ip) iwal-=ip;
1368 cmplx<T0> xwal2=wal[iwal];
1369 for (size_t ik=0; ik<idl1; ++ik)
1370 {
1371 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
1372 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
1373 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
1374 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
1375 }
1376 }
1377 for (; j<ipph; ++j, --jc)
1378 {
1379 iwal+=l; if (iwal>ip) iwal-=ip;
1380 cmplx<T0> xwal=wal[iwal];
1381 for (size_t ik=0; ik<idl1; ++ik)
1382 {
1383 CX2(ik,l).r += CH2(ik,j).r*xwal.r;
1384 CX2(ik,l).i += CH2(ik,j).i*xwal.r;
1385 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
1386 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
1387 }
1388 }
1389 }
1390
1391 // shuffling and twiddling
1392 if (ido==1)
1393 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1394 for (size_t ik=0; ik<idl1; ++ik)
1395 {
1396 T t1=CX2(ik,j), t2=CX2(ik,jc);
1397 PM(CX2(ik,j),CX2(ik,jc),t1,t2);
1398 }
1399 else
1400 {
1401 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1402 for (size_t k=0; k<l1; ++k)
1403 {
1404 T t1=CX(0,k,j), t2=CX(0,k,jc);
1405 PM(CX(0,k,j),CX(0,k,jc),t1,t2);
1406 for (size_t i=1; i<ido; ++i)
1407 {
1408 T x1, x2;
1409 PM(x1,x2,CX(i,k,j),CX(i,k,jc));
1410 size_t idij=(j-1)*(ido-1)+i-1;
1411 special_mul<fwd>(x1,wa[idij],CX(i,k,j));
1412 idij=(jc-1)*(ido-1)+i-1;
1413 special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
1414 }
1415 }
1416 }
1417 }
1418
1419template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
1420 {
1421 if (length==1) { c[0]*=fct; return; }
1422 size_t l1=1;
1423 arr<T> ch(length);
1424 T *p1=c, *p2=ch.data();
1425
1426 for(size_t k1=0; k1<fact.size(); k1++)
1427 {
1428 size_t ip=fact[k1].fct;
1429 size_t l2=ip*l1;
1430 size_t ido = length/l2;
1431 if (ip==4)
1432 pass4<fwd> (ido, l1, p1, p2, fact[k1].tw);
1433 else if(ip==8)
1434 pass8<fwd>(ido, l1, p1, p2, fact[k1].tw);
1435 else if(ip==2)
1436 pass2<fwd>(ido, l1, p1, p2, fact[k1].tw);
1437 else if(ip==3)
1438 pass3<fwd> (ido, l1, p1, p2, fact[k1].tw);
1439 else if(ip==5)
1440 pass5<fwd> (ido, l1, p1, p2, fact[k1].tw);
1441 else if(ip==7)
1442 pass7<fwd> (ido, l1, p1, p2, fact[k1].tw);
1443 else if(ip==11)
1444 pass11<fwd> (ido, l1, p1, p2, fact[k1].tw);
1445 else
1446 {
1447 passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
1448 std::swap(p1,p2);
1449 }
1450 std::swap(p1,p2);
1451 l1=l2;
1452 }
1453 if (p1!=c)
1454 {
1455 if (fct!=1.)
1456 for (size_t i=0; i<length; ++i)
1457 c[i] = ch[i]*fct;
1458 else
1459 std::copy_n (p1, length, c);
1460 }
1461 else
1462 if (fct!=1.)
1463 for (size_t i=0; i<length; ++i)
1464 c[i] *= fct;
1465 }
1466
1467 public:
1468 template<typename T> void exec(T c[], T0 fct, bool fwd) const
1469 { fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); }
1470
1471 private:
1472 POCKETFFT_NOINLINE void factorize()
1473 {
1474 size_t len=length;
1475 while ((len&7)==0)
1476 { add_factor(8); len>>=3; }
1477 while ((len&3)==0)
1478 { add_factor(4); len>>=2; }
1479 if ((len&1)==0)
1480 {
1481 len>>=1;
1482 // factor 2 should be at the front of the factor list
1483 add_factor(2);
1484 std::swap(fact[0].fct, fact.back().fct);
1485 }
1486 for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
1487 while ((len%divisor)==0)
1488 {
1489 add_factor(divisor);
1490 len/=divisor;
1491 }
1492 if (len>1) add_factor(len);
1493 }
1494
1495 size_t twsize() const
1496 {
1497 size_t twsize=0, l1=1;
1498 for (size_t k=0; k<fact.size(); ++k)
1499 {
1500 size_t ip=fact[k].fct, ido= length/(l1*ip);
1501 twsize+=(ip-1)*(ido-1);
1502 if (ip>11)
1503 twsize+=ip;
1504 l1*=ip;
1505 }
1506 return twsize;
1507 }
1508
1509 void comp_twiddle()
1510 {
1511 sincos_2pibyn<T0> twiddle(length);
1512 size_t l1=1;
1513 size_t memofs=0;
1514 for (size_t k=0; k<fact.size(); ++k)
1515 {
1516 size_t ip=fact[k].fct, ido=length/(l1*ip);
1517 fact[k].tw=mem.data()+memofs;
1518 memofs+=(ip-1)*(ido-1);
1519 for (size_t j=1; j<ip; ++j)
1520 for (size_t i=1; i<ido; ++i)
1521 fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
1522 if (ip>11)
1523 {
1524 fact[k].tws=mem.data()+memofs;
1525 memofs+=ip;
1526 for (size_t j=0; j<ip; ++j)
1527 fact[k].tws[j] = twiddle[j*l1*ido];
1528 }
1529 l1*=ip;
1530 }
1531 }
1532
1533 public:
1534 POCKETFFT_NOINLINE cfftp(size_t length_)
1535 : length(length_)
1536 {
1537 if (length==0) throw std::runtime_error("zero-length FFT requested");
1538 if (length==1) return;
1539 factorize();
1540 mem.resize(twsize());
1541 comp_twiddle();
1542 }
1543 };
1544
1545//
1546// real-valued FFTPACK transforms
1547//
1548
1549template<typename T0> class rfftp
1550 {
1551 private:
1552 struct fctdata
1553 {
1554 size_t fct;
1555 T0 *tw, *tws;
1556 };
1557
1558 size_t length;
1559 arr<T0> mem;
1560 std::vector<fctdata> fact;
1561
1562 void add_factor(size_t factor)
1563 { fact.push_back({factor, nullptr, nullptr}); }
1564
1565/* (a+ib) = conj(c+id) * (e+if) */
1566template<typename T1, typename T2, typename T3> inline void MULPM
1567 (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
1568 { a=c*e+d*f; b=c*f-d*e; }
1569
1570template<typename T> void radf2 (size_t ido, size_t l1,
1571 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1572 const T0 * POCKETFFT_RESTRICT wa) const
1573 {
1574 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1575 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1576 { return cc[a+ido*(b+l1*c)]; };
1577 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1578 { return ch[a+ido*(b+2*c)]; };
1579
1580 for (size_t k=0; k<l1; k++)
1581 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1582 if ((ido&1)==0)
1583 for (size_t k=0; k<l1; k++)
1584 {
1585 CH( 0,1,k) = -CC(ido-1,k,1);
1586 CH(ido-1,0,k) = CC(ido-1,k,0);
1587 }
1588 if (ido<=2) return;
1589 for (size_t k=0; k<l1; k++)
1590 for (size_t i=2; i<ido; i+=2)
1591 {
1592 size_t ic=ido-i;
1593 T tr2, ti2;
1594 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1595 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1596 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0));
1597 }
1598 }
1599
1600// a2=a+b; b2=i*(b-a);
1601#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1602 {\
1603 auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1604 rx=t1; ix=t3; ry=t4; iy=t2; \
1605 }
1606
1607template<typename T> void radf3(size_t ido, size_t l1,
1608 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1609 const T0 * POCKETFFT_RESTRICT wa) const
1610 {
1611 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1612
1613 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1614 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1615 { return cc[a+ido*(b+l1*c)]; };
1616 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1617 { return ch[a+ido*(b+3*c)]; };
1618
1619 for (size_t k=0; k<l1; k++)
1620 {
1621 T cr2=CC(0,k,1)+CC(0,k,2);
1622 CH(0,0,k) = CC(0,k,0)+cr2;
1623 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1624 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1625 }
1626 if (ido==1) return;
1627 for (size_t k=0; k<l1; k++)
1628 for (size_t i=2; i<ido; i+=2)
1629 {
1630 size_t ic=ido-i;
1631 T di2, di3, dr2, dr3;
1632 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
1633 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
1634 POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
1635 CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
1636 CH(i ,0,k) = CC(i ,k,0)+di2;
1637 T tr2 = CC(i-1,k,0)+taur*dr2; // c add
1638 T ti2 = CC(i ,k,0)+taur*di2;
1639 T tr3 = taui*dr3; // t3 = taui*i*(d3-d2)?
1640 T ti3 = taui*di3;
1641 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
1642 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
1643 }
1644 }
1645
1646template<typename T> void radf4(size_t ido, size_t l1,
1647 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1648 const T0 * POCKETFFT_RESTRICT wa) const
1649 {
1650 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1651
1652 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1653 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1654 { return cc[a+ido*(b+l1*c)]; };
1655 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1656 { return ch[a+ido*(b+4*c)]; };
1657
1658 for (size_t k=0; k<l1; k++)
1659 {
1660 T tr1,tr2;
1661 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
1662 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
1663 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
1664 }
1665 if ((ido&1)==0)
1666 for (size_t k=0; k<l1; k++)
1667 {
1668 T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1669 T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1670 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
1671 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2));
1672 }
1673 if (ido<=2) return;
1674 for (size_t k=0; k<l1; k++)
1675 for (size_t i=2; i<ido; i+=2)
1676 {
1677 size_t ic=ido-i;
1678 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1679 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1680 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1681 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1682 PM(tr1,tr4,cr4,cr2);
1683 PM(ti1,ti4,ci2,ci4);
1684 PM(tr2,tr3,CC(i-1,k,0),cr3);
1685 PM(ti2,ti3,CC(i ,k,0),ci3);
1686 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
1687 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2);
1688 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
1689 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3);
1690 }
1691 }
1692
1693template<typename T> void radf5(size_t ido, size_t l1,
1694 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1695 const T0 * POCKETFFT_RESTRICT wa) const
1696 {
1697 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
1698 ti11= T0(0.9510565162951535721164393333793821L),
1699 tr12= T0(-0.8090169943749474241022934171828191L),
1700 ti12= T0(0.5877852522924731291687059546390728L);
1701
1702 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1703 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1704 { return cc[a+ido*(b+l1*c)]; };
1705 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1706 { return ch[a+ido*(b+5*c)]; };
1707
1708 for (size_t k=0; k<l1; k++)
1709 {
1710 T cr2, cr3, ci4, ci5;
1711 PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
1712 PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
1713 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1714 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1715 CH(0,2,k)=ti11*ci5+ti12*ci4;
1716 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1717 CH(0,4,k)=ti12*ci5-ti11*ci4;
1718 }
1719 if (ido==1) return;
1720 for (size_t k=0; k<l1;++k)
1721 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1722 {
1723 T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
1724 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1725 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1726 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1727 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
1728 POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
1729 POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
1730 CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
1731 CH(i ,0,k)=CC(i ,k,0)+di2+di3;
1732 T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
1733 T ti2=CC(i ,k,0)+tr11*di2+tr12*di3;
1734 T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
1735 T ti3=CC(i ,k,0)+tr12*di2+tr11*di3;
1736 T tr5 = ti11*dr5 + ti12*dr4;
1737 T ti5 = ti11*di5 + ti12*di4;
1738 T tr4 = ti12*dr5 - ti11*dr4;
1739 T ti4 = ti12*di5 - ti11*di4;
1740 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
1741 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2);
1742 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
1743 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3);
1744 }
1745 }
1746
1747#undef POCKETFFT_REARRANGE
1748
1749template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
1750 T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1751 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
1752 {
1753 const size_t cdim=ip;
1754 size_t ipph=(ip+1)/2;
1755 size_t idl1 = ido*l1;
1756
1757 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T&
1758 { return cc[a+ido*(b+cdim*c)]; };
1759 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T&
1760 { return ch[a+ido*(b+l1*c)]; };
1761 auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T&
1762 { return cc[a+ido*(b+l1*c)]; };
1763 auto C2 = [cc,idl1] (size_t a, size_t b) -> T&
1764 { return cc[a+idl1*b]; };
1765 auto CH2 = [ch,idl1] (size_t a, size_t b) -> T&
1766 { return ch[a+idl1*b]; };
1767
1768 if (ido>1)
1769 {
1770 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1771 {
1772 size_t is=(j-1)*(ido-1),
1773 is2=(jc-1)*(ido-1);
1774 for (size_t k=0; k<l1; ++k) // 113
1775 {
1776 size_t idij=is;
1777 size_t idij2=is2;
1778 for (size_t i=1; i<=ido-2; i+=2) // 112
1779 {
1780 T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1781 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1782 T x1=wa[idij]*t1 + wa[idij+1]*t2,
1783 x2=wa[idij]*t2 - wa[idij+1]*t1,
1784 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1785 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1786 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
1787 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
1788 idij+=2;
1789 idij2+=2;
1790 }
1791 }
1792 }
1793 }
1794
1795 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1796 for (size_t k=0; k<l1; ++k) // 122
1797 MPINPLACE(C1(0,k,jc), C1(0,k,j));
1798
1799//everything in C
1800//memset(ch,0,ip*l1*ido*sizeof(double));
1801
1802 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1803 {
1804 for (size_t ik=0; ik<idl1; ++ik) // 124
1805 {
1806 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1807 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1808 }
1809 size_t iang = 2*l;
1810 size_t j=3, jc=ip-3;
1811 for (; j<ipph-3; j+=4,jc-=4) // 126
1812 {
1813 iang+=l; if (iang>=ip) iang-=ip;
1814 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1815 iang+=l; if (iang>=ip) iang-=ip;
1816 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1817 iang+=l; if (iang>=ip) iang-=ip;
1818 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1819 iang+=l; if (iang>=ip) iang-=ip;
1820 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1821 for (size_t ik=0; ik<idl1; ++ik) // 125
1822 {
1823 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1824 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1825 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1826 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1827 }
1828 }
1829 for (; j<ipph-1; j+=2,jc-=2) // 126
1830 {
1831 iang+=l; if (iang>=ip) iang-=ip;
1832 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1833 iang+=l; if (iang>=ip) iang-=ip;
1834 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1835 for (size_t ik=0; ik<idl1; ++ik) // 125
1836 {
1837 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1838 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1839 }
1840 }
1841 for (; j<ipph; ++j,--jc) // 126
1842 {
1843 iang+=l; if (iang>=ip) iang-=ip;
1844 T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
1845 for (size_t ik=0; ik<idl1; ++ik) // 125
1846 {
1847 CH2(ik,l ) += ar*C2(ik,j );
1848 CH2(ik,lc) += ai*C2(ik,jc);
1849 }
1850 }
1851 }
1852 for (size_t ik=0; ik<idl1; ++ik) // 101
1853 CH2(ik,0) = C2(ik,0);
1854 for (size_t j=1; j<ipph; ++j) // 129
1855 for (size_t ik=0; ik<idl1; ++ik) // 128
1856 CH2(ik,0) += C2(ik,j);
1857
1858// everything in CH at this point!
1859//memset(cc,0,ip*l1*ido*sizeof(double));
1860
1861 for (size_t k=0; k<l1; ++k) // 131
1862 for (size_t i=0; i<ido; ++i) // 130
1863 CC(i,0,k) = CH(i,k,0);
1864
1865 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
1866 {
1867 size_t j2=2*j-1;
1868 for (size_t k=0; k<l1; ++k) // 136
1869 {
1870 CC(ido-1,j2,k) = CH(0,k,j);
1871 CC(0,j2+1,k) = CH(0,k,jc);
1872 }
1873 }
1874
1875 if (ido==1) return;
1876
1877 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
1878 {
1879 size_t j2=2*j-1;
1880 for(size_t k=0; k<l1; ++k) // 139
1881 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
1882 {
1883 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
1884 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
1885 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1886 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1887 }
1888 }
1889 }
1890
1891template<typename T> void radb2(size_t ido, size_t l1,
1892 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1893 const T0 * POCKETFFT_RESTRICT wa) const
1894 {
1895 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1896 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1897 { return cc[a+ido*(b+2*c)]; };
1898 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1899 { return ch[a+ido*(b+l1*c)]; };
1900
1901 for (size_t k=0; k<l1; k++)
1902 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
1903 if ((ido&1)==0)
1904 for (size_t k=0; k<l1; k++)
1905 {
1906 CH(ido-1,k,0) = 2*CC(ido-1,0,k);
1907 CH(ido-1,k,1) =-2*CC(0 ,1,k);
1908 }
1909 if (ido<=2) return;
1910 for (size_t k=0; k<l1;++k)
1911 for (size_t i=2; i<ido; i+=2)
1912 {
1913 size_t ic=ido-i;
1914 T ti2, tr2;
1915 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
1916 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k));
1917 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
1918 }
1919 }
1920
1921template<typename T> void radb3(size_t ido, size_t l1,
1922 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1923 const T0 * POCKETFFT_RESTRICT wa) const
1924 {
1925 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1926
1927 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1928 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1929 { return cc[a+ido*(b+3*c)]; };
1930 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1931 { return ch[a+ido*(b+l1*c)]; };
1932
1933 for (size_t k=0; k<l1; k++)
1934 {
1935 T tr2=2*CC(ido-1,1,k);
1936 T cr2=CC(0,0,k)+taur*tr2;
1937 CH(0,k,0)=CC(0,0,k)+tr2;
1938 T ci3=2*taui*CC(0,2,k);
1939 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1940 }
1941 if (ido==1) return;
1942 for (size_t k=0; k<l1; k++)
1943 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1944 {
1945 T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1946 T ti2=CC(i ,2,k)-CC(ic ,1,k);
1947 T cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
1948 T ci2=CC(i ,0,k)+taur*ti2;
1949 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
1950 CH(i ,k,0)=CC(i ,0,k)+ti2;
1951 T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1952 T ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
1953 T di2, di3, dr2, dr3;
1954 PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1955 PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1956 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
1957 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
1958 }
1959 }
1960
1961template<typename T> void radb4(size_t ido, size_t l1,
1962 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1963 const T0 * POCKETFFT_RESTRICT wa) const
1964 {
1965 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
1966
1967 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1968 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1969 { return cc[a+ido*(b+4*c)]; };
1970 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1971 { return ch[a+ido*(b+l1*c)]; };
1972
1973 for (size_t k=0; k<l1; k++)
1974 {
1975 T tr1, tr2;
1976 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
1977 T tr3=2*CC(ido-1,1,k);
1978 T tr4=2*CC(0,2,k);
1979 PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
1980 PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
1981 }
1982 if ((ido&1)==0)
1983 for (size_t k=0; k<l1; k++)
1984 {
1985 T tr1,tr2,ti1,ti2;
1986 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k));
1987 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
1988 CH(ido-1,k,0)=tr2+tr2;
1989 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1990 CH(ido-1,k,2)=ti2+ti2;
1991 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1992 }
1993 if (ido<=2) return;
1994 for (size_t k=0; k<l1;++k)
1995 for (size_t i=2; i<ido; i+=2)
1996 {
1997 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1998 size_t ic=ido-i;
1999 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
2000 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k));
2001 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k));
2002 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2003 PM (CH(i-1,k,0),cr3,tr2,tr3);
2004 PM (CH(i ,k,0),ci3,ti2,ti3);
2005 PM (cr4,cr2,tr1,tr4);
2006 PM (ci2,ci4,ti1,ti4);
2007 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2008 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2009 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2010 }
2011 }
2012
2013template<typename T> void radb5(size_t ido, size_t l1,
2014 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2015 const T0 * POCKETFFT_RESTRICT wa) const
2016 {
2017 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
2018 ti11= T0(0.9510565162951535721164393333793821L),
2019 tr12= T0(-0.8090169943749474241022934171828191L),
2020 ti12= T0(0.5877852522924731291687059546390728L);
2021
2022 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2023 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2024 { return cc[a+ido*(b+5*c)]; };
2025 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2026 { return ch[a+ido*(b+l1*c)]; };
2027
2028 for (size_t k=0; k<l1; k++)
2029 {
2030 T ti5=CC(0,2,k)+CC(0,2,k);
2031 T ti4=CC(0,4,k)+CC(0,4,k);
2032 T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2033 T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2034 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2035 T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2036 T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2037 T ci4, ci5;
2038 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2039 PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2040 PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2041 }
2042 if (ido==1) return;
2043 for (size_t k=0; k<l1;++k)
2044 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2045 {
2046 T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2047 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2048 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k));
2049 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2050 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k));
2051 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2052 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
2053 T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2054 T ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
2055 T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2056 T ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
2057 T ci4, ci5, cr5, cr4;
2058 MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2059 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2060 T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2061 PM(dr4,dr3,cr3,ci4);
2062 PM(di3,di4,ci3,cr4);
2063 PM(dr5,dr2,cr2,ci5);
2064 PM(di2,di5,ci2,cr5);
2065 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2066 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2067 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2068 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2069 }
2070 }
2071
2072template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
2073 T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2074 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
2075 {
2076 const size_t cdim=ip;
2077 size_t ipph=(ip+1)/ 2;
2078 size_t idl1 = ido*l1;
2079
2080 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
2081 { return cc[a+ido*(b+cdim*c)]; };
2082 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2083 { return ch[a+ido*(b+l1*c)]; };
2084 auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
2085 { return cc[a+ido*(b+l1*c)]; };
2086 auto C2 = [cc,idl1](size_t a, size_t b) -> T&
2087 { return cc[a+idl1*b]; };
2088 auto CH2 = [ch,idl1](size_t a, size_t b) -> T&
2089 { return ch[a+idl1*b]; };
2090
2091 for (size_t k=0; k<l1; ++k) // 102
2092 for (size_t i=0; i<ido; ++i) // 101
2093 CH(i,k,0) = CC(i,0,k);
2094 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
2095 {
2096 size_t j2=2*j-1;
2097 for (size_t k=0; k<l1; ++k)
2098 {
2099 CH(0,k,j ) = 2*CC(ido-1,j2,k);
2100 CH(0,k,jc) = 2*CC(0,j2+1,k);
2101 }
2102 }
2103
2104 if (ido!=1)
2105 {
2106 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
2107 {
2108 size_t j2=2*j-1;
2109 for (size_t k=0; k<l1; ++k)
2110 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
2111 {
2112 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
2113 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
2114 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2115 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2116 }
2117 }
2118 }
2119 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2120 {
2121 for (size_t ik=0; ik<idl1; ++ik)
2122 {
2123 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2124 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2125 }
2126 size_t iang=2*l;
2127 size_t j=3,jc=ip-3;
2128 for(; j<ipph-3; j+=4,jc-=4)
2129 {
2130 iang+=l; if(iang>ip) iang-=ip;
2131 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2132 iang+=l; if(iang>ip) iang-=ip;
2133 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2134 iang+=l; if(iang>ip) iang-=ip;
2135 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2136 iang+=l; if(iang>ip) iang-=ip;
2137 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2138 for (size_t ik=0; ik<idl1; ++ik)
2139 {
2140 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2141 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2142 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2143 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2144 }
2145 }
2146 for(; j<ipph-1; j+=2,jc-=2)
2147 {
2148 iang+=l; if(iang>ip) iang-=ip;
2149 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2150 iang+=l; if(iang>ip) iang-=ip;
2151 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2152 for (size_t ik=0; ik<idl1; ++ik)
2153 {
2154 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2155 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2156 }
2157 }
2158 for(; j<ipph; ++j,--jc)
2159 {
2160 iang+=l; if(iang>ip) iang-=ip;
2161 T0 war=csarr[2*iang], wai=csarr[2*iang+1];
2162 for (size_t ik=0; ik<idl1; ++ik)
2163 {
2164 C2(ik,l ) += war*CH2(ik,j );
2165 C2(ik,lc) += wai*CH2(ik,jc);
2166 }
2167 }
2168 }
2169 for (size_t j=1; j<ipph; ++j)
2170 for (size_t ik=0; ik<idl1; ++ik)
2171 CH2(ik,0) += CH2(ik,j);
2172 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
2173 for (size_t k=0; k<l1; ++k)
2174 PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2175
2176 if (ido==1) return;
2177
2178 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
2179 for (size_t k=0; k<l1; ++k)
2180 for (size_t i=1; i<=ido-2; i+=2)
2181 {
2182 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
2183 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
2184 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
2185 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
2186 }
2187
2188// All in CH
2189
2190 for (size_t j=1; j<ip; ++j)
2191 {
2192 size_t is = (j-1)*(ido-1);
2193 for (size_t k=0; k<l1; ++k)
2194 {
2195 size_t idij = is;
2196 for (size_t i=1; i<=ido-2; i+=2)
2197 {
2198 T t1=CH(i,k,j), t2=CH(i+1,k,j);
2199 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2200 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2201 idij+=2;
2202 }
2203 }
2204 }
2205 }
2206
2207 template<typename T> void copy_and_norm(T *c, T *p1, T0 fct) const
2208 {
2209 if (p1!=c)
2210 {
2211 if (fct!=1.)
2212 for (size_t i=0; i<length; ++i)
2213 c[i] = fct*p1[i];
2214 else
2215 std::copy_n (p1, length, c);
2216 }
2217 else
2218 if (fct!=1.)
2219 for (size_t i=0; i<length; ++i)
2220 c[i] *= fct;
2221 }
2222
2223 public:
2224 template<typename T> void exec(T c[], T0 fct, bool r2hc) const
2225 {
2226 if (length==1) { c[0]*=fct; return; }
2227 size_t nf=fact.size();
2228 arr<T> ch(length);
2229 T *p1=c, *p2=ch.data();
2230
2231 if (r2hc)
2232 for(size_t k1=0, l1=length; k1<nf;++k1)
2233 {
2234 size_t k=nf-k1-1;
2235 size_t ip=fact[k].fct;
2236 size_t ido=length / l1;
2237 l1 /= ip;
2238 if(ip==4)
2239 radf4(ido, l1, p1, p2, fact[k].tw);
2240 else if(ip==2)
2241 radf2(ido, l1, p1, p2, fact[k].tw);
2242 else if(ip==3)
2243 radf3(ido, l1, p1, p2, fact[k].tw);
2244 else if(ip==5)
2245 radf5(ido, l1, p1, p2, fact[k].tw);
2246 else
2247 { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); }
2248 std::swap (p1,p2);
2249 }
2250 else
2251 for(size_t k=0, l1=1; k<nf; k++)
2252 {
2253 size_t ip = fact[k].fct,
2254 ido= length/(ip*l1);
2255 if(ip==4)
2256 radb4(ido, l1, p1, p2, fact[k].tw);
2257 else if(ip==2)
2258 radb2(ido, l1, p1, p2, fact[k].tw);
2259 else if(ip==3)
2260 radb3(ido, l1, p1, p2, fact[k].tw);
2261 else if(ip==5)
2262 radb5(ido, l1, p1, p2, fact[k].tw);
2263 else
2264 radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
2265 std::swap (p1,p2);
2266 l1*=ip;
2267 }
2268
2269 copy_and_norm(c,p1,fct);
2270 }
2271
2272 private:
2273 void factorize()
2274 {
2275 size_t len=length;
2276 while ((len%4)==0)
2277 { add_factor(4); len>>=2; }
2278 if ((len%2)==0)
2279 {
2280 len>>=1;
2281 // factor 2 should be at the front of the factor list
2282 add_factor(2);
2283 std::swap(fact[0].fct, fact.back().fct);
2284 }
2285 for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
2286 while ((len%divisor)==0)
2287 {
2288 add_factor(divisor);
2289 len/=divisor;
2290 }
2291 if (len>1) add_factor(len);
2292 }
2293
2294 size_t twsize() const
2295 {
2296 size_t twsz=0, l1=1;
2297 for (size_t k=0; k<fact.size(); ++k)
2298 {
2299 size_t ip=fact[k].fct, ido=length/(l1*ip);
2300 twsz+=(ip-1)*(ido-1);
2301 if (ip>5) twsz+=2*ip;
2302 l1*=ip;
2303 }
2304 return twsz;
2305 }
2306
2307 void comp_twiddle()
2308 {
2309 sincos_2pibyn<T0> twid(length);
2310 size_t l1=1;
2311 T0 *ptr=mem.data();
2312 for (size_t k=0; k<fact.size(); ++k)
2313 {
2314 size_t ip=fact[k].fct, ido=length/(l1*ip);
2315 if (k<fact.size()-1) // last factor doesn't need twiddles
2316 {
2317 fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
2318 for (size_t j=1; j<ip; ++j)
2319 for (size_t i=1; i<=(ido-1)/2; ++i)
2320 {
2321 fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
2322 fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
2323 }
2324 }
2325 if (ip>5) // special factors required by *g functions
2326 {
2327 fact[k].tws=ptr; ptr+=2*ip;
2328 fact[k].tws[0] = 1.;
2329 fact[k].tws[1] = 0.;
2330 for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2331 {
2332 fact[k].tws[i ] = twid[i/2*(length/ip)].r;
2333 fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
2334 fact[k].tws[ic] = twid[i/2*(length/ip)].r;
2335 fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
2336 }
2337 }
2338 l1*=ip;
2339 }
2340 }
2341
2342 public:
2343 POCKETFFT_NOINLINE rfftp(size_t length_)
2344 : length(length_)
2345 {
2346 if (length==0) throw std::runtime_error("zero-length FFT requested");
2347 if (length==1) return;
2348 factorize();
2349 mem.resize(twsize());
2350 comp_twiddle();
2351 }
2352};
2353
2354//
2355// complex Bluestein transforms
2356//
2357
2358template<typename T0> class fftblue
2359 {
2360 private:
2361 size_t n, n2;
2362 cfftp<T0> plan;
2363 arr<cmplx<T0>> mem;
2364 cmplx<T0> *bk, *bkf;
2365
2366 template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const
2367 {
2368 arr<cmplx<T>> akf(n2);
2369
2370 /* initialize a_k and FFT it */
2371 for (size_t m=0; m<n; ++m)
2372 special_mul<fwd>(c[m],bk[m],akf[m]);
2373 auto zero = akf[0]*T0(0);
2374 for (size_t m=n; m<n2; ++m)
2375 akf[m]=zero;
2376
2377 plan.exec (akf.data(),1.,true);
2378
2379 /* do the convolution */
2380 akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
2381 for (size_t m=1; m<(n2+1)/2; ++m)
2382 {
2383 akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
2384 akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
2385 }
2386 if ((n2&1)==0)
2387 akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
2388
2389 /* inverse FFT */
2390 plan.exec (akf.data(),1.,false);
2391
2392 /* multiply by b_k */
2393 for (size_t m=0; m<n; ++m)
2394 c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
2395 }
2396
2397 public:
2398 POCKETFFT_NOINLINE fftblue(size_t length)
2399 : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
2400 bk(mem.data()), bkf(mem.data()+n)
2401 {
2402 /* initialize b_k */
2403 sincos_2pibyn<T0> tmp(2*n);
2404 bk[0].Set(1, 0);
2405
2406 size_t coeff=0;
2407 for (size_t m=1; m<n; ++m)
2408 {
2409 coeff+=2*m-1;
2410 if (coeff>=2*n) coeff-=2*n;
2411 bk[m] = tmp[coeff];
2412 }
2413
2414 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
2415 arr<cmplx<T0>> tbkf(n2);
2416 T0 xn2 = T0(1)/T0(n2);
2417 tbkf[0] = bk[0]*xn2;
2418 for (size_t m=1; m<n; ++m)
2419 tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
2420 for (size_t m=n;m<=(n2-n);++m)
2421 tbkf[m].Set(0.,0.);
2422 plan.exec(tbkf.data(),1.,true);
2423 for (size_t i=0; i<n2/2+1; ++i)
2424 bkf[i] = tbkf[i];
2425 }
2426
2427 template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
2428 { fwd ? fft<true>(c,fct) : fft<false>(c,fct); }
2429
2430 template<typename T> void exec_r(T c[], T0 fct, bool fwd)
2431 {
2432 arr<cmplx<T>> tmp(n);
2433 if (fwd)
2434 {
2435 auto zero = T0(0)*c[0];
2436 for (size_t m=0; m<n; ++m)
2437 tmp[m].Set(c[m], zero);
2438 fft<true>(tmp.data(),fct);
2439 c[0] = tmp[0].r;
2440 std::copy_n (&tmp[1].r, n-1, &c[1]);
2441 }
2442 else
2443 {
2444 tmp[0].Set(c[0],c[0]*0);
2445 std::copy_n (c+1, n-1, &tmp[1].r);
2446 if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
2447 for (size_t m=1; 2*m<n; ++m)
2448 tmp[n-m].Set(tmp[m].r, -tmp[m].i);
2449 fft<false>(tmp.data(),fct);
2450 for (size_t m=0; m<n; ++m)
2451 c[m] = tmp[m].r;
2452 }
2453 }
2454 };
2455
2456//
2457// flexible (FFTPACK/Bluestein) complex 1D transform
2458//
2459
2460template<typename T0> class pocketfft_c
2461 {
2462 private:
2463 std::unique_ptr<cfftp<T0>> packplan;
2464 std::unique_ptr<fftblue<T0>> blueplan;
2465 size_t len;
2466
2467 public:
2468 POCKETFFT_NOINLINE pocketfft_c(size_t length)
2469 : len(length)
2470 {
2471 if (length==0) throw std::runtime_error("zero-length FFT requested");
2472 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2473 if (tmp*tmp <= length)
2474 {
2475 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2476 return;
2477 }
2478 double comp1 = util::cost_guess(length);
2479 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2480 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2481 if (comp2<comp1) // use Bluestein
2482 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2483 else
2484 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2485 }
2486
2487 template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const
2488 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); }
2489
2490 size_t length() const { return len; }
2491 };
2492
2493//
2494// flexible (FFTPACK/Bluestein) real-valued 1D transform
2495//
2496
2497template<typename T0> class pocketfft_r
2498 {
2499 private:
2500 std::unique_ptr<rfftp<T0>> packplan;
2501 std::unique_ptr<fftblue<T0>> blueplan;
2502 size_t len;
2503
2504 public:
2505 POCKETFFT_NOINLINE pocketfft_r(size_t length)
2506 : len(length)
2507 {
2508 if (length==0) throw std::runtime_error("zero-length FFT requested");
2509 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2510 if (tmp*tmp <= length)
2511 {
2512 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2513 return;
2514 }
2515 double comp1 = 0.5*util::cost_guess(length);
2516 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2517 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2518 if (comp2<comp1) // use Bluestein
2519 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2520 else
2521 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2522 }
2523
2524 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
2525 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); }
2526
2527 size_t length() const { return len; }
2528 };
2529
2530
2531//
2532// sine/cosine transforms
2533//
2534
2535template<typename T0> class T_dct1
2536 {
2537 private:
2538 pocketfft_r<T0> fftplan;
2539
2540 public:
2541 POCKETFFT_NOINLINE T_dct1(size_t length)
2542 : fftplan(2*(length-1)) {}
2543
2544 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2545 int /*type*/, bool /*cosine*/) const
2546 {
2547 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2548 size_t N=fftplan.length(), n=N/2+1;
2549 if (ortho)
2550 { c[0]*=sqrt2; c[n-1]*=sqrt2; }
2551 arr<T> tmp(N);
2552 tmp[0] = c[0];
2553 for (size_t i=1; i<n; ++i)
2554 tmp[i] = tmp[N-i] = c[i];
2555 fftplan.exec(tmp.data(), fct, true);
2556 c[0] = tmp[0];
2557 for (size_t i=1; i<n; ++i)
2558 c[i] = tmp[2*i-1];
2559 if (ortho)
2560 { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
2561 }
2562
2563 size_t length() const { return fftplan.length()/2+1; }
2564 };
2565
2566template<typename T0> class T_dst1
2567 {
2568 private:
2569 pocketfft_r<T0> fftplan;
2570
2571 public:
2572 POCKETFFT_NOINLINE T_dst1(size_t length)
2573 : fftplan(2*(length+1)) {}
2574
2575 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2576 bool /*ortho*/, int /*type*/, bool /*cosine*/) const
2577 {
2578 size_t N=fftplan.length(), n=N/2-1;
2579 arr<T> tmp(N);
2580 tmp[0] = tmp[n+1] = c[0]*0;
2581 for (size_t i=0; i<n; ++i)
2582 { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
2583 fftplan.exec(tmp.data(), fct, true);
2584 for (size_t i=0; i<n; ++i)
2585 c[i] = -tmp[2*i+2];
2586 }
2587
2588 size_t length() const { return fftplan.length()/2-1; }
2589 };
2590
2591template<typename T0> class T_dcst23
2592 {
2593 private:
2594 pocketfft_r<T0> fftplan;
2595 std::vector<T0> twiddle;
2596
2597 public:
2598 POCKETFFT_NOINLINE T_dcst23(size_t length)
2599 : fftplan(length), twiddle(length)
2600 {
2601 sincos_2pibyn<T0> tw(4*length);
2602 for (size_t i=0; i<length; ++i)
2603 twiddle[i] = tw[i+1].r;
2604 }
2605
2606 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2607 int type, bool cosine) const
2608 {
2609 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2610 size_t N=length();
2611 size_t NS2 = (N+1)/2;
2612 if (type==2)
2613 {
2614 if (!cosine)
2615 for (size_t k=1; k<N; k+=2)
2616 c[k] = -c[k];
2617 c[0] *= 2;
2618 if ((N&1)==0) c[N-1]*=2;
2619 for (size_t k=1; k<N-1; k+=2)
2620 MPINPLACE(c[k+1], c[k]);
2621 fftplan.exec(c, fct, false);
2622 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2623 {
2624 T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
2625 T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
2626 c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
2627 }
2628 if ((N&1)==0)
2629 c[NS2] *= twiddle[NS2-1];
2630 if (!cosine)
2631 for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
2632 std::swap(c[k], c[kc]);
2633 if (ortho) c[0]*=sqrt2*T0(0.5);
2634 }
2635 else
2636 {
2637 if (ortho) c[0]*=sqrt2;
2638 if (!cosine)
2639 for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
2640 std::swap(c[k], c[kc]);
2641 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2642 {
2643 T t1=c[k]+c[kc], t2=c[k]-c[kc];
2644 c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
2645 c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
2646 }
2647 if ((N&1)==0)
2648 c[NS2] *= 2*twiddle[NS2-1];
2649 fftplan.exec(c, fct, true);
2650 for (size_t k=1; k<N-1; k+=2)
2651 MPINPLACE(c[k], c[k+1]);
2652 if (!cosine)
2653 for (size_t k=1; k<N; k+=2)
2654 c[k] = -c[k];
2655 }
2656 }
2657
2658 size_t length() const { return fftplan.length(); }
2659 };
2660
2661template<typename T0> class T_dcst4
2662 {
2663 private:
2664 size_t N;
2665 std::unique_ptr<pocketfft_c<T0>> fft;
2666 std::unique_ptr<pocketfft_r<T0>> rfft;
2667 arr<cmplx<T0>> C2;
2668
2669 public:
2670 POCKETFFT_NOINLINE T_dcst4(size_t length)
2671 : N(length),
2672 fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
2673 rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
2674 C2((N&1) ? 0 : N/2)
2675 {
2676 if ((N&1)==0)
2677 {
2678 sincos_2pibyn<T0> tw(16*N);
2679 for (size_t i=0; i<N/2; ++i)
2680 C2[i] = conj(tw[8*i+1]);
2681 }
2682 }
2683
2684 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2685 bool /*ortho*/, int /*type*/, bool cosine) const
2686 {
2687 size_t n2 = N/2;
2688 if (!cosine)
2689 for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
2690 std::swap(c[k], c[kc]);
2691 if (N&1)
2692 {
2693 // The following code is derived from the FFTW3 function apply_re11()
2694 // and is released under the 3-clause BSD license with friendly
2695 // permission of Matteo Frigo and Steven G. Johnson.
2696
2697 arr<T> y(N);
2698 {
2699 size_t i=0, m=n2;
2700 for (; m<N; ++i, m+=4)
2701 y[i] = c[m];
2702 for (; m<2*N; ++i, m+=4)
2703 y[i] = -c[2*N-m-1];
2704 for (; m<3*N; ++i, m+=4)
2705 y[i] = -c[m-2*N];
2706 for (; m<4*N; ++i, m+=4)
2707 y[i] = c[4*N-m-1];
2708 for (; i<N; ++i, m+=4)
2709 y[i] = c[m-4*N];
2710 }
2711 rfft->exec(y.data(), fct, true);
2712 {
2713 auto SGN = [](size_t i)
2714 {
2715 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2716 return (i&2) ? -sqrt2 : sqrt2;
2717 };
2718 c[n2] = y[0]*SGN(n2+1);
2719 size_t i=0, i1=1, k=1;
2720 for (; k<n2; ++i, ++i1, k+=2)
2721 {
2722 c[i ] = y[2*k-1]*SGN(i1) + y[2*k ]*SGN(i);
2723 c[N -i1] = y[2*k-1]*SGN(N -i) - y[2*k ]*SGN(N -i1);
2724 c[n2-i1] = y[2*k+1]*SGN(n2-i) - y[2*k+2]*SGN(n2-i1);
2725 c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
2726 }
2727 if (k == n2)
2728 {
2729 c[i ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
2730 c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
2731 }
2732 }
2733
2734 // FFTW-derived code ends here
2735 }
2736 else
2737 {
2738 // even length algorithm from
2739 // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
2740 arr<cmplx<T>> y(n2);
2741 for(size_t i=0; i<n2; ++i)
2742 {
2743 y[i].Set(c[2*i],c[N-1-2*i]);
2744 y[i] *= C2[i];
2745 }
2746 fft->exec(y.data(), fct, true);
2747 for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
2748 {
2749 c[2*i ] = 2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
2750 c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
2751 }
2752 }
2753 if (!cosine)
2754 for (size_t k=1; k<N; k+=2)
2755 c[k] = -c[k];
2756 }
2757
2758 size_t length() const { return N; }
2759 };
2760
2761
2762//
2763// multi-D infrastructure
2764//
2765
2766template<typename T> std::shared_ptr<T> get_plan(size_t length)
2767 {
2768#if POCKETFFT_CACHE_SIZE==0
2769 return std::make_shared<T>(length);
2770#else
2771 constexpr size_t nmax=POCKETFFT_CACHE_SIZE;
2772 static std::array<std::shared_ptr<T>, nmax> cache;
2773 static std::array<size_t, nmax> last_access{{0}};
2774 static size_t access_counter = 0;
2775 static std::mutex mut;
2776
2777 auto find_in_cache = [&]() -> std::shared_ptr<T>
2778 {
2779 for (size_t i=0; i<nmax; ++i)
2780 if (cache[i] && (cache[i]->length()==length))
2781 {
2782 // no need to update if this is already the most recent entry
2783 if (last_access[i]!=access_counter)
2784 {
2785 last_access[i] = ++access_counter;
2786 // Guard against overflow
2787 if (access_counter == 0)
2788 last_access.fill(0);
2789 }
2790 return cache[i];
2791 }
2792
2793 return nullptr;
2794 };
2795
2796 {
2797 std::lock_guard<std::mutex> lock(mut);
2798 auto p = find_in_cache();
2799 if (p) return p;
2800 }
2801 auto plan = std::make_shared<T>(length);
2802 {
2803 std::lock_guard<std::mutex> lock(mut);
2804 auto p = find_in_cache();
2805 if (p) return p;
2806
2807 size_t lru = 0;
2808 for (size_t i=1; i<nmax; ++i)
2809 if (last_access[i] < last_access[lru])
2810 lru = i;
2811
2812 cache[lru] = plan;
2813 last_access[lru] = ++access_counter;
2814 }
2815 return plan;
2816#endif
2817 }
2818
2819class arr_info
2820 {
2821 protected:
2822 shape_t shp;
2823 stride_t str;
2824
2825 public:
2826 arr_info(const shape_t &shape_, const stride_t &stride_)
2827 : shp(shape_), str(stride_) {}
2828 size_t ndim() const { return shp.size(); }
2829 size_t size() const { return util::prod(shp); }
2830 const shape_t &shape() const { return shp; }
2831 size_t shape(size_t i) const { return shp[i]; }
2832 const stride_t &stride() const { return str; }
2833 const ptrdiff_t &stride(size_t i) const { return str[i]; }
2834 };
2835
2836template<typename T> class cndarr: public arr_info
2837 {
2838 protected:
2839 const char *d;
2840
2841 public:
2842 cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
2843 : arr_info(shape_, stride_),
2844 d(reinterpret_cast<const char *>(data_)) {}
2845 const T &operator[](ptrdiff_t ofs) const
2846 { return *reinterpret_cast<const T *>(d+ofs); }
2847 };
2848
2849template<typename T> class ndarr: public cndarr<T>
2850 {
2851 public:
2852 ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
2853 : cndarr<T>::cndarr(const_cast<const void *>(data_), shape_, stride_)
2854 {}
2855 T &operator[](ptrdiff_t ofs)
2856 { return *reinterpret_cast<T *>(const_cast<char *>(cndarr<T>::d+ofs)); }
2857 };
2858
2859template<size_t N> class multi_iter
2860 {
2861 private:
2862 shape_t pos;
2863 const arr_info &iarr, &oarr;
2864 ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
2865 size_t idim, rem;
2866
2867 void advance_i()
2868 {
2869 for (int i_=int(pos.size())-1; i_>=0; --i_)
2870 {
2871 auto i = size_t(i_);
2872 if (i==idim) continue;
2873 p_ii += iarr.stride(i);
2874 p_oi += oarr.stride(i);
2875 if (++pos[i] < iarr.shape(i))
2876 return;
2877 pos[i] = 0;
2878 p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
2879 p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
2880 }
2881 }
2882
2883 public:
2884 multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_)
2885 : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
2886 str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
2887 idim(idim_), rem(iarr.size()/iarr.shape(idim))
2888 {
2889 auto nshares = threading::num_threads();
2890 if (nshares==1) return;
2891 if (nshares==0) throw std::runtime_error("can't run with zero threads");
2892 auto myshare = threading::thread_id();
2893 if (myshare>=nshares) throw std::runtime_error("impossible share requested");
2894 size_t nbase = rem/nshares;
2895 size_t additional = rem%nshares;
2896 size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
2897 size_t hi = lo+nbase+(myshare<additional);
2898 size_t todo = hi-lo;
2899
2900 size_t chunk = rem;
2901 for (size_t i=0; i<pos.size(); ++i)
2902 {
2903 if (i==idim) continue;
2904 chunk /= iarr.shape(i);
2905 size_t n_advance = lo/chunk;
2906 pos[i] += n_advance;
2907 p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
2908 p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
2909 lo -= n_advance*chunk;
2910 }
2911 rem = todo;
2912 }
2913 void advance(size_t n)
2914 {
2915 if (rem<n) throw std::runtime_error("underrun");
2916 for (size_t i=0; i<n; ++i)
2917 {
2918 p_i[i] = p_ii;
2919 p_o[i] = p_oi;
2920 advance_i();
2921 }
2922 rem -= n;
2923 }
2924 ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
2925 ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
2926 ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
2927 ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
2928 size_t length_in() const { return iarr.shape(idim); }
2929 size_t length_out() const { return oarr.shape(idim); }
2930 ptrdiff_t stride_in() const { return str_i; }
2931 ptrdiff_t stride_out() const { return str_o; }
2932 size_t remaining() const { return rem; }
2933 };
2934
2935class simple_iter
2936 {
2937 private:
2938 shape_t pos;
2939 const arr_info &arr;
2940 ptrdiff_t p;
2941 size_t rem;
2942
2943 public:
2944 simple_iter(const arr_info &arr_)
2945 : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {}
2946 void advance()
2947 {
2948 --rem;
2949 for (int i_=int(pos.size())-1; i_>=0; --i_)
2950 {
2951 auto i = size_t(i_);
2952 p += arr.stride(i);
2953 if (++pos[i] < arr.shape(i))
2954 return;
2955 pos[i] = 0;
2956 p -= ptrdiff_t(arr.shape(i))*arr.stride(i);
2957 }
2958 }
2959 ptrdiff_t ofs() const { return p; }
2960 size_t remaining() const { return rem; }
2961 };
2962
2963class rev_iter
2964 {
2965 private:
2966 shape_t pos;
2967 const arr_info &arr;
2968 std::vector<char> rev_axis;
2969 std::vector<char> rev_jump;
2970 size_t last_axis, last_size;
2971 shape_t shp;
2972 ptrdiff_t p, rp;
2973 size_t rem;
2974
2975 public:
2976 rev_iter(const arr_info &arr_, const shape_t &axes)
2977 : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
2978 rev_jump(arr_.ndim(), 1), p(0), rp(0)
2979 {
2980 for (auto ax: axes)
2981 rev_axis[ax]=1;
2982 last_axis = axes.back();
2983 last_size = arr.shape(last_axis)/2 + 1;
2984 shp = arr.shape();
2985 shp[last_axis] = last_size;
2986 rem=1;
2987 for (auto i: shp)
2988 rem *= i;
2989 }
2990 void advance()
2991 {
2992 --rem;
2993 for (int i_=int(pos.size())-1; i_>=0; --i_)
2994 {
2995 auto i = size_t(i_);
2996 p += arr.stride(i);
2997 if (!rev_axis[i])
2998 rp += arr.stride(i);
2999 else
3000 {
3001 rp -= arr.stride(i);
3002 if (rev_jump[i])
3003 {
3004 rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
3005 rev_jump[i] = 0;
3006 }
3007 }
3008 if (++pos[i] < shp[i])
3009 return;
3010 pos[i] = 0;
3011 p -= ptrdiff_t(shp[i])*arr.stride(i);
3012 if (rev_axis[i])
3013 {
3014 rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
3015 rev_jump[i] = 1;
3016 }
3017 else
3018 rp -= ptrdiff_t(shp[i])*arr.stride(i);
3019 }
3020 }
3021 ptrdiff_t ofs() const { return p; }
3022 ptrdiff_t rev_ofs() const { return rp; }
3023 size_t remaining() const { return rem; }
3024 };
3025
3026template<typename T> struct VTYPE {};
3027template <typename T> using vtype_t = typename VTYPE<T>::type;
3028
3029#ifndef POCKETFFT_NO_VECTORS
3030template<> struct VTYPE<float>
3031 {
3032 using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
3033 };
3034template<> struct VTYPE<double>
3035 {
3036 using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double))));
3037 };
3038template<> struct VTYPE<long double>
3039 {
3040 using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
3041 };
3042#endif
3043
3044template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3045 size_t axsize, size_t elemsize)
3046 {
3047 auto othersize = util::prod(shape)/axsize;
3048 auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3049 return arr<char>(tmpsize*elemsize);
3050 }
3051template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3052 const shape_t &axes, size_t elemsize)
3053 {
3054 size_t fullsize=util::prod(shape);
3055 size_t tmpsize=0;
3056 for (size_t i=0; i<axes.size(); ++i)
3057 {
3058 auto axsize = shape[axes[i]];
3059 auto othersize = fullsize/axsize;
3060 auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3061 if (sz>tmpsize) tmpsize=sz;
3062 }
3063 return arr<char>(tmpsize*elemsize);
3064 }
3065
3066template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3067 const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
3068 {
3069 for (size_t i=0; i<it.length_in(); ++i)
3070 for (size_t j=0; j<vlen; ++j)
3071 {
3072 dst[i].r[j] = src[it.iofs(j,i)].r;
3073 dst[i].i[j] = src[it.iofs(j,i)].i;
3074 }
3075 }
3076
3077template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3078 const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst)
3079 {
3080 for (size_t i=0; i<it.length_in(); ++i)
3081 for (size_t j=0; j<vlen; ++j)
3082 dst[i][j] = src[it.iofs(j,i)];
3083 }
3084
3085template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3086 const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
3087 {
3088 if (dst == &src[it.iofs(0)]) return; // in-place
3089 for (size_t i=0; i<it.length_in(); ++i)
3090 dst[i] = src[it.iofs(i)];
3091 }
3092
3093template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3094 const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst)
3095 {
3096 for (size_t i=0; i<it.length_out(); ++i)
3097 for (size_t j=0; j<vlen; ++j)
3098 dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
3099 }
3100
3101template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3102 const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3103 {
3104 for (size_t i=0; i<it.length_out(); ++i)
3105 for (size_t j=0; j<vlen; ++j)
3106 dst[it.oofs(j,i)] = src[i][j];
3107 }
3108
3109template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3110 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3111 {
3112 if (src == &dst[it.oofs(0)]) return; // in-place
3113 for (size_t i=0; i<it.length_out(); ++i)
3114 dst[it.oofs(i)] = src[i];
3115 }
3116
3117template <typename T> struct add_vec { using type = vtype_t<T>; };
3118template <typename T> struct add_vec<cmplx<T>>
3119 { using type = cmplx<vtype_t<T>>; };
3120template <typename T> using add_vec_t = typename add_vec<T>::type;
3121
3122template<typename Tplan, typename T, typename T0, typename Exec>
3123POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
3124 const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
3125 const bool allow_inplace=true)
3126 {
3127 std::shared_ptr<Tplan> plan;
3128
3129 for (size_t iax=0; iax<axes.size(); ++iax)
3130 {
3131 size_t len=in.shape(axes[iax]);
3132 if ((!plan) || (len!=plan->length()))
3133 plan = get_plan<Tplan>(len);
3134
3135 threading::thread_map(
3136 util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
3137 [&] {
3138 constexpr auto vlen = VLEN<T0>::val;
3139 auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
3140 const auto &tin(iax==0? in : out);
3141 multi_iter<vlen> it(tin, out, axes[iax]);
3142#ifndef POCKETFFT_NO_VECTORS
3143 if (vlen>1)
3144 while (it.remaining()>=vlen)
3145 {
3146 it.advance(vlen);
3147 auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
3148 exec(it, tin, out, tdatav, *plan, fct);
3149 }
3150#endif
3151 while (it.remaining()>0)
3152 {
3153 it.advance(1);
3154 auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
3155 &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
3156 exec(it, tin, out, buf, *plan, fct);
3157 }
3158 }); // end of parallel region
3159 fct = T0(1); // factor has been applied, use 1 for remaining axes
3160 }
3161 }
3162
3163struct ExecC2C
3164 {
3165 bool forward;
3166
3167 template <typename T0, typename T, size_t vlen> void operator () (
3168 const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in,
3169 ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const
3170 {
3171 copy_input(it, in, buf);
3172 plan.exec(buf, fct, forward);
3173 copy_output(it, buf, out);
3174 }
3175 };
3176
3177template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3178 const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3179 {
3180 for (size_t j=0; j<vlen; ++j)
3181 dst[it.oofs(j,0)] = src[0][j];
3182 size_t i=1, i1=1, i2=it.length_out()-1;
3183 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3184 for (size_t j=0; j<vlen; ++j)
3185 {
3186 dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
3187 dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
3188 }
3189 if (i<it.length_out())
3190 for (size_t j=0; j<vlen; ++j)
3191 dst[it.oofs(j,i1)] = src[i][j];
3192 }
3193
3194template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3195 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3196 {
3197 dst[it.oofs(0)] = src[0];
3198 size_t i=1, i1=1, i2=it.length_out()-1;
3199 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3200 {
3201 dst[it.oofs(i1)] = src[i]+src[i+1];
3202 dst[it.oofs(i2)] = src[i]-src[i+1];
3203 }
3204 if (i<it.length_out())
3205 dst[it.oofs(i1)] = src[i];
3206 }
3207
3208struct ExecHartley
3209 {
3210 template <typename T0, typename T, size_t vlen> void operator () (
3211 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
3212 T * buf, const pocketfft_r<T0> &plan, T0 fct) const
3213 {
3214 copy_input(it, in, buf);
3215 plan.exec(buf, fct, true);
3216 copy_hartley(it, buf, out);
3217 }
3218 };
3219
3220struct ExecDcst
3221 {
3222 bool ortho;
3223 int type;
3224 bool cosine;
3225
3226 template <typename T0, typename T, typename Tplan, size_t vlen>
3227 void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in,
3228 ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const
3229 {
3230 copy_input(it, in, buf);
3231 plan.exec(buf, fct, ortho, type, cosine);
3232 copy_output(it, buf, out);
3233 }
3234 };
3235
3236template<typename T> POCKETFFT_NOINLINE void general_r2c(
3237 const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
3238 size_t nthreads)
3239 {
3240 auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
3241 size_t len=in.shape(axis);
3242 threading::thread_map(
3243 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3244 [&] {
3245 constexpr auto vlen = VLEN<T>::val;
3246 auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
3247 multi_iter<vlen> it(in, out, axis);
3248#ifndef POCKETFFT_NO_VECTORS
3249 if (vlen>1)
3250 while (it.remaining()>=vlen)
3251 {
3252 it.advance(vlen);
3253 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3254 copy_input(it, in, tdatav);
3255 plan->exec(tdatav, fct, true);
3256 for (size_t j=0; j<vlen; ++j)
3257 out[it.oofs(j,0)].Set(tdatav[0][j]);
3258 size_t i=1, ii=1;
3259 if (forward)
3260 for (; i<len-1; i+=2, ++ii)
3261 for (size_t j=0; j<vlen; ++j)
3262 out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
3263 else
3264 for (; i<len-1; i+=2, ++ii)
3265 for (size_t j=0; j<vlen; ++j)
3266 out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
3267 if (i<len)
3268 for (size_t j=0; j<vlen; ++j)
3269 out[it.oofs(j,ii)].Set(tdatav[i][j]);
3270 }
3271#endif
3272 while (it.remaining()>0)
3273 {
3274 it.advance(1);
3275 auto tdata = reinterpret_cast<T *>(storage.data());
3276 copy_input(it, in, tdata);
3277 plan->exec(tdata, fct, true);
3278 out[it.oofs(0)].Set(tdata[0]);
3279 size_t i=1, ii=1;
3280 if (forward)
3281 for (; i<len-1; i+=2, ++ii)
3282 out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
3283 else
3284 for (; i<len-1; i+=2, ++ii)
3285 out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
3286 if (i<len)
3287 out[it.oofs(ii)].Set(tdata[i]);
3288 }
3289 }); // end of parallel region
3290 }
3291template<typename T> POCKETFFT_NOINLINE void general_c2r(
3292 const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
3293 size_t nthreads)
3294 {
3295 auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
3296 size_t len=out.shape(axis);
3297 threading::thread_map(
3298 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3299 [&] {
3300 constexpr auto vlen = VLEN<T>::val;
3301 auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
3302 multi_iter<vlen> it(in, out, axis);
3303#ifndef POCKETFFT_NO_VECTORS
3304 if (vlen>1)
3305 while (it.remaining()>=vlen)
3306 {
3307 it.advance(vlen);
3308 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3309 for (size_t j=0; j<vlen; ++j)
3310 tdatav[0][j]=in[it.iofs(j,0)].r;
3311 {
3312 size_t i=1, ii=1;
3313 if (forward)
3314 for (; i<len-1; i+=2, ++ii)
3315 for (size_t j=0; j<vlen; ++j)
3316 {
3317 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3318 tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
3319 }
3320 else
3321 for (; i<len-1; i+=2, ++ii)
3322 for (size_t j=0; j<vlen; ++j)
3323 {
3324 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3325 tdatav[i+1][j] = in[it.iofs(j,ii)].i;
3326 }
3327 if (i<len)
3328 for (size_t j=0; j<vlen; ++j)
3329 tdatav[i][j] = in[it.iofs(j,ii)].r;
3330 }
3331 plan->exec(tdatav, fct, false);
3332 copy_output(it, tdatav, out);
3333 }
3334#endif
3335 while (it.remaining()>0)
3336 {
3337 it.advance(1);
3338 auto tdata = reinterpret_cast<T *>(storage.data());
3339 tdata[0]=in[it.iofs(0)].r;
3340 {
3341 size_t i=1, ii=1;
3342 if (forward)
3343 for (; i<len-1; i+=2, ++ii)
3344 {
3345 tdata[i ] = in[it.iofs(ii)].r;
3346 tdata[i+1] = -in[it.iofs(ii)].i;
3347 }
3348 else
3349 for (; i<len-1; i+=2, ++ii)
3350 {
3351 tdata[i ] = in[it.iofs(ii)].r;
3352 tdata[i+1] = in[it.iofs(ii)].i;
3353 }
3354 if (i<len)
3355 tdata[i] = in[it.iofs(ii)].r;
3356 }
3357 plan->exec(tdata, fct, false);
3358 copy_output(it, tdata, out);
3359 }
3360 }); // end of parallel region
3361 }
3362
3363struct ExecR2R
3364 {
3365 bool r2h, forward;
3366
3367 template <typename T0, typename T, size_t vlen> void operator () (
3368 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf,
3369 const pocketfft_r<T0> &plan, T0 fct) const
3370 {
3371 copy_input(it, in, buf);
3372 if ((!r2h) && forward)
3373 for (size_t i=2; i<it.length_out(); i+=2)
3374 buf[i] = -buf[i];
3375 plan.exec(buf, fct, r2h);
3376 if (r2h && (!forward))
3377 for (size_t i=2; i<it.length_out(); i+=2)
3378 buf[i] = -buf[i];
3379 copy_output(it, buf, out);
3380 }
3381 };
3382
3383template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
3384 const stride_t &stride_out, const shape_t &axes, bool forward,
3385 const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
3386 size_t nthreads=1)
3387 {
3388 if (util::prod(shape)==0) return;
3389 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3390 cndarr<cmplx<T>> ain(data_in, shape, stride_in);
3391 ndarr<cmplx<T>> aout(data_out, shape, stride_out);
3392 general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward});
3393 }
3394
3395template<typename T> void dct(const shape_t &shape,
3396 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3397 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3398 {
3399 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
3400 if (util::prod(shape)==0) return;
3401 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3402 cndarr<T> ain(data_in, shape, stride_in);
3403 ndarr<T> aout(data_out, shape, stride_out);
3404 const ExecDcst exec{ortho, type, true};
3405 if (type==1)
3406 general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec);
3407 else if (type==4)
3408 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3409 else
3410 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3411 }
3412
3413template<typename T> void dst(const shape_t &shape,
3414 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3415 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3416 {
3417 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
3418 if (util::prod(shape)==0) return;
3419 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3420 cndarr<T> ain(data_in, shape, stride_in);
3421 ndarr<T> aout(data_out, shape, stride_out);
3422 const ExecDcst exec{ortho, type, false};
3423 if (type==1)
3424 general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec);
3425 else if (type==4)
3426 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3427 else
3428 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3429 }
3430
3431template<typename T> void r2c(const shape_t &shape_in,
3432 const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3433 bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3434 size_t nthreads=1)
3435 {
3436 if (util::prod(shape_in)==0) return;
3437 util::sanity_check(shape_in, stride_in, stride_out, false, axis);
3438 cndarr<T> ain(data_in, shape_in, stride_in);
3439 shape_t shape_out(shape_in);
3440 shape_out[axis] = shape_in[axis]/2 + 1;
3441 ndarr<cmplx<T>> aout(data_out, shape_out, stride_out);
3442 general_r2c(ain, aout, axis, forward, fct, nthreads);
3443 }
3444
3445template<typename T> void r2c(const shape_t &shape_in,
3446 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3447 bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3448 size_t nthreads=1)
3449 {
3450 if (util::prod(shape_in)==0) return;
3451 util::sanity_check(shape_in, stride_in, stride_out, false, axes);
3452 r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
3453 fct, nthreads);
3454 if (axes.size()==1) return;
3455
3456 shape_t shape_out(shape_in);
3457 shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
3458 auto newaxes = shape_t{axes.begin(), --axes.end()};
3459 c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
3460 T(1), nthreads);
3461 }
3462
3463template<typename T> void c2r(const shape_t &shape_out,
3464 const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3465 bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3466 size_t nthreads=1)
3467 {
3468 if (util::prod(shape_out)==0) return;
3469 util::sanity_check(shape_out, stride_in, stride_out, false, axis);
3470 shape_t shape_in(shape_out);
3471 shape_in[axis] = shape_out[axis]/2 + 1;
3472 cndarr<cmplx<T>> ain(data_in, shape_in, stride_in);
3473 ndarr<T> aout(data_out, shape_out, stride_out);
3474 general_c2r(ain, aout, axis, forward, fct, nthreads);
3475 }
3476
3477template<typename T> void c2r(const shape_t &shape_out,
3478 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3479 bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3480 size_t nthreads=1)
3481 {
3482 if (util::prod(shape_out)==0) return;
3483 if (axes.size()==1)
3484 return c2r(shape_out, stride_in, stride_out, axes[0], forward,
3485 data_in, data_out, fct, nthreads);
3486 util::sanity_check(shape_out, stride_in, stride_out, false, axes);
3487 auto shape_in = shape_out;
3488 shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
3489 auto nval = util::prod(shape_in);
3490 stride_t stride_inter(shape_in.size());
3491 stride_inter.back() = sizeof(cmplx<T>);
3492 for (int i=int(shape_in.size())-2; i>=0; --i)
3493 stride_inter[size_t(i)] =
3494 stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
3495 arr<std::complex<T>> tmp(nval);
3496 auto newaxes = shape_t{axes.begin(), --axes.end()};
3497 c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
3498 T(1), nthreads);
3499 c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
3500 tmp.data(), data_out, fct, nthreads);
3501 }
3502
3503template<typename T> void r2r_fftpack(const shape_t &shape,
3504 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3505 bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct,
3506 size_t nthreads=1)
3507 {
3508 if (util::prod(shape)==0) return;
3509 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3510 cndarr<T> ain(data_in, shape, stride_in);
3511 ndarr<T> aout(data_out, shape, stride_out);
3512 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads,
3513 ExecR2R{real2hermitian, forward});
3514 }
3515
3516template<typename T> void r2r_separable_hartley(const shape_t &shape,
3517 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3518 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3519 {
3520 if (util::prod(shape)==0) return;
3521 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3522 cndarr<T> ain(data_in, shape, stride_in);
3523 ndarr<T> aout(data_out, shape, stride_out);
3524 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
3525 false);
3526 }
3527
3528template<typename T> void r2r_genuine_hartley(const shape_t &shape,
3529 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3530 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3531 {
3532 if (util::prod(shape)==0) return;
3533 if (axes.size()==1)
3534 return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in,
3535 data_out, fct, nthreads);
3536 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3537 shape_t tshp(shape);
3538 tshp[axes.back()] = tshp[axes.back()]/2+1;
3539 arr<std::complex<T>> tdata(util::prod(tshp));
3540 stride_t tstride(shape.size());
3541 tstride.back()=sizeof(std::complex<T>);
3542 for (size_t i=tstride.size()-1; i>0; --i)
3543 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3544 r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);
3545 cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
3546 ndarr<T> aout(data_out, shape, stride_out);
3547 simple_iter iin(atmp);
3548 rev_iter iout(aout, axes);
3549 while(iin.remaining()>0)
3550 {
3551 auto v = atmp[iin.ofs()];
3552 aout[iout.ofs()] = v.r+v.i;
3553 aout[iout.rev_ofs()] = v.r-v.i;
3554 iin.advance(); iout.advance();
3555 }
3556 }
3557
3558} // namespace detail
3559
3560using detail::FORWARD;
3561using detail::BACKWARD;
3562using detail::shape_t;
3563using detail::stride_t;
3564using detail::c2c;
3565using detail::c2r;
3566using detail::r2c;
3567using detail::r2r_fftpack;
3568using detail::r2r_separable_hartley;
3569using detail::r2r_genuine_hartley;
3570using detail::dct;
3571using detail::dst;
3572
3573} // namespace pocketfft
3574
3575#undef POCKETFFT_NOINLINE
3576#undef POCKETFFT_RESTRICT
3577
3578#endif // POCKETFFT_HDRONLY_H
3579