1 | /* |
2 | * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved. |
3 | * This file is part of KISS FFT - https://github.com/mborgerding/kissfft |
4 | * |
5 | * SPDX-License-Identifier: BSD-3-Clause |
6 | * See COPYING file for more information. |
7 | */ |
8 | |
9 | |
10 | #include "_kiss_fft_guts.h" |
11 | /* The guts header contains all the multiplication and addition macros that are defined for |
12 | fixed or floating point complex numbers. It also delares the kf_ internal functions. |
13 | */ |
14 | |
15 | static void kf_bfly2( |
16 | kiss_fft_cpx * Fout, |
17 | const size_t fstride, |
18 | const kiss_fft_cfg st, |
19 | int m |
20 | ) |
21 | { |
22 | kiss_fft_cpx * Fout2; |
23 | kiss_fft_cpx * tw1 = st->twiddles; |
24 | kiss_fft_cpx t; |
25 | Fout2 = Fout + m; |
26 | do{ |
27 | C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2); |
28 | |
29 | C_MUL (t, *Fout2 , *tw1); |
30 | tw1 += fstride; |
31 | C_SUB( *Fout2 , *Fout , t ); |
32 | C_ADDTO( *Fout , t ); |
33 | ++Fout2; |
34 | ++Fout; |
35 | }while (--m); |
36 | } |
37 | |
38 | static void kf_bfly4( |
39 | kiss_fft_cpx * Fout, |
40 | const size_t fstride, |
41 | const kiss_fft_cfg st, |
42 | const size_t m |
43 | ) |
44 | { |
45 | kiss_fft_cpx *tw1,*tw2,*tw3; |
46 | kiss_fft_cpx scratch[6]; |
47 | size_t k=m; |
48 | const size_t m2=2*m; |
49 | const size_t m3=3*m; |
50 | |
51 | |
52 | tw3 = tw2 = tw1 = st->twiddles; |
53 | |
54 | do { |
55 | C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4); |
56 | |
57 | C_MUL(scratch[0],Fout[m] , *tw1 ); |
58 | C_MUL(scratch[1],Fout[m2] , *tw2 ); |
59 | C_MUL(scratch[2],Fout[m3] , *tw3 ); |
60 | |
61 | C_SUB( scratch[5] , *Fout, scratch[1] ); |
62 | C_ADDTO(*Fout, scratch[1]); |
63 | C_ADD( scratch[3] , scratch[0] , scratch[2] ); |
64 | C_SUB( scratch[4] , scratch[0] , scratch[2] ); |
65 | C_SUB( Fout[m2], *Fout, scratch[3] ); |
66 | tw1 += fstride; |
67 | tw2 += fstride*2; |
68 | tw3 += fstride*3; |
69 | C_ADDTO( *Fout , scratch[3] ); |
70 | |
71 | if(st->inverse) { |
72 | Fout[m].r = scratch[5].r - scratch[4].i; |
73 | Fout[m].i = scratch[5].i + scratch[4].r; |
74 | Fout[m3].r = scratch[5].r + scratch[4].i; |
75 | Fout[m3].i = scratch[5].i - scratch[4].r; |
76 | }else{ |
77 | Fout[m].r = scratch[5].r + scratch[4].i; |
78 | Fout[m].i = scratch[5].i - scratch[4].r; |
79 | Fout[m3].r = scratch[5].r - scratch[4].i; |
80 | Fout[m3].i = scratch[5].i + scratch[4].r; |
81 | } |
82 | ++Fout; |
83 | }while(--k); |
84 | } |
85 | |
86 | static void kf_bfly3( |
87 | kiss_fft_cpx * Fout, |
88 | const size_t fstride, |
89 | const kiss_fft_cfg st, |
90 | size_t m |
91 | ) |
92 | { |
93 | size_t k=m; |
94 | const size_t m2 = 2*m; |
95 | kiss_fft_cpx *tw1,*tw2; |
96 | kiss_fft_cpx scratch[5]; |
97 | kiss_fft_cpx epi3; |
98 | epi3 = st->twiddles[fstride*m]; |
99 | |
100 | tw1=tw2=st->twiddles; |
101 | |
102 | do{ |
103 | C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3); |
104 | |
105 | C_MUL(scratch[1],Fout[m] , *tw1); |
106 | C_MUL(scratch[2],Fout[m2] , *tw2); |
107 | |
108 | C_ADD(scratch[3],scratch[1],scratch[2]); |
109 | C_SUB(scratch[0],scratch[1],scratch[2]); |
110 | tw1 += fstride; |
111 | tw2 += fstride*2; |
112 | |
113 | Fout[m].r = Fout->r - HALF_OF(scratch[3].r); |
114 | Fout[m].i = Fout->i - HALF_OF(scratch[3].i); |
115 | |
116 | C_MULBYSCALAR( scratch[0] , epi3.i ); |
117 | |
118 | C_ADDTO(*Fout,scratch[3]); |
119 | |
120 | Fout[m2].r = Fout[m].r + scratch[0].i; |
121 | Fout[m2].i = Fout[m].i - scratch[0].r; |
122 | |
123 | Fout[m].r -= scratch[0].i; |
124 | Fout[m].i += scratch[0].r; |
125 | |
126 | ++Fout; |
127 | }while(--k); |
128 | } |
129 | |
130 | static void kf_bfly5( |
131 | kiss_fft_cpx * Fout, |
132 | const size_t fstride, |
133 | const kiss_fft_cfg st, |
134 | int m |
135 | ) |
136 | { |
137 | kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; |
138 | int u; |
139 | kiss_fft_cpx scratch[13]; |
140 | kiss_fft_cpx * twiddles = st->twiddles; |
141 | kiss_fft_cpx *tw; |
142 | kiss_fft_cpx ya,yb; |
143 | ya = twiddles[fstride*m]; |
144 | yb = twiddles[fstride*2*m]; |
145 | |
146 | Fout0=Fout; |
147 | Fout1=Fout0+m; |
148 | Fout2=Fout0+2*m; |
149 | Fout3=Fout0+3*m; |
150 | Fout4=Fout0+4*m; |
151 | |
152 | tw=st->twiddles; |
153 | for ( u=0; u<m; ++u ) { |
154 | C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5); |
155 | scratch[0] = *Fout0; |
156 | |
157 | C_MUL(scratch[1] ,*Fout1, tw[u*fstride]); |
158 | C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]); |
159 | C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]); |
160 | C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]); |
161 | |
162 | C_ADD( scratch[7],scratch[1],scratch[4]); |
163 | C_SUB( scratch[10],scratch[1],scratch[4]); |
164 | C_ADD( scratch[8],scratch[2],scratch[3]); |
165 | C_SUB( scratch[9],scratch[2],scratch[3]); |
166 | |
167 | Fout0->r += scratch[7].r + scratch[8].r; |
168 | Fout0->i += scratch[7].i + scratch[8].i; |
169 | |
170 | scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); |
171 | scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); |
172 | |
173 | scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i); |
174 | scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i); |
175 | |
176 | C_SUB(*Fout1,scratch[5],scratch[6]); |
177 | C_ADD(*Fout4,scratch[5],scratch[6]); |
178 | |
179 | scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); |
180 | scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); |
181 | scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i); |
182 | scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i); |
183 | |
184 | C_ADD(*Fout2,scratch[11],scratch[12]); |
185 | C_SUB(*Fout3,scratch[11],scratch[12]); |
186 | |
187 | ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; |
188 | } |
189 | } |
190 | |
191 | /* perform the butterfly for one stage of a mixed radix FFT */ |
192 | static void kf_bfly_generic( |
193 | kiss_fft_cpx * Fout, |
194 | const size_t fstride, |
195 | const kiss_fft_cfg st, |
196 | int m, |
197 | int p |
198 | ) |
199 | { |
200 | int u,k,q1,q; |
201 | kiss_fft_cpx * twiddles = st->twiddles; |
202 | kiss_fft_cpx t; |
203 | int Norig = st->nfft; |
204 | |
205 | kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p); |
206 | if (scratch == NULL){ |
207 | KISS_FFT_ERROR("Memory allocation failed." ); |
208 | return; |
209 | } |
210 | |
211 | for ( u=0; u<m; ++u ) { |
212 | k=u; |
213 | for ( q1=0 ; q1<p ; ++q1 ) { |
214 | scratch[q1] = Fout[ k ]; |
215 | C_FIXDIV(scratch[q1],p); |
216 | k += m; |
217 | } |
218 | |
219 | k=u; |
220 | for ( q1=0 ; q1<p ; ++q1 ) { |
221 | int twidx=0; |
222 | Fout[ k ] = scratch[0]; |
223 | for (q=1;q<p;++q ) { |
224 | twidx += fstride * k; |
225 | if (twidx>=Norig) twidx-=Norig; |
226 | C_MUL(t,scratch[q] , twiddles[twidx] ); |
227 | C_ADDTO( Fout[ k ] ,t); |
228 | } |
229 | k += m; |
230 | } |
231 | } |
232 | KISS_FFT_TMP_FREE(scratch); |
233 | } |
234 | |
235 | static |
236 | void kf_work( |
237 | kiss_fft_cpx * Fout, |
238 | const kiss_fft_cpx * f, |
239 | const size_t fstride, |
240 | int in_stride, |
241 | int * factors, |
242 | const kiss_fft_cfg st |
243 | ) |
244 | { |
245 | kiss_fft_cpx * Fout_beg=Fout; |
246 | const int p=*factors++; /* the radix */ |
247 | const int m=*factors++; /* stage's fft length/p */ |
248 | const kiss_fft_cpx * Fout_end = Fout + p*m; |
249 | |
250 | #ifdef _OPENMP |
251 | // use openmp extensions at the |
252 | // top-level (not recursive) |
253 | if (fstride==1 && p<=5 && m!=1) |
254 | { |
255 | int k; |
256 | |
257 | // execute the p different work units in different threads |
258 | # pragma omp parallel for |
259 | for (k=0;k<p;++k) |
260 | kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st); |
261 | // all threads have joined by this point |
262 | |
263 | switch (p) { |
264 | case 2: kf_bfly2(Fout,fstride,st,m); break; |
265 | case 3: kf_bfly3(Fout,fstride,st,m); break; |
266 | case 4: kf_bfly4(Fout,fstride,st,m); break; |
267 | case 5: kf_bfly5(Fout,fstride,st,m); break; |
268 | default: kf_bfly_generic(Fout,fstride,st,m,p); break; |
269 | } |
270 | return; |
271 | } |
272 | #endif |
273 | |
274 | if (m==1) { |
275 | do{ |
276 | *Fout = *f; |
277 | f += fstride*in_stride; |
278 | }while(++Fout != Fout_end ); |
279 | }else{ |
280 | do{ |
281 | // recursive call: |
282 | // DFT of size m*p performed by doing |
283 | // p instances of smaller DFTs of size m, |
284 | // each one takes a decimated version of the input |
285 | kf_work( Fout , f, fstride*p, in_stride, factors,st); |
286 | f += fstride*in_stride; |
287 | }while( (Fout += m) != Fout_end ); |
288 | } |
289 | |
290 | Fout=Fout_beg; |
291 | |
292 | // recombine the p smaller DFTs |
293 | switch (p) { |
294 | case 2: kf_bfly2(Fout,fstride,st,m); break; |
295 | case 3: kf_bfly3(Fout,fstride,st,m); break; |
296 | case 4: kf_bfly4(Fout,fstride,st,m); break; |
297 | case 5: kf_bfly5(Fout,fstride,st,m); break; |
298 | default: kf_bfly_generic(Fout,fstride,st,m,p); break; |
299 | } |
300 | } |
301 | |
302 | /* facbuf is populated by p1,m1,p2,m2, ... |
303 | where |
304 | p[i] * m[i] = m[i-1] |
305 | m0 = n */ |
306 | static |
307 | void kf_factor(int n,int * facbuf) |
308 | { |
309 | int p=4; |
310 | double floor_sqrt; |
311 | floor_sqrt = floor( sqrt((double)n) ); |
312 | |
313 | /*factor out powers of 4, powers of 2, then any remaining primes */ |
314 | do { |
315 | while (n % p) { |
316 | switch (p) { |
317 | case 4: p = 2; break; |
318 | case 2: p = 3; break; |
319 | default: p += 2; break; |
320 | } |
321 | if (p > floor_sqrt) |
322 | p = n; /* no more factors, skip to end */ |
323 | } |
324 | n /= p; |
325 | *facbuf++ = p; |
326 | *facbuf++ = n; |
327 | } while (n > 1); |
328 | } |
329 | |
330 | /* |
331 | * |
332 | * User-callable function to allocate all necessary storage space for the fft. |
333 | * |
334 | * The return value is a contiguous block of memory, allocated with malloc. As such, |
335 | * It can be freed with free(), rather than a kiss_fft-specific function. |
336 | * */ |
337 | kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem ) |
338 | { |
339 | KISS_FFT_ALIGN_CHECK(mem) |
340 | |
341 | kiss_fft_cfg st=NULL; |
342 | size_t memneeded = KISS_FFT_ALIGN_SIZE_UP(sizeof(struct kiss_fft_state) |
343 | + sizeof(kiss_fft_cpx)*(nfft-1)); /* twiddle factors*/ |
344 | |
345 | if ( lenmem==NULL ) { |
346 | st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded ); |
347 | }else{ |
348 | if (mem != NULL && *lenmem >= memneeded) |
349 | st = (kiss_fft_cfg)mem; |
350 | *lenmem = memneeded; |
351 | } |
352 | if (st) { |
353 | int i; |
354 | st->nfft=nfft; |
355 | st->inverse = inverse_fft; |
356 | |
357 | for (i=0;i<nfft;++i) { |
358 | const double pi=3.141592653589793238462643383279502884197169399375105820974944; |
359 | double phase = -2*pi*i / nfft; |
360 | if (st->inverse) |
361 | phase *= -1; |
362 | kf_cexp(st->twiddles+i, phase ); |
363 | } |
364 | |
365 | kf_factor(nfft,st->factors); |
366 | } |
367 | return st; |
368 | } |
369 | |
370 | |
371 | void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride) |
372 | { |
373 | if (fin == fout) { |
374 | //NOTE: this is not really an in-place FFT algorithm. |
375 | //It just performs an out-of-place FFT into a temp buffer |
376 | if (fout == NULL){ |
377 | KISS_FFT_ERROR("fout buffer NULL." ); |
378 | return; |
379 | } |
380 | |
381 | kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft); |
382 | if (tmpbuf == NULL){ |
383 | KISS_FFT_ERROR("Memory allocation error." ); |
384 | return; |
385 | } |
386 | |
387 | |
388 | |
389 | kf_work(tmpbuf,fin,1,in_stride, st->factors,st); |
390 | memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft); |
391 | KISS_FFT_TMP_FREE(tmpbuf); |
392 | }else{ |
393 | kf_work( fout, fin, 1,in_stride, st->factors,st ); |
394 | } |
395 | } |
396 | |
397 | void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) |
398 | { |
399 | kiss_fft_stride(cfg,fin,fout,1); |
400 | } |
401 | |
402 | |
403 | void kiss_fft_cleanup(void) |
404 | { |
405 | // nothing needed any more |
406 | } |
407 | |
408 | int kiss_fft_next_fast_size(int n) |
409 | { |
410 | while(1) { |
411 | int m=n; |
412 | while ( (m%2) == 0 ) m/=2; |
413 | while ( (m%3) == 0 ) m/=3; |
414 | while ( (m%5) == 0 ) m/=5; |
415 | if (m<=1) |
416 | break; /* n is completely factorable by twos, threes, and fives */ |
417 | n++; |
418 | } |
419 | return n; |
420 | } |
421 | |