Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_rfft_float32.c
1 /*
2  * Copyright 2014-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_rfft_float32.c
45  */
46 
47 #include "NE10_types.h"
48 #include "NE10_macros.h"
49 #include "NE10_fft.h"
50 #include "NE10_dsp.h"
51 
52 #if (NE10_UNROLL_LEVEL > 0)
53 
/*
 * First-stage radix-8 butterfly for the real-to-complex (r2c) FFT, C version.
 *
 * Fin and Fout are reinterpreted as arrays of real ne10_float32_t values.
 * Each of the fstride iterations reads 8 real inputs spaced in_step
 * (= nfft/8) apart and writes 8 packed reals contiguously. mstride is
 * accepted for interface uniformity with the other stage kernels but is
 * not used here.
 */
void ne10_radix8_r2c_c (ne10_fft_cpx_float32_t *Fout,
                        const ne10_fft_cpx_float32_t *Fin,
                        const ne10_int32_t fstride,
                        const ne10_int32_t mstride,
                        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft >> 3; // spacing between the 8 inputs of one butterfly
    ne10_int32_t f_count;

    ne10_float32_t scratch_in[8];
    ne10_float32_t scratch [4];

    /* real pointers */
    const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
    ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
    Fout_r ++; // always leave the first real empty

    for (f_count = fstride; f_count; f_count --)
    {
        // radix-2 pre-combine: sums/differences of inputs k and k+4
        scratch_in[0] = Fin_r[in_step * 0] + Fin_r[in_step * (0 + 4)];
        scratch_in[1] = Fin_r[in_step * 0] - Fin_r[in_step * (0 + 4)];
        scratch_in[2] = Fin_r[in_step * 1] + Fin_r[in_step * (1 + 4)];
        scratch_in[3] = Fin_r[in_step * 1] - Fin_r[in_step * (1 + 4)];
        scratch_in[4] = Fin_r[in_step * 2] + Fin_r[in_step * (2 + 4)];
        scratch_in[5] = Fin_r[in_step * 2] - Fin_r[in_step * (2 + 4)];
        scratch_in[6] = Fin_r[in_step * 3] + Fin_r[in_step * (3 + 4)];
        scratch_in[7] = Fin_r[in_step * 3] - Fin_r[in_step * (3 + 4)];

        // apply the real radix-8 twiddle constants (defined in NE10_fft.h;
        // presumably +/-cos(pi/4) -- confirm against their definition)
        scratch_in[3] *= TW_81_F32;
        scratch_in[7] *= TW_81N_F32;

        // radix 2 butterfly
        scratch[0] = scratch_in[0] + scratch_in[4];
        scratch[1] = scratch_in[2] + scratch_in[6];
        scratch[2] = scratch_in[7] - scratch_in[3];
        scratch[3] = scratch_in[3] + scratch_in[7];

        Fout_r[0] = scratch [0] + scratch [1];
        Fout_r[7] = scratch [0] - scratch [1];

        Fout_r[1] = scratch_in[1] + scratch [3];
        Fout_r[5] = scratch_in[1] - scratch [3];

        Fout_r[2] = scratch [2] - scratch_in[5];
        Fout_r[6] = scratch [2] + scratch_in[5];

        Fout_r[3] = scratch_in[0] - scratch_in[4];

        Fout_r[4] = scratch_in[6] - scratch_in[2];

        // inputs advance one real at a time (interleaved columns);
        // outputs are packed in groups of 8
        Fin_r ++;
        Fout_r += 8;
    }
}
108 
/*
 * Last-stage radix-8 butterfly for the complex-to-real (c2r) inverse FFT,
 * C version; the inverse of ne10_radix8_r2c_c.
 *
 * Reads 8 packed reals per iteration and scatters 8 reals at stride
 * in_step (= nfft/8). When NE10_DSP_RFFT_SCALING is defined, every output
 * is additionally scaled by 1/nfft. mstride is unused here.
 */
void ne10_radix8_c2r_c (ne10_fft_cpx_float32_t *Fout,
                        const ne10_fft_cpx_float32_t *Fin,
                        const ne10_int32_t fstride,
                        const ne10_int32_t mstride,
                        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft >> 3;
    ne10_int32_t f_count;

    ne10_float32_t scratch_in[8];

    // 1/nfft scaling constant; only consumed when scaling is compiled in
    const ne10_float32_t one_by_N = 1.0 / nfft;

    /* real pointers */
    const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
    ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;

    for (f_count = fstride; f_count; f_count --)
    {
        // invert the forward output map of ne10_radix8_r2c_c
        // (doubled terms undo the shared sums of the forward combine)
        scratch_in[0] = Fin_r[0] + Fin_r[3] + Fin_r[3] + Fin_r[7];
        scratch_in[1] = Fin_r[1] + Fin_r[1] + Fin_r[5] + Fin_r[5];
        scratch_in[2] = Fin_r[0] - Fin_r[4] - Fin_r[4] - Fin_r[7];
        scratch_in[3] = Fin_r[1] - Fin_r[2] - Fin_r[5] - Fin_r[6];
        scratch_in[4] = Fin_r[0] - Fin_r[3] - Fin_r[3] + Fin_r[7];
        scratch_in[5] = - Fin_r[2] - Fin_r[2] + Fin_r[6] + Fin_r[6];
        scratch_in[6] = Fin_r[0] + Fin_r[4] + Fin_r[4] - Fin_r[7];
        scratch_in[7] = Fin_r[1] + Fin_r[2] - Fin_r[5] + Fin_r[6];

        // inverse of the forward twiddle multiplies (which used *=)
        scratch_in[3] /= TW_81_F32;
        scratch_in[7] /= TW_81N_F32;

        // scatter outputs across the 8 strided columns
        Fout_r[0 * in_step] = scratch_in[0] + scratch_in[1];
        Fout_r[4 * in_step] = scratch_in[0] - scratch_in[1];
        Fout_r[1 * in_step] = scratch_in[2] + scratch_in[3];
        Fout_r[5 * in_step] = scratch_in[2] - scratch_in[3];
        Fout_r[2 * in_step] = scratch_in[4] + scratch_in[5];
        Fout_r[6 * in_step] = scratch_in[4] - scratch_in[5];
        Fout_r[3 * in_step] = scratch_in[6] + scratch_in[7];
        Fout_r[7 * in_step] = scratch_in[6] - scratch_in[7];

#if defined(NE10_DSP_RFFT_SCALING)
        Fout_r[0 * in_step] *= one_by_N;
        Fout_r[4 * in_step] *= one_by_N;
        Fout_r[1 * in_step] *= one_by_N;
        Fout_r[5 * in_step] *= one_by_N;
        Fout_r[2 * in_step] *= one_by_N;
        Fout_r[6 * in_step] *= one_by_N;
        Fout_r[3 * in_step] *= one_by_N;
        Fout_r[7 * in_step] *= one_by_N;
#endif

        // inputs packed in 8s; outputs interleaved one real at a time
        Fin_r += 8;
        Fout_r ++;
    }
}
164 
165 NE10_INLINE void ne10_radix4_r2c_c (ne10_fft_cpx_float32_t *Fout,
166  const ne10_fft_cpx_float32_t *Fin,
167  const ne10_int32_t fstride,
168  const ne10_int32_t mstride,
169  const ne10_int32_t nfft)
170 {
171  const ne10_int32_t in_step = nfft >> 2;
172  ne10_int32_t f_count;
173 
174  ne10_float32_t scratch_in [4];
175  ne10_float32_t scratch_out[4];
176 
177  /* real pointers */
178  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
179  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
180  Fout_r ++; // always leave the first real empty
181 
182  for (f_count = fstride; f_count; f_count --)
183  {
184  scratch_in[0] = Fin_r[0 * in_step];
185  scratch_in[1] = Fin_r[1 * in_step];
186  scratch_in[2] = Fin_r[2 * in_step];
187  scratch_in[3] = Fin_r[3 * in_step];
188 
189  // NE10_PRINT_Q_VECTOR(scratch_in);
190 
191  NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
192 
193  // NE10_PRINT_Q_VECTOR(scratch_out);
194 
195  Fout_r[0] = scratch_out[0];
196  Fout_r[1] = scratch_out[1];
197  Fout_r[2] = scratch_out[2];
198  Fout_r[3] = scratch_out[3];
199 
200  Fin_r ++;
201  Fout_r += 4;
202  }
203 }
204 
205 NE10_INLINE void ne10_radix4_c2r_c (ne10_fft_cpx_float32_t *Fout,
206  const ne10_fft_cpx_float32_t *Fin,
207  const ne10_int32_t fstride,
208  const ne10_int32_t mstride,
209  const ne10_int32_t nfft)
210 {
211  ne10_int32_t f_count;
212  const ne10_int32_t in_step = nfft >> 2;
213  ne10_float32_t scratch_in [4];
214  ne10_float32_t scratch_out[4];
215 
216  const ne10_float32_t one_by_N = 1.0 / nfft;
217 
218  /* real pointers */
219  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
220  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
221 
222  for (f_count = fstride; f_count; f_count --)
223  {
224  scratch_in[0] = Fin_r[0];
225  scratch_in[1] = Fin_r[1];
226  scratch_in[2] = Fin_r[2];
227  scratch_in[3] = Fin_r[3];
228 
229  // NE10_PRINT_Q_VECTOR(scratch_in);
230 
231  NE10_FFT_C2R_RCR_4R(scratch_out,scratch_in);
232 
233  // NE10_PRINT_Q_VECTOR(scratch_out);
234 
235 #if defined(NE10_DSP_RFFT_SCALING)
236  scratch_out[0] *= one_by_N;
237  scratch_out[1] *= one_by_N;
238  scratch_out[2] *= one_by_N;
239  scratch_out[3] *= one_by_N;
240 #endif
241 
242  // store
243  Fout_r[0 * in_step] = scratch_out[0];
244  Fout_r[1 * in_step] = scratch_out[1];
245  Fout_r[2 * in_step] = scratch_out[2];
246  Fout_r[3 * in_step] = scratch_out[3];
247 
248  Fin_r += 4;
249  Fout_r ++;
250  }
251 }
252 
253 NE10_INLINE void ne10_radix4_r2c_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
254  const ne10_float32_t *Fin_r,
255  const ne10_int32_t out_step,
256  const ne10_int32_t in_step,
257  const ne10_fft_cpx_float32_t *twiddles)
258 {
259  ne10_float32_t scratch_out[4];
260  ne10_float32_t scratch_in [4];
261 
262  // load
263  scratch_in[0] = Fin_r[0 * in_step];
264  scratch_in[1] = Fin_r[1 * in_step];
265  scratch_in[2] = Fin_r[2 * in_step];
266  scratch_in[3] = Fin_r[3 * in_step];
267 
268  // NE10_PRINT_Q_VECTOR(scratch_in);
269 
270  NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
271 
272  // NE10_PRINT_Q_VECTOR(scratch_out);
273 
274  // store
275  Fout_r[ 0] = scratch_out[0];
276  Fout_r[ (out_step << 1) - 1] = scratch_out[1];
277  Fout_r[ (out_step << 1) ] = scratch_out[2];
278  Fout_r[2 * (out_step << 1) - 1] = scratch_out[3];
279 }
280 
281 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
282  const ne10_float32_t *Fin_r,
283  const ne10_int32_t out_step,
284  const ne10_int32_t in_step,
285  const ne10_fft_cpx_float32_t *twiddles)
286 {
287  ne10_float32_t scratch [8];
288  ne10_float32_t scratch_in_r [4];
289  ne10_float32_t scratch_out_r[4];
290 
291  // load
292  scratch_in_r[0] = Fin_r[0 ];
293  scratch_in_r[1] = Fin_r[1*(out_step<<1)-1];
294  scratch_in_r[2] = Fin_r[1*(out_step<<1) ];
295  scratch_in_r[3] = Fin_r[2*(out_step<<1)-1];
296 
297  // NE10_PRINT_Q_VECTOR(scratch_in_r);
298 
299  // radix 4 butterfly without twiddles
300  scratch[0] = scratch_in_r[0] + scratch_in_r[3];
301  scratch[1] = scratch_in_r[0] - scratch_in_r[3];
302  scratch[2] = scratch_in_r[1] + scratch_in_r[1];
303  scratch[3] = scratch_in_r[2] + scratch_in_r[2];
304 
305  scratch_out_r[0] = scratch[0] + scratch[2];
306  scratch_out_r[1] = scratch[1] - scratch[3];
307  scratch_out_r[2] = scratch[0] - scratch[2];
308  scratch_out_r[3] = scratch[1] + scratch[3];
309 
310  // NE10_PRINT_Q_VECTOR(scratch_out_r);
311 
312  // store
313  Fout_r[0 * in_step] = scratch_out_r[0];
314  Fout_r[1 * in_step] = scratch_out_r[1];
315  Fout_r[2 * in_step] = scratch_out_r[2];
316  Fout_r[3 * in_step] = scratch_out_r[3];
317 
318 }
319 
/*
 * Middle ("other") butterflies of one with-twiddles r2c stage, C version.
 *
 * Runs the (out_step/2 - 1) butterflies between the first and last special
 * cases. Each iteration reads 4 complex pairs, multiplies the three
 * non-trivial inputs by their twiddles (twiddles[k * out_step]), combines
 * with NE10_FFT_R2C_CC_CC, then writes two pairs forward through Fout_r and
 * two pairs backward through Fout_b, which starts near the end of the packed
 * block and walks in reverse (the mirrored half of the packed spectrum).
 */
NE10_INLINE void ne10_radix4_r2c_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
        const ne10_float32_t *Fin_r,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_float32_t *Fout_b = Fout_r + (((out_step<<1)-1)<<1) - 2; // reversed
    ne10_fft_cpx_float32_t scratch_tw[3], scratch_in[4];

    ne10_fft_cpx_float32_t scratch[4], scratch_out[4];

    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
    {
        // twiddles are laid out column-major with stride out_step
        scratch_tw [0] = twiddles[0 * out_step];
        scratch_tw [1] = twiddles[1 * out_step];
        scratch_tw [2] = twiddles[2 * out_step];

        // load 4 complex pairs (interleaved re/im)
        scratch_in[0].r = Fin_r[0 * in_step    ];
        scratch_in[0].i = Fin_r[0 * in_step + 1];
        scratch_in[1].r = Fin_r[1 * in_step    ];
        scratch_in[1].i = Fin_r[1 * in_step + 1];
        scratch_in[2].r = Fin_r[2 * in_step    ];
        scratch_in[2].i = Fin_r[2 * in_step + 1];
        scratch_in[3].r = Fin_r[3 * in_step    ];
        scratch_in[3].i = Fin_r[3 * in_step + 1];

        // radix 4 butterfly with twiddles: complex multiply of inputs 1..3
        scratch[0].r = scratch_in[0].r;
        scratch[0].i = scratch_in[0].i;
        scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
        scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;

        scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
        scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;

        scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
        scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;

        NE10_FFT_R2C_CC_CC (scratch_out, scratch);

        // forward half
        Fout_r[0] = scratch_out[0].r;
        Fout_r[1] = scratch_out[0].i;
        Fout_r[ (out_step << 1)    ] = scratch_out[1].r;
        Fout_r[ (out_step << 1) + 1] = scratch_out[1].i;
        // mirrored half, written backwards
        Fout_b[0] = scratch_out[2].r;
        Fout_b[1] = scratch_out[2].i;
        Fout_b[- (out_step << 1)    ] = scratch_out[3].r;
        Fout_b[- (out_step << 1) + 1] = scratch_out[3].i;

        // update pointers
        Fin_r += 2;
        Fout_r += 2;
        Fout_b -= 2;
        twiddles ++;
    }
}
382 
/*
 * Middle ("other") butterflies of one with-twiddles c2r (inverse) stage;
 * the inverse of ne10_radix4_r2c_with_twiddles_other_butterfly_c.
 *
 * Gathers the forward and mirrored (backward-walking Fin_b) halves of the
 * packed spectrum, first undoes the radix-4 combine (NE10_FFT_C2R_CC_CC),
 * then undoes the twiddle multiply using the conjugated twiddles (note the
 * flipped signs: r*tw.r + i*tw.i / i*tw.r - r*tw.i), and scatters 4
 * complex pairs to the strided output columns.
 */
NE10_INLINE void ne10_radix4_c2r_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
        const ne10_float32_t *Fin_r,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    const ne10_float32_t *Fin_b = Fin_r + (((out_step<<1)-1)<<1) - 2; // reversed
    ne10_fft_cpx_float32_t scratch_tw [3],
                           scratch    [8],
                           scratch_in [4],
                           scratch_out[4];

    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
    {
        // same column-major twiddle layout as the forward kernel
        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        // forward half
        scratch_in[0].r = Fin_r[0];
        scratch_in[0].i = Fin_r[1];

        // mirrored half (walking backwards)
        scratch_in[1].r = Fin_b[0];
        scratch_in[1].i = Fin_b[1];

        scratch_in[2].r = Fin_r[(out_step<<1) + 0];
        scratch_in[2].i = Fin_r[(out_step<<1) + 1];

        scratch_in[3].r = Fin_b[-(out_step<<1) + 0];
        scratch_in[3].i = Fin_b[-(out_step<<1) + 1];

        // inverse of the forward "result" step
        NE10_FFT_C2R_CC_CC (scratch, scratch_in);

        // inverse of "multiply twiddles": multiply by the conjugate
        scratch_out[0] = scratch[0];

        scratch_out[1].r = scratch[1].r * scratch_tw[0].r + scratch[1].i * scratch_tw[0].i;
        scratch_out[1].i = scratch[1].i * scratch_tw[0].r - scratch[1].r * scratch_tw[0].i;

        scratch_out[2].r = scratch[2].r * scratch_tw[1].r + scratch[2].i * scratch_tw[1].i;
        scratch_out[2].i = scratch[2].i * scratch_tw[1].r - scratch[2].r * scratch_tw[1].i;

        scratch_out[3].r = scratch[3].r * scratch_tw[2].r + scratch[3].i * scratch_tw[2].i;
        scratch_out[3].i = scratch[3].i * scratch_tw[2].r - scratch[3].r * scratch_tw[2].i;

        // scatter to the strided output columns (interleaved re/im)
        Fout_r[0 * in_step    ] = scratch_out[0].r;
        Fout_r[0 * in_step + 1] = scratch_out[0].i;
        Fout_r[1 * in_step    ] = scratch_out[1].r;
        Fout_r[1 * in_step + 1] = scratch_out[1].i;
        Fout_r[2 * in_step    ] = scratch_out[2].r;
        Fout_r[2 * in_step + 1] = scratch_out[2].i;
        Fout_r[3 * in_step    ] = scratch_out[3].r;
        Fout_r[3 * in_step + 1] = scratch_out[3].i;

        // update pointers
        Fin_r += 2;
        Fout_r += 2;
        Fin_b -= 2;
        twiddles ++;
    }
}
450 
451 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
452  const ne10_float32_t *Fin_r,
453  const ne10_int32_t out_step,
454  const ne10_int32_t in_step,
455  const ne10_fft_cpx_float32_t *twiddles)
456 {
457  ne10_float32_t scratch_in [4];
458  ne10_float32_t scratch_out[4];
459 
460  scratch_in[0] = Fin_r[0 * in_step];
461  scratch_in[1] = Fin_r[1 * in_step];
462  scratch_in[2] = Fin_r[2 * in_step];
463  scratch_in[3] = Fin_r[3 * in_step];
464 
465  // NE10_PRINT_Q_VECTOR(scratch_in);
466 
467  NE10_FFT_R2C_4R_CC(scratch_out,scratch_in);
468 
469  // NE10_PRINT_Q_VECTOR(scratch_out);
470 
471  Fout_r[ 0] = scratch_out[0];
472  Fout_r[ 1] = scratch_out[1];
473  Fout_r[ (out_step << 1) ] = scratch_out[2];
474  Fout_r[ (out_step << 1) + 1] = scratch_out[3];
475 }
476 
477 NE10_INLINE void ne10_radix4_c2r_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
478  const ne10_float32_t *Fin_r,
479  const ne10_int32_t out_step,
480  const ne10_int32_t in_step,
481  const ne10_fft_cpx_float32_t *twiddles)
482 {
483  // inverse operation of ne10_radix4_r2c_with_twiddles_last_butterfly_c
484  ne10_float32_t scratch_in [4];
485  ne10_float32_t scratch_out[4];
486 
487  // load
488  scratch_in[0] = Fin_r[ 0];
489  scratch_in[1] = Fin_r[ 1];
490  scratch_in[2] = Fin_r[ (out_step << 1) ];
491  scratch_in[3] = Fin_r[ (out_step << 1) + 1];
492 
493  // NE10_PRINT_Q_VECTOR(scratch_in);
494 
495  NE10_FFT_C2R_CC_4R(scratch_out,scratch_in);
496 
497  // NE10_PRINT_Q_VECTOR(scratch_out);
498 
499  // store
500  Fout_r[0 * in_step] = scratch_out[0];
501  Fout_r[1 * in_step] = scratch_out[1];
502  Fout_r[2 * in_step] = scratch_out[2];
503  Fout_r[3 * in_step] = scratch_out[3];
504 }
505 
506 NE10_INLINE void ne10_radix4_r2c_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
507  const ne10_fft_cpx_float32_t *Fin,
508  const ne10_int32_t fstride,
509  const ne10_int32_t mstride,
510  const ne10_int32_t nfft,
511  const ne10_fft_cpx_float32_t *twiddles)
512 {
513  ne10_int32_t f_count;
514  const ne10_int32_t in_step = nfft >> 2;
515  const ne10_int32_t out_step = mstride;
516 
517  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
518  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
519  const ne10_fft_cpx_float32_t *tw;
520 
521  Fout_r ++;
522  Fin_r ++;
523 
524  for (f_count = fstride; f_count; f_count --)
525  {
526  tw = twiddles;
527 
528  // first butterfly
529  ne10_radix4_r2c_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
530 
531  tw ++;
532  Fin_r ++;
533  Fout_r ++;
534 
535  // other butterfly
536  ne10_radix4_r2c_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
537 
538  // update Fin_r, Fout_r, twiddles
539  tw += ( (out_step >> 1) - 1);
540  Fin_r += 2 * ( (out_step >> 1) - 1);
541  Fout_r += 2 * ( (out_step >> 1) - 1);
542 
543  // last butterfly
544  ne10_radix4_r2c_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
545  tw ++;
546  Fin_r ++;
547  Fout_r ++;
548 
549  Fout_r += 3 * out_step;
550  } // f_count
551 }
552 
553 NE10_INLINE void ne10_radix4_c2r_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
554  const ne10_fft_cpx_float32_t *Fin,
555  const ne10_int32_t fstride,
556  const ne10_int32_t mstride,
557  const ne10_int32_t nfft,
558  const ne10_fft_cpx_float32_t *twiddles)
559 {
560  ne10_int32_t f_count;
561  const ne10_int32_t in_step = nfft >> 2;
562  const ne10_int32_t out_step = mstride;
563 
564  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
565  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
566  const ne10_fft_cpx_float32_t *tw;
567 
568  for (f_count = fstride; f_count; f_count --)
569  {
570  tw = twiddles;
571 
572  // first butterfly
573  ne10_radix4_c2r_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
574 
575  tw ++;
576  Fin_r ++;
577  Fout_r ++;
578 
579  // other butterfly
580  ne10_radix4_c2r_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
581 
582  // update Fin_r, Fout_r, twiddles
583  tw += ( (out_step >> 1) - 1);
584  Fin_r += 2 * ( (out_step >> 1) - 1);
585  Fout_r += 2 * ( (out_step >> 1) - 1);
586 
587  // last butterfly
588  ne10_radix4_c2r_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
589  tw ++;
590  Fin_r ++;
591  Fout_r ++;
592 
593  Fin_r += 3 * out_step;
594  } // f_count
595 }
596 
/*
 * Mixed-radix r2c butterfly driver (C version).
 *
 * Factor table layout as consumed here:
 *   factors[0]                      -> stage count
 *   factors[1]                      -> fstride of the first stage
 *   factors[(stage_count << 1) - 1] -> mstride of the first stage
 *   factors[stage_count << 1]       -> radix of the first stage (2 or 4)
 *
 * Runs a twiddle-free first stage (radix-8 when nfft = 2^n with n odd --
 * a radix-4 and a radix-2 stage merged -- or radix-4 when n is even),
 * then repeated radix-4 stages with twiddles, ping-ponging between
 * `buffer` and `Fout` so the final stage writes into Fout.
 */
NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_c (
    ne10_fft_cpx_float32_t * Fout,
    const ne10_fft_cpx_float32_t * Fin,
    const ne10_int32_t * factors,
    const ne10_fft_cpx_float32_t * twiddles,
    ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    radix = factors[ stage_count << 1 ];
    nfft = radix * fstride;

    if (radix == 2)
    {
        // combine one radix-4 and one radix-2 into one radix-8
        mstride <<= 2;
        fstride >>= 2;
        twiddles += 6; // (4-1) x 2: skip the twiddles of the absorbed radix-4 stage
        stage_count --;
    }

    // pre-swap so the per-stage alternation below ends with the result in Fout
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // the first stage (twiddle-free)
    if (radix == 2) // length of FFT is 2^n (n is odd)
    {
        ne10_radix8_r2c_c (Fout, Fin, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        ne10_radix4_r2c_c (Fout, Fin, fstride, mstride, nfft);
    }
    // end of first stage

    // remaining radix-4 stages with twiddles
    for (; fstride > 1;)
    {
        fstride >>= 2;
        ne10_swap_ptr (buffer, Fout);

        ne10_radix4_r2c_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
        twiddles += 3 * mstride; // each stage consumes 3 twiddles per output column
        mstride <<= 2;
    } // other stage
}
659 
/*
 * Mixed-radix c2r (inverse) butterfly driver (C version); undoes the
 * stages of ne10_mixed_radix_r2c_butterfly_float32_c in reverse order.
 *
 * `twiddles` is rewound by 3*mstride before each with-twiddles stage, so
 * the caller is expected to pass a pointer just past the twiddle table
 * (NOTE(review): presumably the "backward" pointer set up at plan
 * creation -- confirm against the c2r wrapper/alloc code).
 */
NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_c (
    ne10_fft_cpx_float32_t * Fout,
    const ne10_fft_cpx_float32_t * Fin,
    const ne10_int32_t * factors,
    const ne10_fft_cpx_float32_t * twiddles,
    ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // init fstride, mstride, radix, nfft (same factor layout as the r2c driver)
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    radix = factors[ stage_count << 1 ];
    nfft = radix * fstride;

    // fstride, mstride for the last (now first-executed) stage
    fstride = 1;
    mstride = nfft >> 2;

    if (radix == 2)
    {
        // combine one radix-4 and one radix-2 into one radix-8
        stage_count --;
    }

    // pre-swap so the per-stage alternation ends with the result in Fout
    if (stage_count % 2 == 1)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // last forward butterfly -- inversed (runs first here)
    if (stage_count > 1)
    {
        twiddles -= 3 * mstride;
        ne10_radix4_c2r_with_twiddles_c (buffer, Fin, fstride, mstride, nfft, twiddles);
        fstride <<= 2;
        mstride >>= 2;
        stage_count --;
    }

    // middle stages (all but the first forward stage)
    for (; stage_count > 1;)
    {
        twiddles -= 3 * mstride;
        ne10_radix4_c2r_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
        fstride <<= 2;
        mstride >>= 2;
        stage_count --;
        ne10_swap_ptr (buffer, Fout);
    } // other stage

    // first forward stage -- inversed (twiddle-free)
    if (radix == 2) // length of FFT is 2^n (n is odd)
    {
        mstride >>= 1; // radix-8 halves the per-butterfly stride

        ne10_radix8_c2r_c (Fout, buffer, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        ne10_radix4_c2r_c (Fout, buffer, fstride, mstride, nfft);
    }
}
737 
745 {
746  ne10_fft_r2c_cfg_float32_t st = NULL;
747  ne10_int32_t ncfft = nfft >> 1;
748  ne10_int32_t result;
749 
750  ne10_uint32_t memneeded = sizeof (ne10_fft_r2c_state_float32_t)
751  + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/
752  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors */
753  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors_neon */
754  + sizeof (ne10_fft_cpx_float32_t) * nfft /* r_twiddles */
755  + sizeof (ne10_fft_cpx_float32_t) * nfft/4 /* r_twiddles_neon */
756  + sizeof (ne10_fft_cpx_float32_t) * (12 + nfft/32*12) /* r_super_twiddles_neon */
757  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
758 
759  st = (ne10_fft_r2c_cfg_float32_t) NE10_MALLOC (memneeded);
760 
761  if (!st)
762  {
763  return st;
764  }
765 
766  ne10_int32_t i,j;
768  const ne10_float32_t pi = NE10_PI;
769  ne10_float32_t phase1;
770 
771  st->nfft = nfft;
772 
773  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_float32_t);
774  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
775 
776  st->buffer = (ne10_fft_cpx_float32_t*) address;
777  st->r_twiddles = st->buffer + nfft;
778  st->r_factors = (ne10_int32_t*) (st->r_twiddles + nfft);
779  st->r_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors + (NE10_MAXFACTORS * 2));
780  st->r_factors_neon = (ne10_int32_t*) (st->r_twiddles_neon + nfft/4);
781  st->r_super_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors_neon + (NE10_MAXFACTORS * 2));
782 
783  if (nfft<16)
784  {
785  return st;
786  }
787 
788  // factors and twiddles for rfft C
789  ne10_factor (nfft, st->r_factors, NE10_FACTOR_DEFAULT);
790 
791  // backward twiddles pointers
792  st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft);
793 
794  // factors and twiddles for rfft neon
795  result = ne10_factor (nfft/4, st->r_factors_neon, NE10_FACTOR_DEFAULT);
796  if (result == NE10_ERR)
797  {
798  return st;
799  }
800 
801  // Twiddle table is transposed here to improve cache access performance.
802  st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
803  st->r_twiddles_neon,
804  st->r_factors_neon,
805  nfft/4);
806 
807  // nfft/4 x 4
808  tw = st->r_super_twiddles_neon;
809  for (i = 1; i < 4; i ++)
810  {
811  for (j = 0; j < 4; j++)
812  {
813  phase1 = - 2 * pi * ( (ne10_float32_t) (i * j) / nfft);
814  tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
815  tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
816  }
817  }
818 
819  ne10_int32_t k,s;
820  // [nfft/32] x [3] x [4]
821  // k s j
822  for (k=1; k<nfft/32; k++)
823  {
824  // transposed
825  for (s = 1; s < 4; s++)
826  {
827  for (j = 0; j < 4; j++)
828  {
829  phase1 = - 2 * pi * ( (ne10_float32_t) ((k*4+j) * s) / nfft);
830  tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
831  tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
832  }
833  }
834  }
835  return st;
836 }
837 
849  ne10_float32_t *fin,
851 {
852  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
853 
854  switch(cfg->nfft)
855  {
856  case 8:
857  ne10_radix8_r2c_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) fin,1,1,8);
858  break;
859  default:
860  ne10_mixed_radix_r2c_butterfly_float32_c (
861  fout,
862  (ne10_fft_cpx_float32_t*) fin,
863  cfg->r_factors,
864  cfg->r_twiddles,
865  tmpbuf);
866  break;
867  }
868 
869  fout[0].r = fout[0].i;
870  fout[0].i = 0.0f;
871  fout[(cfg->nfft) >> 1].i = 0.0f;
872 }
873 
884 void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
887 {
888  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
889 
890  fin[0].i = fin[0].r;
891  fin[0].r = 0.0f;
892  switch(cfg->nfft)
893  {
894  case 8:
895  ne10_radix8_c2r_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) &fin[0].i,1,1,8);
896  break;
897  default:
898  ne10_mixed_radix_c2r_butterfly_float32_c (
899  (ne10_fft_cpx_float32_t*)fout,
900  (ne10_fft_cpx_float32_t*)&fin[0].i, // first real is moved to first image
901  cfg->r_factors,
902  cfg->r_twiddles_backward,
903  tmpbuf);
904  break;
905  }
906  fin[0].r = fin[0].i;
907  fin[0].i = 0.0f;
908 }
909 
914 #endif // NE10_UNROLL_LEVEL
void ne10_fft_c2r_1d_float32_c(ne10_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Mixed radix-2/4 inverse FFT (complex to real) of single-precision (32-bit) float data.
ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32(ne10_int32_t nfft)
User-callable function that allocates all storage needed for the real FFT (r2c/c2r).
void ne10_fft_r2c_1d_float32_c(ne10_fft_cpx_float32_t *fout, ne10_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Mixed radix-2/4 forward FFT (real to complex) of single-precision (32-bit) float data.