#include "NE10_types.h"
#include "NE10_macros.h"
/* Helper macros for the 4-point FFT butterflies on ne10_fft_cpx_int16_t data. */
#define FFT4_FS_START \
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
    ne10_int16_t tmp_r, tmp_i;
#define FFT4_FS \
    s2_r = Fin[0].r - Fin[2].r; \
    s2_i = Fin[0].i - Fin[2].i; \
    tmp_r = Fin[0].r + Fin[2].r; \
    tmp_i = Fin[0].i + Fin[2].i; \
    s0_r = Fin[1].r + Fin[3].r; \
    s0_i = Fin[1].i + Fin[3].i; \
    s1_r = Fin[1].r - Fin[3].r; \
    s1_i = Fin[1].i - Fin[3].i;
#define FFT4_FS_SCALED \
    s2_r = (Fin[0].r - Fin[2].r) >> 2; \
    s2_i = (Fin[0].i - Fin[2].i) >> 2; \
    tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
    tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
    s0_r = (Fin[1].r + Fin[3].r) >> 2; \
    s0_i = (Fin[1].i + Fin[3].i) >> 2; \
    s1_r = (Fin[1].r - Fin[3].r) >> 2; \
    s1_i = (Fin[1].i - Fin[3].i) >> 2;
#define FFT4_FWD \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r + s1_i; \
    Fout[1].i = s2_i - s1_r; \
    Fout[3].r = s2_r - s1_i; \
    Fout[3].i = s2_i + s1_r;
#define FFT4_INV \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r - s1_i; \
    Fout[1].i = s2_i + s1_r; \
    Fout[3].r = s2_r + s1_i; \
    Fout[3].i = s2_i - s1_r;
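
/*
 * For reference, the 4-point macros above compute the length-4 DFT in two
 * steps.  With x0..x3 = Fin[0..3], tmp = x0 + x2, s2 = x0 - x2, s0 = x1 + x3
 * and s1 = x1 - x3, the forward outputs are (a sketch of the equivalent
 * complex arithmetic):
 *
 *     Fout[0] = tmp + s0
 *     Fout[1] = s2 - j*s1      i.e. { s2_r + s1_i, s2_i - s1_r }
 *     Fout[2] = tmp - s0
 *     Fout[3] = s2 + j*s1
 *
 * The inverse variant only flips the sign of the j*s1 term, and the _SCALED
 * variant pre-shifts every term right by 2 so the whole 4-point transform is
 * divided by its length.
 */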
/* Helper macros for the 8-point FFT butterflies. */
#define FFT8_FS_START \
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
    const ne10_int16_t TW_81 = 23169; /* cos(pi/4) = sin(pi/4) in Q15 */
#define FFT8_FS \
    s0_r = Fin[0].r + Fin[4].r; \
    s0_i = Fin[0].i + Fin[4].i; \
    s1_r = Fin[0].r - Fin[4].r; \
    s1_i = Fin[0].i - Fin[4].i; \
    s2_r = Fin[1].r + Fin[5].r; \
    s2_i = Fin[1].i + Fin[5].i; \
    s3_r = Fin[1].r - Fin[5].r; \
    s3_i = Fin[1].i - Fin[5].i; \
    s4_r = Fin[2].r + Fin[6].r; \
    s4_i = Fin[2].i + Fin[6].i; \
    s5_r = Fin[2].r - Fin[6].r; \
    s5_i = Fin[2].i - Fin[6].i; \
    s6_r = Fin[3].r + Fin[7].r; \
    s6_i = Fin[3].i + Fin[7].i; \
    s7_r = Fin[3].r - Fin[7].r; \
    s7_i = Fin[3].i - Fin[7].i;
#define FFT8_FS_SCALED \
    s0_r = (Fin[0].r + Fin[4].r) >> 3; \
    s0_i = (Fin[0].i + Fin[4].i) >> 3; \
    s1_r = (Fin[0].r - Fin[4].r) >> 3; \
    s1_i = (Fin[0].i - Fin[4].i) >> 3; \
    s2_r = (Fin[1].r + Fin[5].r) >> 3; \
    s2_i = (Fin[1].i + Fin[5].i) >> 3; \
    s3_r = (Fin[1].r - Fin[5].r) >> 3; \
    s3_i = (Fin[1].i - Fin[5].i) >> 3; \
    s4_r = (Fin[2].r + Fin[6].r) >> 3; \
    s4_i = (Fin[2].i + Fin[6].i) >> 3; \
    s5_r = (Fin[2].r - Fin[6].r) >> 3; \
    s5_i = (Fin[2].i - Fin[6].i) >> 3; \
    s6_r = (Fin[3].r + Fin[7].r) >> 3; \
    s6_i = (Fin[3].i + Fin[7].i) >> 3; \
    s7_r = (Fin[3].r - Fin[7].r) >> 3; \
    s7_i = (Fin[3].i - Fin[7].i) >> 3;
#define FFT8_FWD \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r + t3_i; \
    Fout[2].i = t0_i - t3_r; \
    Fout[6].r = t0_r - t3_i; \
    Fout[6].i = t0_i + t3_r; \
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t0_r = s1_r - s5_i; \
    t0_i = s1_i + s5_r; \
    t1_r = s1_r + s5_i; \
    t1_i = s1_i - s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r + t3_i; \
    Fout[3].i = t0_i - t3_r; \
    Fout[7].r = t0_r - t3_i; \
    Fout[7].i = t0_i + t3_r;
#define FFT8_INV \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r - t3_i; \
    Fout[2].i = t0_i + t3_r; \
    Fout[6].r = t0_r + t3_i; \
    Fout[6].i = t0_i - t3_r; \
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t0_r = s1_r + s5_i; \
    t0_i = s1_i - s5_r; \
    t1_r = s1_r - s5_i; \
    t1_i = s1_i + s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r - t3_i; \
    Fout[3].i = t0_i + t3_r; \
    Fout[7].r = t0_r + t3_i; \
    Fout[7].i = t0_i - t3_r;
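
/*
 * The 8-point macros above follow the same scheme: s0..s7 are the sums and
 * differences of the input pairs (n, n+4), the pairwise sums feed a plain
 * 4-point butterfly that produces the even output bins, and the differences
 * are rotated by +/-45 degree twiddles before forming the odd bins.  The only
 * fixed-point multiplies are the four TW_81 products; for the forward path,
 * for example,
 *
 *     t4 = (s3_r + j*s3_i) * (1 - j) / sqrt(2)
 *        = { q15(s3_r + s3_i, TW_81), q15(s3_i - s3_r, TW_81) }
 *
 * where q15(x, w) = ((NE10_F2I16_SAMPPROD) x * w) >> NE10_F2I16_SHIFT, as in
 * the code above (q15 is just shorthand here, not a library function).
 */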
/*
 * First-stage radix-8 butterfly, applied to 4 FFT columns at a time with
 * NEON (each int16x4 lane holds one column).
 */
#define RADIX8x4_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    const ne10_int16_t TW_81 = 23169; \
    const ne10_int16_t TW_81N = -23169; \
    int16_t *p_src, *p_dst; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
    int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
    int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
    int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
    int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
    int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
    int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
    int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
    int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
    int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
    int16x4_t d_tw_81, d_tw_81n; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout;
#define RADIX8x4_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in4 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in6 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in5 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in7 = vld2_s16 (p_src); \
    p_src += src_step;
/* Transpose the register tile back to interleaved order and store, then step the source pointer to the next 4 columns. */
#define RADIX8x4_STORE \
    q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
    q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
    q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
    q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
    q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
    q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
    q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
    q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
    d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
    d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
    d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
    d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
    d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
    d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
    d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
    d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
    d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
    d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
    d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
    d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
    d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
    d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
    d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
    d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out4); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out5); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out6); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out7); \
    p_dst += 8; \
    p_src = p_src - src_step * 8 + 8;
/* First butterfly stage: sums and differences of the paired input vectors. */
#define RADIX8x4_FS_S0 \
    d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
    d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);
/* Twiddle the odd-half terms for the forward transform: s5 * (-j), s3 * (1 - j)/sqrt(2), s7 * -(1 + j)/sqrt(2). */
#define RADIX8x4_FWD_S357 \
    d_tw_81 = vdup_n_s16 (TW_81); \
    d_tw_81n = vdup_n_s16 (TW_81N); \
    d_s5_r = d_sin5_i; \
    d_s5_i = vneg_s16 (d_sin5_r); \
    d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
    d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
    d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
    d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
    d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
    d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
    d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
    d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
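
/*
 * Fixed-point note: vqdmulh_s16(a, b) returns (2 * a * b) >> 16, i.e. the Q15
 * product (a * b) >> 15 with saturation, so multiplying by the Q15 twiddles
 * TW_81 / TW_81N rotates the samples without any extra shift.  A scalar
 * sketch of the same operation (helper name is illustrative only):
 *
 *     static inline ne10_int16_t q15_mul (ne10_int16_t a, ne10_int16_t b)
 *     {
 *         return (ne10_int16_t) (((ne10_int32_t) a * b) >> 15);
 *     }
 *
 * which corresponds to the (NE10_F2I16_SAMPPROD)(...) * TW_81 >> NE10_F2I16_SHIFT
 * pattern used in the scalar FFT8 macros above.
 */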
/* Inverse-transform twiddles: s5 * (+j), s3 * (1 + j)/sqrt(2), s7 * -(1 - j)/sqrt(2). */
#define RADIX8x4_INV_S357 \
    d_tw_81 = vdup_n_s16 (TW_81); \
    d_tw_81n = vdup_n_s16 (TW_81N); \
    d_s5_r = vneg_s16 (d_sin5_i); \
    d_s5_i = d_sin5_r; \
    d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
    d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
    d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
    d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
    d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
    d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
    d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
    d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
/* Last stage: combine into outputs 0, 1, 4, 5; outputs 2, 3, 6, 7 depend on the transform direction and are formed at the call sites below. */
#define RADIX8x4_LS_02 \
    d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
    d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
    d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
    d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
    d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
    d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
    d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
    d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
    d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
    d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
    d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
    d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
    d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
    d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
    d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
    d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
    d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
    d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
    d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
    d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
    d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
    d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
    d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
    d_out1_i = vadd_s16 (d_s9_i, d_s13_i);
/* Scaled variants: vhadd/vhsub halve each result, giving an overall 1/8 scaling for the radix-8 stage. */
#define RADIX8x4_FS_S0_SCALED \
    d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
    d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);
#define RADIX8x4_LS_02_SCALED \
    d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
    d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
    d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
    d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
    d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
    d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
    d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
    d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
    d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
    d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
    d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
    d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
    d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
    d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
    d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
    d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
    d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
    d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
    d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
    d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
    d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
    d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
    d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
    d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);
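
/*
 * The _SCALED variants replace vadd/vsub with the halving forms vhadd/vhsub,
 * so every butterfly level divides its result by 2.  Over the three add/sub
 * levels of this radix-8 stage that is a factor of 1/8, and over a complete
 * N-point transform the scaled FFT returns X[k] / N, which is what keeps the
 * int16 samples from overflowing; the unscaled variants leave the magnitude
 * growth to the caller.
 */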
/* ne10_radix8x4_forward_unscaled_neon: forward-specific output terms. */
    for (f_count = 0; f_count < stride; f_count += 4)
        d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
        d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
        d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
        d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
        d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
        d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
        d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
        d_out7_i = vadd_s16 (d_s11_i, d_s15_r);

/* ne10_radix8x4_backward_unscaled_neon: inverse-specific output terms. */
    for (f_count = 0; f_count < stride; f_count += 4)
        d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
        d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
        d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
        d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
        d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
        d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
        d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
        d_out7_i = vsub_s16 (d_s11_i, d_s15_r);

/* ne10_radix8x4_forward_scaled_neon: halving butterflies plus forward-specific outputs. */
    for (f_count = 0; f_count < stride; f_count += 4)
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_LS_02_SCALED
        d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
        d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
        d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
        d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
        d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
        d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
        d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
        d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);

/* ne10_radix8x4_backward_scaled_neon: halving butterflies plus inverse-specific outputs. */
    for (f_count = 0; f_count < stride; f_count += 4)
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_LS_02_SCALED
        d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
        d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
        d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
        d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
        d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
        d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
        d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
        d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);
/* Radix-4 butterfly without twiddle factors (first stage), processing 4 columns per iteration. */
#define RADIX4x4_WITHOUT_TW_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    int16_t *p_src, *p_dst; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
    int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
    int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
    int16x8x2_t q2_tmp0, q2_tmp1; \
    int32x4x2_t q2_tmp2, q2_tmp3; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout;
#define RADIX4x4_WITHOUT_TW_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step;
#define RADIX4x4_WITHOUT_TW_STORE \
    q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
    q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
    q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
    q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
    d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
    d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
    d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
    d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
    d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
    d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
    d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
    d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += 8; \
    p_src = p_src - src_step * 4 + 8;
#define RADIX4x4_WITHOUT_TW_S0 \
    d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
    d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
    d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
    d_out0_i = vadd_s16 (d_s0_i, d_s2_i);
#define RADIX4x4_WITHOUT_TW_S0_SCALED \
    d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
    d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
    d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
    d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);
static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0

        d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
        d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
        d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
        d_out3_i = vadd_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0

        d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
        d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
        d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
        d_out3_i = vsub_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED

        d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
        d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
        d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
        d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED

        d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
        d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
        d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
        d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}
/* Radix-4 butterfly with twiddle factors (all later stages), processing 4 complex samples per iteration. */
#define RADIX4x4_WITH_TW_START \
    ne10_int32_t m_count; \
    ne10_int32_t src_step = src_stride << 1; \
    ne10_int32_t dst_step = dst_stride << 1; \
    ne10_int32_t tw_step = mstride << 1; \
    int16_t *p_src, *p_dst, *p_tw; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
    int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
    int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
    int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
    int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout; \
    p_tw = (int16_t *) tw;
#define RADIX4x4_WITH_TW_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_tw0 = vld2_s16 (p_tw); \
    p_tw += tw_step; \
    d2_tw1 = vld2_s16 (p_tw); \
    p_tw += tw_step; \
    d2_tw2 = vld2_s16 (p_tw); \
    d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
    d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
    d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
    d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
    d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
    d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
    d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
    d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
    d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
    d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
    d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
    d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);
#define RADIX4x4_WITH_TW_STORE \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += dst_step; \
    p_src = p_src - src_step * 4 + 8; \
    p_dst = p_dst - dst_step * 4 + 8; \
    p_tw = p_tw - tw_step * 2 + 8;
/* Complete the complex twiddle multiply for the forward path: (r*wr - i*wi, i*wr + r*wi). */
#define RADIX4x4_WITH_TW_S1_FWD \
    d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
    d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
    d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
    d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
    d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
    d_s3_i = vadd_s16 (d_s3_i, d_tmp5);
/* Inverse path: multiply by the conjugated twiddles (signs swapped). */
#define RADIX4x4_WITH_TW_S1_INV \
    d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
    d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
    d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
    d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
    d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
    d_s3_i = vsub_s16 (d_s3_i, d_tmp5);
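
/*
 * The LOAD/S1 split above is the usual 4-multiply complex product done in
 * Q15: LOAD forms the four partial products with vqdmulh_s16 and S1_FWD /
 * S1_INV combine them with the sign appropriate to the transform direction.
 * A scalar sketch of the forward case for one sample (the helper name is
 * illustrative only, not a library function):
 *
 *     static ne10_fft_cpx_int16_t cmul_q15 (ne10_fft_cpx_int16_t a, ne10_fft_cpx_int16_t w)
 *     {
 *         ne10_fft_cpx_int16_t p;
 *         p.r = (ne10_int16_t) ((((ne10_int32_t) a.r * w.r) >> 15) - (((ne10_int32_t) a.i * w.i) >> 15));
 *         p.i = (ne10_int16_t) ((((ne10_int32_t) a.i * w.r) >> 15) + (((ne10_int32_t) a.r * w.i) >> 15));
 *         return p;
 *     }
 */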
#define RADIX4x4_WITH_TW_LS_02 \
    d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
    d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
    d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
    d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
    d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
    d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
    d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
    d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
    d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
    d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
    d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
    d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);
#define RADIX4x4_WITH_TW_LS_02_SCALED \
    d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
    d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
    d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
    d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
    d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
    d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
    d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
    d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
    d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
    d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
    d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
    d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);
static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02

        d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02

        d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02_SCALED

        d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (
        ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02_SCALED

        d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}
/*
 * Generator macro for the mixed-radix complex forward FFT in the unscaled and
 * scaled flavours.  The first stage runs one of the radix-8x4 / radix-4x4
 * kernels above; the remaining stages run the radix-4 kernel with twiddles.
 */
#define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
        ne10_fft_cpx_int16_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int16_t * twiddles, \
        ne10_fft_cpx_int16_t * buffer) \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int16_t *Fin1, *Fout1; \
    ne10_fft_cpx_int16_t *Fout_ls = Fout; \
    ne10_fft_cpx_int16_t *Ftmp; \
    ne10_fft_cpx_int16_t *tw, *tw1; \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    fstride1 = fstride >> 2; \
    ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1); \
    ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
    for (; stage_count > 1 ; stage_count--) \
        for (f_count = 0; f_count < fstride; f_count ++) \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        tw += mstride * 3; \
    for (f_count = 0; f_count < fstride; f_count ++) \
        ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
/* Inverse counterpart of the generator macro above. */
#define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
        ne10_fft_cpx_int16_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int16_t * twiddles, \
        ne10_fft_cpx_int16_t * buffer) \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int16_t *Fin1, *Fout1; \
    ne10_fft_cpx_int16_t *Fout_ls = Fout; \
    ne10_fft_cpx_int16_t *Ftmp; \
    ne10_fft_cpx_int16_t *tw, *tw1; \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    fstride1 = fstride >> 2; \
    ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1); \
    ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
    for (; stage_count > 1 ; stage_count--) \
        for (f_count = 0; f_count < fstride; f_count ++) \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        tw += mstride * 3; \
    for (f_count = 0; f_count < fstride; f_count ++) \
        ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
ne10_mixed_radix_fft_forward_int16_neon (unscaled)
ne10_mixed_radix_fft_forward_int16_neon (scaled)
ne10_mixed_radix_fft_backward_int16_neon (unscaled)
ne10_mixed_radix_fft_backward_int16_neon (scaled)
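
/*
 * The four instantiations above expand to the workers
 * ne10_mixed_radix_fft_forward_int16_unscaled_neon,
 * ne10_mixed_radix_fft_forward_int16_scaled_neon,
 * ne10_mixed_radix_fft_backward_int16_unscaled_neon and
 * ne10_mixed_radix_fft_backward_int16_scaled_neon, which
 * ne10_fft_c2c_1d_int16_neon dispatches to below.
 */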
/*
 * ne10_fft_split_r2c_1d_int16_neon: recombine the ncfft-point complex FFT of
 * the packed real input into the final bins 0..ncfft of the real-FFT result.
 */
        ne10_int32_t scaled_flag)
    ne10_int32_t count = ncfft / 2;
    int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fpnk_r, q_fpnk_i;
    int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int16x8_t q_tw_r, q_tw_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
    NE10_F2I16_FIXDIV (tdc, 2);
    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
    /* NEON loop, halving (scaled) arithmetic: 8 bins per iteration */
    for (k = 1; k <= count ; k += 8)
        p_src = (int16_t*) (& (src[k]));
        p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
        p_twiddles = (int16_t*) (& (twiddles[k - 1]));
        p_dst = (int16_t*) (& (dst[k]));
        p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
        q2_fpk = vld2q_s16 (p_src);
        q2_fpnk = vld2q_s16 (p_src2);
        q2_tw = vld2q_s16 (p_twiddles);
        q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
        q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
        q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
        q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
        q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
        q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
        q_fpnk_i = vnegq_s16 (q_fpnk_i);
        q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
        q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
        q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
        q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
        q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
        q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
        q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
        q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
        q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
        q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
        q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
        q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
        q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
        q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
        q_dst2_r = vrev32q_s16 (q_dst2_r);
        q_dst2_i = vrev32q_s16 (q_dst2_i);
        q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
        q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
        q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
        q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
        vst2q_s16 (p_dst, q2_dst);
        vst2q_s16 (p_dst2, q2_dst2);
    /* NEON loop, full-range (unscaled) arithmetic */
    for (k = 1; k <= count ; k += 8)
        p_src = (int16_t*) (& (src[k]));
        p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
        p_twiddles = (int16_t*) (& (twiddles[k - 1]));
        p_dst = (int16_t*) (& (dst[k]));
        p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
        q2_fpk = vld2q_s16 (p_src);
        q2_fpnk = vld2q_s16 (p_src2);
        q2_tw = vld2q_s16 (p_twiddles);
        q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
        q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
        q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
        q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
        q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
        q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
        q_fpnk_i = vnegq_s16 (q_fpnk_i);
        q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
        q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
        q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
        q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
        q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
        q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
        q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
        q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
        q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
        q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
        q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
        q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
        q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
        q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
        q_dst2_r = vrev32q_s16 (q_dst2_r);
        q_dst2_i = vrev32q_s16 (q_dst2_i);
        q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
        q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
        q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
        q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
        vst2q_s16 (p_dst, q2_dst);
        vst2q_s16 (p_dst2, q2_dst2);
    /* scalar loop: same recombination, one bin at a time */
    for (k = 1; k <= ncfft / 2 ; ++k)
        fpnk.r = src[ncfft - k].r;
        fpnk.i = - src[ncfft - k].i;
        NE10_F2I16_FIXDIV (fpk, 2);
        NE10_F2I16_FIXDIV (fpnk, 2);
        f1k.r = fpk.r + fpnk.r;
        f1k.i = fpk.i + fpnk.i;
        f2k.r = fpk.r - fpnk.r;
        f2k.i = fpk.i - fpnk.i;
        tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
                                  - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
        tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
                                  + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);
        dst[k].r = (f1k.r + tw.r) >> 1;
        dst[k].i = (f1k.i + tw.i) >> 1;
        dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
        dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
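
/*
 * Both the NEON loops and the scalar loop above implement the standard
 * real-FFT recombination: with F[] the ncfft-point complex FFT of the
 * even/odd packed real signal and W = twiddles[k - 1], each pair of output
 * bins is (a sketch of the underlying arithmetic):
 *
 *     F1 = F[k] + conj(F[ncfft - k])        (even part)
 *     F2 = F[k] - conj(F[ncfft - k])        (odd part)
 *     dst[k]         = (F1 + W * F2) / 2
 *     dst[ncfft - k] = conj(F1 - W * F2) / 2
 *
 * The scaled NEON path folds the divisions into the halving adds/subtracts;
 * the scalar loop applies the same factor with the final right shifts.
 */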
/*
 * ne10_fft_split_c2r_1d_int16_neon: merge the ncfft + 1 half-spectrum bins
 * back into an ncfft-point complex sequence prior to the inverse complex FFT.
 */
        ne10_int32_t scaled_flag)
    ne10_int32_t count = ncfft / 2;
    int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fnkc_r, q_fnkc_i;
    int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;
    NE10_F2I16_FIXDIV (dst[0], 2);
    /* NEON loop, halving (scaled) arithmetic: 8 bins per iteration */
    for (k = 1; k <= count ; k += 8)
        p_src = (int16_t*) (& (src[k]));
        p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
        p_twiddles = (int16_t*) (& (twiddles[k - 1]));
        p_dst = (int16_t*) (& (dst[k]));
        p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
        q2_fk = vld2q_s16 (p_src);
        q2_fnkc = vld2q_s16 (p_src2);
        q2_tw = vld2q_s16 (p_twiddles);
        q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
        q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
        q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
        q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
        q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
        q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
        q_fnkc_i = vnegq_s16 (q_fnkc_i);
        q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
        q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
        q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
        q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
        q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
        q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
        q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
        q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
        q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
        q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
        q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
        q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
        q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
        q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
        q_dst2_r = vrev32q_s16 (q_dst2_r);
        q_dst2_i = vrev32q_s16 (q_dst2_i);
        q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
        q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
        q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
        q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
        vst2q_s16 (p_dst, q2_dst);
        vst2q_s16 (p_dst2, q2_dst2);
    /* NEON loop, full-range (unscaled) arithmetic */
    for (k = 1; k <= count ; k += 8)
        p_src = (int16_t*) (& (src[k]));
        p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
        p_twiddles = (int16_t*) (& (twiddles[k - 1]));
        p_dst = (int16_t*) (& (dst[k]));
        p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
        q2_fk = vld2q_s16 (p_src);
        q2_fnkc = vld2q_s16 (p_src2);
        q2_tw = vld2q_s16 (p_twiddles);
        q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
        q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
        q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
        q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
        q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
        q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
        q_fnkc_i = vnegq_s16 (q_fnkc_i);
        q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
        q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
        q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
        q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
        q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
        q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
        q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
        q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
        q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
        q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
        q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
        q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
        q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
        q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
        q_dst2_r = vrev32q_s16 (q_dst2_r);
        q_dst2_i = vrev32q_s16 (q_dst2_i);
        q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
        q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
        q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
        q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
        vst2q_s16 (p_dst, q2_dst);
        vst2q_s16 (p_dst2, q2_dst2);
    /* scalar loop: same merge, one bin at a time */
    for (k = 1; k <= ncfft / 2; k++)
        fnkc.r = src[ncfft - k].r;
        fnkc.i = -src[ncfft - k].i;
        NE10_F2I16_FIXDIV (fk, 2);
        NE10_F2I16_FIXDIV (fnkc, 2);
        fek.r = fk.r + fnkc.r;
        fek.i = fk.i + fnkc.i;
        tmp.r = fk.r - fnkc.r;
        tmp.i = fk.i - fnkc.i;
        fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
                                   + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
        fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
                                   - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
        dst[k].r = fek.r + fok.r;
        dst[k].i = fek.i + fok.i;
        dst[ncfft - k].r = fek.r - fok.r;
        dst[ncfft - k].i = fok.i - fek.i;
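
/*
 * The complex-to-real split is the inverse of the recombination above
 * (a sketch of the underlying arithmetic, W = twiddles[k - 1]):
 *
 *     FEK = F[k] + conj(F[ncfft - k])
 *     FOK = (F[k] - conj(F[ncfft - k])) * conj(W)
 *     dst[k]         = FEK + FOK
 *     dst[ncfft - k] = conj(FEK - FOK)
 *
 * with the scaled variants again folding a factor of 1/2 into the halving
 * instructions (or the FIXDIV of the inputs in the scalar path).
 */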
/*
 * ne10_fft_c2c_1d_int16_neon: lengths 4 and 8 are handled by the dedicated
 * ne10_fft4_* / ne10_fft8_* kernels; larger sizes go through the mixed-radix
 * NEON workers.  inverse_fft and scaled_flag select the matching variant.
 */
        ne10_int32_t inverse_fft,
        ne10_int32_t scaled_flag)
            ne10_fft4_backward_int16_scaled (fout, fin);
            ne10_fft8_backward_int16_scaled (fout, fin);
            ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
            ne10_fft4_forward_int16_scaled (fout, fin);
            ne10_fft8_forward_int16_scaled (fout, fin);
            ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
            ne10_fft4_backward_int16_unscaled (fout, fin);
            ne10_fft8_backward_int16_unscaled (fout, fin);
            ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
            ne10_fft4_forward_int16_unscaled (fout, fin);
            ne10_fft8_forward_int16_unscaled (fout, fin);
            ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
/*
 * ne10_fft_r2c_1d_int16_neon: forward real-to-complex wrapper.  It prepares a
 * complex FFT state of length cfg->ncfft for the packed real input, then
 * recombines the complex result with ne10_fft_split_r2c_1d_int16_neon.
 */
        ne10_int32_t scaled_flag)
    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;
    ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
/*
 * ne10_fft_c2r_1d_int16_neon: inverse complex-to-real wrapper.  It merges the
 * half spectrum with ne10_fft_split_c2r_1d_int16_neon and then runs the
 * length cfg->ncfft complex inverse transform described by c2c_state.
 */
        ne10_int32_t scaled_flag)
    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;
    ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_cfg_int16_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
    Mixed radix-2/4 complex FFT/IFFT of 16-bit fixed-point data.

void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout, ne10_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
    Mixed radix-2/4 FFT (real to complex) of int16 data.

void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
    Mixed radix-2/4 IFFT (complex to real) of int16 data.

Structure for the 16-bit fixed-point FFT functions.
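
A minimal usage sketch of the complex transform follows. It assumes the public NE10_dsp.h header, the NE10_FREE macro from NE10_macros.h, and the configuration allocator ne10_fft_alloc_c2c_int16 from the library's public API (check the exact allocator name in your NE10 version); the last two arguments follow the prototype above (inverse_fft, scaled_flag).

    #include "NE10_dsp.h"

    void example_c2c_int16 (void)
    {
        ne10_int32_t nfft = 1024;                 /* power-of-two transform length */
        ne10_fft_cpx_int16_t in[1024];            /* Q15 input samples */
        ne10_fft_cpx_int16_t out[1024];
        ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (nfft);   /* assumed allocator: twiddles, factors, buffer */

        /* ... fill in[] with Q15 samples ... */

        ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);   /* forward, scaled: result divided by nfft */
        ne10_fft_c2c_1d_int16_neon (in, out, cfg, 1, 0);   /* inverse, unscaled */

        NE10_FREE (cfg);                          /* cfg was allocated by the library */
    }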