Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_fir.c
1/*
2 * Copyright 2012-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : dsp/NE10_fir.c
30 */
31
32#include "NE10_types.h"
33
122 ne10_float32_t * pSrc,
123 ne10_float32_t * pDst,
124 ne10_uint32_t blockSize)
125{
126
127 ne10_float32_t *pState = S->pState; /* State pointer */
128 ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
129 ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
130 ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
131 ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
132 ne10_uint32_t i, tapCnt, blkCnt; /* Loop counters */
133
134 /* Run the below code for Cortex-M4 and Cortex-M3 */
135
136 ne10_float32_t acc0, acc1, acc2, acc3; /* Accumulators */
137 ne10_float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
138
139
140 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
141 /* pStateCurnt points to the location where the new input data should be written */
142 pStateCurnt = & (S->pState[ (numTaps - 1u)]);
143
144 /* Apply loop unrolling and compute 4 output values simultaneously.
145 * The variables acc0 ... acc3 hold output values that are being computed:
146 *
147 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
148 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
149 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
150 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
151 */
152 blkCnt = blockSize >> 2;
153
154 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
155 ** a second loop below computes the remaining 1 to 3 samples. */
156 while (blkCnt > 0u)
157 {
158 /* Copy four new input samples into the state buffer */
159 *pStateCurnt++ = *pSrc++;
160 *pStateCurnt++ = *pSrc++;
161 *pStateCurnt++ = *pSrc++;
162 *pStateCurnt++ = *pSrc++;
163
164 /* Set all accumulators to zero */
165 acc0 = 0.0f;
166 acc1 = 0.0f;
167 acc2 = 0.0f;
168 acc3 = 0.0f;
169
170 /* Initialize state pointer */
171 px = pState;
172
173 /* Initialize coeff pointer */
174 pb = (pCoeffs);
175
176 /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
177 x0 = *px++;
178 x1 = *px++;
179 x2 = *px++;
180
181 /* Loop unrolling. Process 4 taps at a time. */
182 tapCnt = numTaps >> 2u;
183
184 /* Loop over the number of taps. Unroll by a factor of 4.
185 ** Repeat until we've computed numTaps-4 coefficients. */
186 while (tapCnt > 0u)
187 {
188 /* Read the b[numTaps-1] coefficient */
189 c0 = * (pb++);
190
191 /* Read x[n-numTaps-3] sample */
192 x3 = * (px++);
193
194 /* acc0 += b[numTaps-1] * x[n-numTaps] */
195 acc0 += x0 * c0;
196
197 /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
198 acc1 += x1 * c0;
199
200 /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
201 acc2 += x2 * c0;
202
203 /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
204 acc3 += x3 * c0;
205
206 /* Read the b[numTaps-2] coefficient */
207 c0 = * (pb++);
208
209 /* Read x[n-numTaps-4] sample */
210 x0 = * (px++);
211
212 /* Perform the multiply-accumulate */
213 acc0 += x1 * c0;
214 acc1 += x2 * c0;
215 acc2 += x3 * c0;
216 acc3 += x0 * c0;
217
218 /* Read the b[numTaps-3] coefficient */
219 c0 = * (pb++);
220
221 /* Read x[n-numTaps-5] sample */
222 x1 = * (px++);
223
224 /* Perform the multiply-accumulates */
225 acc0 += x2 * c0;
226 acc1 += x3 * c0;
227 acc2 += x0 * c0;
228 acc3 += x1 * c0;
229
230 /* Read the b[numTaps-4] coefficient */
231 c0 = * (pb++);
232
233 /* Read x[n-numTaps-6] sample */
234 x2 = * (px++);
235
236 /* Perform the multiply-accumulates */
237 acc0 += x3 * c0;
238 acc1 += x0 * c0;
239 acc2 += x1 * c0;
240 acc3 += x2 * c0;
241
242 tapCnt--;
243 }
244
245 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
246 tapCnt = numTaps % 0x4u;
247
248 while (tapCnt > 0u)
249 {
250 /* Read coefficients */
251 c0 = * (pb++);
252
253 /* Fetch 1 state variable */
254 x3 = * (px++);
255
256 /* Perform the multiply-accumulates */
257 acc0 += x0 * c0;
258 acc1 += x1 * c0;
259 acc2 += x2 * c0;
260 acc3 += x3 * c0;
261
262 /* Reuse the present sample states for next sample */
263 x0 = x1;
264 x1 = x2;
265 x2 = x3;
266
267 /* Decrement the loop counter */
268 tapCnt--;
269 }
270
271 /* Advance the state pointer by 4 to process the next group of 4 samples */
272 pState = pState + 4;
273
274 /* Store the results held in the 4 accumulators in the destination buffer. */
275 *pDst++ = acc0;
276 *pDst++ = acc1;
277 *pDst++ = acc2;
278 *pDst++ = acc3;
279
280 blkCnt--;
281 }
282
283 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
284 ** No loop unrolling is used. */
285 blkCnt = blockSize % 0x4u;
286
287 while (blkCnt > 0u)
288 {
289 /* Copy one sample at a time into state buffer */
290 *pStateCurnt++ = *pSrc++;
291
292 /* Set the accumulator to zero */
293 acc0 = 0.0f;
294
295 /* Initialize state pointer */
296 px = pState;
297
298 /* Initialize Coefficient pointer */
299 pb = (pCoeffs);
300
301 i = numTaps;
302
303 /* Perform the multiply-accumulates */
304 do
305 {
306 acc0 += *px++ * *pb++;
307 i--;
308
309 }
310 while (i > 0u);
311
312 /* The result is stored in the destination buffer. */
313 *pDst++ = acc0;
314
315 /* Advance state pointer by 1 for the next sample */
316 pState = pState + 1;
317
318 blkCnt--;
319 }
320
321 /* Processing is complete.
322 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
323 ** This prepares the state buffer for the next function call. */
324
325 /* Points to the start of the state buffer */
326 pStateCurnt = S->pState;
327
328 tapCnt = (numTaps - 1u) >> 2u;
329
330 /* copy data */
331 while (tapCnt > 0u)
332 {
333 *pStateCurnt++ = *pState++;
334 *pStateCurnt++ = *pState++;
335 *pStateCurnt++ = *pState++;
336 *pStateCurnt++ = *pState++;
337
338 /* Decrement the loop counter */
339 tapCnt--;
340 }
341
342 /* Calculate remaining number of copies */
343 tapCnt = (numTaps - 1u) % 0x4u;
344
345 /* Copy the remaining ne10_float32_t samples */
346 while (tapCnt > 0u)
347 {
348 *pStateCurnt++ = *pState++;
349
350 /* Decrement the loop counter */
351 tapCnt--;
352 }
353
354}
//end of FIR group
356
453 ne10_float32_t * pSrc,
454 ne10_float32_t * pDst,
455 ne10_uint32_t blockSize)
456{
457 ne10_float32_t *pState = S->pState; /* State pointer */
458 ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
459 ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
460 ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
461 ne10_float32_t sum0; /* Accumulator */
462 ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
463 ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
464 ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
465
466
467 /* Run the below code for Cortex-M4 and Cortex-M3 */
468
469 /* S->pState buffer contains previous frame (numTaps - 1) samples */
470 /* pStateCurnt points to the location where the new input data should be written */
471 pStateCurnt = S->pState + (numTaps - 1u);
472
473 /* Total number of output samples to be computed */
474 blkCnt = outBlockSize;
475
476 while (blkCnt > 0u)
477 {
478 /* Copy decimation factor number of new input samples into the state buffer */
479 i = S->M;
480
481 do
482 {
483 *pStateCurnt++ = *pSrc++;
484
485 }
486 while (--i);
487
488 /* Set accumulator to zero */
489 sum0 = 0.0f;
490
491 /* Initialize state pointer */
492 px = pState;
493
494 /* Initialize coeff pointer */
495 pb = pCoeffs;
496
497 /* Loop unrolling. Process 4 taps at a time. */
498 tapCnt = numTaps >> 2;
499
500 /* Loop over the number of taps. Unroll by a factor of 4.
501 ** Repeat until we've computed numTaps-4 coefficients. */
502 while (tapCnt > 0u)
503 {
504 /* Read the b[numTaps-1] coefficient */
505 c0 = * (pb++);
506
507 /* Read x[n-numTaps-1] sample */
508 x0 = * (px++);
509
510 /* Perform the multiply-accumulate */
511 sum0 += x0 * c0;
512
513 /* Read the b[numTaps-2] coefficient */
514 c0 = * (pb++);
515
516 /* Read x[n-numTaps-2] sample */
517 x0 = * (px++);
518
519 /* Perform the multiply-accumulate */
520 sum0 += x0 * c0;
521
522 /* Read the b[numTaps-3] coefficient */
523 c0 = * (pb++);
524
525 /* Read x[n-numTaps-3] sample */
526 x0 = * (px++);
527
528 /* Perform the multiply-accumulate */
529 sum0 += x0 * c0;
530
531 /* Read the b[numTaps-4] coefficient */
532 c0 = * (pb++);
533
534 /* Read x[n-numTaps-4] sample */
535 x0 = * (px++);
536
537 /* Perform the multiply-accumulate */
538 sum0 += x0 * c0;
539
540 /* Decrement the loop counter */
541 tapCnt--;
542 }
543
544 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
545 tapCnt = numTaps % 0x4u;
546
547 while (tapCnt > 0u)
548 {
549 /* Read coefficients */
550 c0 = * (pb++);
551
552 /* Fetch 1 state variable */
553 x0 = * (px++);
554
555 /* Perform the multiply-accumulate */
556 sum0 += x0 * c0;
557
558 /* Decrement the loop counter */
559 tapCnt--;
560 }
561
562 /* Advance the state pointer by the decimation factor
563 * to process the next group of decimation factor number samples */
564 pState = pState + S->M;
565
566 /* The result is in the accumulator, store in the destination buffer. */
567 *pDst++ = sum0;
568
569 /* Decrement the loop counter */
570 blkCnt--;
571 }
572
573 /* Processing is complete.
574 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
575 ** This prepares the state buffer for the next function call. */
576
577 /* Points to the start of the state buffer */
578 pStateCurnt = S->pState;
579
580 i = (numTaps - 1u) >> 2;
581
582 /* copy data */
583 while (i > 0u)
584 {
585 *pStateCurnt++ = *pState++;
586 *pStateCurnt++ = *pState++;
587 *pStateCurnt++ = *pState++;
588 *pStateCurnt++ = *pState++;
589
590 /* Decrement the loop counter */
591 i--;
592 }
593
594 i = (numTaps - 1u) % 0x04u;
595
596 /* copy data */
597 while (i > 0u)
598 {
599 *pStateCurnt++ = *pState++;
600
601 /* Decrement the loop counter */
602 i--;
603 }
604
605}
//end of FIR_Decimate group
607
608
713 ne10_float32_t * pSrc,
714 ne10_float32_t * pDst,
715 ne10_uint32_t blockSize)
716{
717 ne10_float32_t *pState = S->pState; /* State pointer */
718 ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
719 ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
720 ne10_float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
721
722
723 /* Run the below code for Cortex-M4 and Cortex-M3 */
724
725 ne10_float32_t sum0; /* Accumulators */
726 ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
727 ne10_uint32_t i, blkCnt, j; /* Loop counters */
728 ne10_uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
729
730
731 /* S->pState buffer contains previous frame (phaseLen - 1) samples */
732 /* pStateCurnt points to the location where the new input data should be written */
733 pStateCurnt = S->pState + (phaseLen - 1u);
734
735 /* Total number of input samples */
736 blkCnt = blockSize;
737
738 /* Loop over the blockSize. */
739 while (blkCnt > 0u)
740 {
741 /* Copy new input sample into the state buffer */
742 *pStateCurnt++ = *pSrc++;
743
744 /* Address modifier index of coefficient buffer */
745 j = 1u;
746
747 /* Loop over the Interpolation factor. */
748 i = S->L;
749 while (i > 0u)
750 {
751 /* Set accumulator to zero */
752 sum0 = 0.0f;
753
754 /* Initialize state pointer */
755 ptr1 = pState;
756
757 /* Initialize coefficient pointer */
758 ptr2 = pCoeffs + (S->L - j);
759
760 /* Loop over the polyPhase length. Unroll by a factor of 4.
761 ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
762 tapCnt = phaseLen >> 2u;
763 while (tapCnt > 0u)
764 {
765
766 /* Read the coefficient */
767 c0 = * (ptr2);
768
769 /* Upsampling is done by stuffing L-1 zeros between each sample.
770 * So instead of multiplying zeros with coefficients,
771 * Increment the coefficient pointer by interpolation factor times. */
772 ptr2 += S->L;
773
774 /* Read the input sample */
775 x0 = * (ptr1++);
776
777 /* Perform the multiply-accumulate */
778 sum0 += x0 * c0;
779
780 /* Read the coefficient */
781 c0 = * (ptr2);
782
783 /* Increment the coefficient pointer by interpolation factor times. */
784 ptr2 += S->L;
785
786 /* Read the input sample */
787 x0 = * (ptr1++);
788
789 /* Perform the multiply-accumulate */
790 sum0 += x0 * c0;
791
792 /* Read the coefficient */
793 c0 = * (ptr2);
794
795 /* Increment the coefficient pointer by interpolation factor times. */
796 ptr2 += S->L;
797
798 /* Read the input sample */
799 x0 = * (ptr1++);
800
801 /* Perform the multiply-accumulate */
802 sum0 += x0 * c0;
803
804 /* Read the coefficient */
805 c0 = * (ptr2);
806
807 /* Increment the coefficient pointer by interpolation factor times. */
808 ptr2 += S->L;
809
810 /* Read the input sample */
811 x0 = * (ptr1++);
812
813 /* Perform the multiply-accumulate */
814 sum0 += x0 * c0;
815
816 /* Decrement the loop counter */
817 tapCnt--;
818 }
819
820 /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
821 tapCnt = phaseLen % 0x4u;
822
823 while (tapCnt > 0u)
824 {
825 /* Perform the multiply-accumulate */
826 sum0 += * (ptr1++) * (*ptr2);
827
828 /* Increment the coefficient pointer by interpolation factor times. */
829 ptr2 += S->L;
830
831 /* Decrement the loop counter */
832 tapCnt--;
833 }
834
835 /* The result is in the accumulator, store in the destination buffer. */
836 *pDst++ = sum0;
837
838 /* Increment the address modifier index of coefficient buffer */
839 j++;
840
841 /* Decrement the loop counter */
842 i--;
843 }
844
845 /* Advance the state pointer by 1
846 * to process the next group of interpolation factor number samples */
847 pState = pState + 1;
848
849 /* Decrement the loop counter */
850 blkCnt--;
851 }
852
853 /* Processing is complete.
854 ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
855 ** This prepares the state buffer for the next function call. */
856
857 /* Points to the start of the state buffer */
858 pStateCurnt = S->pState;
859
860 tapCnt = (phaseLen - 1u) >> 2u;
861
862 /* copy data */
863 while (tapCnt > 0u)
864 {
865 *pStateCurnt++ = *pState++;
866 *pStateCurnt++ = *pState++;
867 *pStateCurnt++ = *pState++;
868 *pStateCurnt++ = *pState++;
869
870 /* Decrement the loop counter */
871 tapCnt--;
872 }
873
874 tapCnt = (phaseLen - 1u) % 0x04u;
875
876 while (tapCnt > 0u)
877 {
878 *pStateCurnt++ = *pState++;
879
880 /* Decrement the loop counter */
881 tapCnt--;
882 }
883
884}
//end of FIR_interpolate group
886
887
973 ne10_float32_t * pSrc,
974 ne10_float32_t * pDst,
975 ne10_uint32_t blockSize)
976{
977 ne10_float32_t *pState; /* State pointer */
978 ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
979 ne10_float32_t *px; /* temporary state pointer */
980 ne10_float32_t *pk; /* temporary coefficient pointer */
981
982
983 /* Run the below code for Cortex-M4 and Cortex-M3 */
984
985 ne10_float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
986 ne10_float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
987 ne10_float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
988 ne10_float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
989 ne10_uint32_t numStages = S->numStages; /* Number of stages in the filter */
990 ne10_uint32_t blkCnt, stageCnt; /* temporary variables for counts */
991
992 gcurr1 = 0.0f;
993 pState = &S->pState[0];
994
995 blkCnt = blockSize >> 2;
996
997 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
998 a second loop below computes the remaining 1 to 3 samples. */
999 while (blkCnt > 0u)
1000 {
1001
1002 /* Read two samples from input buffer */
1003 /* f0(n) = x(n) */
1004 fcurr1 = *pSrc++;
1005 fcurr2 = *pSrc++;
1006
1007 /* Initialize coeff pointer */
1008 pk = (pCoeffs);
1009
1010 /* Initialize state pointer */
1011 px = pState;
1012
1013 /* Read g0(n-1) from state */
1014 gcurr1 = *px;
1015
1016 /* Process first sample for first tap */
1017 /* f1(n) = f0(n) + K1 * g0(n-1) */
1018 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1019 /* g1(n) = f0(n) * K1 + g0(n-1) */
1020 gnext1 = (fcurr1 * (*pk)) + gcurr1;
1021
1022 /* Process second sample for first tap */
1023 /* for sample 2 processing */
1024 fnext2 = fcurr2 + ( (*pk) * fcurr1);
1025 gnext2 = (fcurr2 * (*pk)) + fcurr1;
1026
1027 /* Read next two samples from input buffer */
1028 /* f0(n+2) = x(n+2) */
1029 fcurr3 = *pSrc++;
1030 fcurr4 = *pSrc++;
1031
1032 /* Copy only last input samples into the state buffer
1033 which will be used for next four samples processing */
1034 *px++ = fcurr4;
1035
1036 /* Process third sample for first tap */
1037 fnext3 = fcurr3 + ( (*pk) * fcurr2);
1038 gnext3 = (fcurr3 * (*pk)) + fcurr2;
1039
1040 /* Process fourth sample for first tap */
1041 fnext4 = fcurr4 + ( (*pk) * fcurr3);
1042 gnext4 = (fcurr4 * (*pk++)) + fcurr3;
1043
1044 /* Update of f values for next coefficient set processing */
1045 fcurr1 = fnext1;
1046 fcurr2 = fnext2;
1047 fcurr3 = fnext3;
1048 fcurr4 = fnext4;
1049
1050 /* Loop unrolling. Process 4 taps at a time . */
1051 stageCnt = (numStages - 1u) >> 2u;
1052
1053 /* Loop over the number of taps. Unroll by a factor of 4.
1054 ** Repeat until we've computed numStages-3 coefficients. */
1055
1056 /* Process 2nd, 3rd, 4th and 5th taps ... here */
1057 while (stageCnt > 0u)
1058 {
1059 /* Read g1(n-1), g3(n-1) .... from state */
1060 gcurr1 = *px;
1061
1062 /* save g1(n) in state buffer */
1063 *px++ = gnext4;
1064
1065 /* Process first sample for 2nd, 6th .. tap */
1066 /* Sample processing for K2, K6.... */
1067 /* f2(n) = f1(n) + K2 * g1(n-1) */
1068 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1069 /* Process second sample for 2nd, 6th .. tap */
1070 /* for sample 2 processing */
1071 fnext2 = fcurr2 + ( (*pk) * gnext1);
1072 /* Process third sample for 2nd, 6th .. tap */
1073 fnext3 = fcurr3 + ( (*pk) * gnext2);
1074 /* Process fourth sample for 2nd, 6th .. tap */
1075 fnext4 = fcurr4 + ( (*pk) * gnext3);
1076
1077 /* g2(n) = f1(n) * K2 + g1(n-1) */
1078 /* Calculation of state values for next stage */
1079 gnext4 = (fcurr4 * (*pk)) + gnext3;
1080 gnext3 = (fcurr3 * (*pk)) + gnext2;
1081 gnext2 = (fcurr2 * (*pk)) + gnext1;
1082 gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1083
1084
1085 /* Read g2(n-1), g4(n-1) .... from state */
1086 gcurr1 = *px;
1087
1088 /* save g2(n) in state buffer */
1089 *px++ = gnext4;
1090
1091 /* Sample processing for K3, K7.... */
1092 /* Process first sample for 3rd, 7th .. tap */
1093 /* f3(n) = f2(n) + K3 * g2(n-1) */
1094 fcurr1 = fnext1 + ( (*pk) * gcurr1);
1095 /* Process second sample for 3rd, 7th .. tap */
1096 fcurr2 = fnext2 + ( (*pk) * gnext1);
1097 /* Process third sample for 3rd, 7th .. tap */
1098 fcurr3 = fnext3 + ( (*pk) * gnext2);
1099 /* Process fourth sample for 3rd, 7th .. tap */
1100 fcurr4 = fnext4 + ( (*pk) * gnext3);
1101
1102 /* Calculation of state values for next stage */
1103 /* g3(n) = f2(n) * K3 + g2(n-1) */
1104 gnext4 = (fnext4 * (*pk)) + gnext3;
1105 gnext3 = (fnext3 * (*pk)) + gnext2;
1106 gnext2 = (fnext2 * (*pk)) + gnext1;
1107 gnext1 = (fnext1 * (*pk++)) + gcurr1;
1108
1109
1110 /* Read g1(n-1), g3(n-1) .... from state */
1111 gcurr1 = *px;
1112
1113 /* save g3(n) in state buffer */
1114 *px++ = gnext4;
1115
1116 /* Sample processing for K4, K8.... */
1117 /* Process first sample for 4th, 8th .. tap */
1118 /* f4(n) = f3(n) + K4 * g3(n-1) */
1119 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1120 /* Process second sample for 4th, 8th .. tap */
1121 /* for sample 2 processing */
1122 fnext2 = fcurr2 + ( (*pk) * gnext1);
1123 /* Process third sample for 4th, 8th .. tap */
1124 fnext3 = fcurr3 + ( (*pk) * gnext2);
1125 /* Process fourth sample for 4th, 8th .. tap */
1126 fnext4 = fcurr4 + ( (*pk) * gnext3);
1127
1128 /* g4(n) = f3(n) * K4 + g3(n-1) */
1129 /* Calculation of state values for next stage */
1130 gnext4 = (fcurr4 * (*pk)) + gnext3;
1131 gnext3 = (fcurr3 * (*pk)) + gnext2;
1132 gnext2 = (fcurr2 * (*pk)) + gnext1;
1133 gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1134
1135 /* Read g2(n-1), g4(n-1) .... from state */
1136 gcurr1 = *px;
1137
1138 /* save g4(n) in state buffer */
1139 *px++ = gnext4;
1140
1141 /* Sample processing for K5, K9.... */
1142 /* Process first sample for 5th, 9th .. tap */
1143 /* f5(n) = f4(n) + K5 * g4(n-1) */
1144 fcurr1 = fnext1 + ( (*pk) * gcurr1);
1145 /* Process second sample for 5th, 9th .. tap */
1146 fcurr2 = fnext2 + ( (*pk) * gnext1);
1147 /* Process third sample for 5th, 9th .. tap */
1148 fcurr3 = fnext3 + ( (*pk) * gnext2);
1149 /* Process fourth sample for 5th, 9th .. tap */
1150 fcurr4 = fnext4 + ( (*pk) * gnext3);
1151
1152 /* Calculation of state values for next stage */
1153 /* g5(n) = f4(n) * K5 + g4(n-1) */
1154 gnext4 = (fnext4 * (*pk)) + gnext3;
1155 gnext3 = (fnext3 * (*pk)) + gnext2;
1156 gnext2 = (fnext2 * (*pk)) + gnext1;
1157 gnext1 = (fnext1 * (*pk++)) + gcurr1;
1158
1159 stageCnt--;
1160 }
1161
1162 /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
1163 stageCnt = (numStages - 1u) % 0x4u;
1164
1165 while (stageCnt > 0u)
1166 {
1167 gcurr1 = *px;
1168
1169 /* save g value in state buffer */
1170 *px++ = gnext4;
1171
1172 /* Process four samples for last three taps here */
1173 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1174 fnext2 = fcurr2 + ( (*pk) * gnext1);
1175 fnext3 = fcurr3 + ( (*pk) * gnext2);
1176 fnext4 = fcurr4 + ( (*pk) * gnext3);
1177
1178 /* g1(n) = f0(n) * K1 + g0(n-1) */
1179 gnext4 = (fcurr4 * (*pk)) + gnext3;
1180 gnext3 = (fcurr3 * (*pk)) + gnext2;
1181 gnext2 = (fcurr2 * (*pk)) + gnext1;
1182 gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1183
1184 /* Update of f values for next coefficient set processing */
1185 fcurr1 = fnext1;
1186 fcurr2 = fnext2;
1187 fcurr3 = fnext3;
1188 fcurr4 = fnext4;
1189
1190 stageCnt--;
1191
1192 }
1193
1194 /* Store the results held in the 4 accumulators in the destination buffer. */
1195 /* y(n) = fN(n) */
1196 *pDst++ = fcurr1;
1197 *pDst++ = fcurr2;
1198 *pDst++ = fcurr3;
1199 *pDst++ = fcurr4;
1200
1201 blkCnt--;
1202 }
1203
1204 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
1205 ** No loop unrolling is used. */
1206 blkCnt = blockSize % 0x4u;
1207
1208 while (blkCnt > 0u)
1209 {
1210 /* f0(n) = x(n) */
1211 fcurr1 = *pSrc++;
1212
1213 /* Initialize coeff pointer */
1214 pk = (pCoeffs);
1215
1216 /* Initialize state pointer */
1217 px = pState;
1218
1219 /* read g2(n) from state buffer */
1220 gcurr1 = *px;
1221
1222 /* for sample 1 processing */
1223 /* f1(n) = f0(n) + K1 * g0(n-1) */
1224 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1225 /* g1(n) = f0(n) * K1 + g0(n-1) */
1226 gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1227
1228 /* save g1(n) in state buffer */
1229 *px++ = fcurr1;
1230
1231 /* f1(n) is saved in fcurr1
1232 for next stage processing */
1233 fcurr1 = fnext1;
1234
1235 stageCnt = (numStages - 1u);
1236
1237 /* stage loop */
1238 while (stageCnt > 0u)
1239 {
1240 /* read g2(n) from state buffer */
1241 gcurr1 = *px;
1242
1243 /* save g1(n) in state buffer */
1244 *px++ = gnext1;
1245
1246 /* Sample processing for K2, K3.... */
1247 /* f2(n) = f1(n) + K2 * g1(n-1) */
1248 fnext1 = fcurr1 + ( (*pk) * gcurr1);
1249 /* g2(n) = f1(n) * K2 + g1(n-1) */
1250 gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1251
1252 /* f1(n) is saved in fcurr1
1253 for next stage processing */
1254 fcurr1 = fnext1;
1255
1256 stageCnt--;
1257
1258 }
1259
1260 /* y(n) = fN(n) */
1261 *pDst++ = fcurr1;
1262
1263 blkCnt--;
1264
1265 }
1266
1267}
//end of FIR_Lattice group
1269
/**
 * Copy blockSize samples from a source buffer into a circular buffer.
 *
 * Samples are moved as 32-bit words; the sparse FIR routine in this file
 * passes its float buffers cast to ne10_int32_t *.
 *
 * @param circBuffer   base address of the circular buffer
 * @param L            length of the circular buffer, in elements
 * @param writeOffset  in: index of the first element to write;
 *                     out: index that follows the last element written
 * @param bufferInc    step added to the circular index after each sample
 *                     (may be negative)
 * @param src          source samples
 * @param srcInc       step added to the source pointer after each sample
 * @param blockSize    number of samples to copy
 *
 * The index is corrected by at most one buffer length per step, so the
 * magnitude of bufferInc must not exceed L.
 */
static void ne10_circular_write_float (ne10_int32_t * circBuffer,
                                       ne10_int32_t L,
                                       ne10_uint16_t * writeOffset,
                                       ne10_int32_t bufferInc,
                                       const ne10_int32_t * src,
                                       ne10_int32_t srcInc,
                                       ne10_uint32_t blockSize)
{
    ne10_uint32_t i;
    /* Work on a signed copy so that a backwards step can be detected. */
    ne10_int32_t wOffset = *writeOffset;

    for (i = blockSize; i > 0u; i--)
    {
        /* Store the current input sample at the current circular position */
        circBuffer[wOffset] = *src;
        src += srcInc;

        /* Advance the circular index and wrap it in either direction */
        wOffset += bufferInc;
        if (wOffset >= L)
        {
            wOffset -= L;
        }
        else if (wOffset < 0)
        {
            /* A negative increment stepped below the start of the buffer */
            wOffset += L;
        }
    }

    /* wOffset is back inside [0, L) here, so the narrowing is value-preserving
     * whenever L itself fits in the 16-bit offset type. */
    *writeOffset = (ne10_uint16_t) wOffset;
}
1311
1312
1313
/**
 * Copy blockSize samples out of a circular buffer into a destination buffer
 * that is itself wrapped at dst_base + dst_length.
 *
 * Samples are moved as 32-bit words; the sparse FIR routine in this file
 * passes its float buffers cast to ne10_int32_t *.
 *
 * @param circBuffer  base address of the circular buffer
 * @param L           length of the circular buffer, in elements
 * @param readOffset  in: index of the first element to read;
 *                    out: index that follows the last element read
 * @param bufferInc   step added to the circular index after each sample
 *                    (may be negative)
 * @param dst         first destination element to write
 * @param dst_base    start of the destination buffer
 * @param dst_length  length of the destination buffer, in elements
 * @param dstInc      step added to the destination pointer after each sample
 * @param blockSize   number of samples to copy
 *
 * The circular index is corrected by at most one buffer length per step, so
 * the magnitude of bufferInc must not exceed L.  The destination pointer is
 * reset to dst_base only when it lands exactly on dst_base + dst_length.
 */
static void ne10_circular_read_float (ne10_int32_t * circBuffer,
                                      ne10_int32_t L,
                                      ne10_int32_t * readOffset,
                                      ne10_int32_t bufferInc,
                                      ne10_int32_t * dst,
                                      ne10_int32_t * dst_base,
                                      ne10_int32_t dst_length,
                                      ne10_int32_t dstInc,
                                      ne10_uint32_t blockSize)
{
    ne10_uint32_t i;
    ne10_int32_t rOffset = *readOffset;
    ne10_int32_t *dst_end = dst_base + dst_length;

    for (i = blockSize; i > 0u; i--)
    {
        /* Fetch the sample at the current circular position */
        *dst = circBuffer[rOffset];

        /* Advance the destination, restarting at its base once the end is hit */
        dst += dstInc;
        if (dst == dst_end)
        {
            dst = dst_base;
        }

        /* Advance the circular index and wrap it in either direction */
        rOffset += bufferInc;
        if (rOffset >= L)
        {
            rOffset -= L;
        }
        else if (rOffset < 0)
        {
            /* A negative increment stepped below the start of the buffer */
            rOffset += L;
        }
    }

    /* Hand the updated index back to the caller */
    *readOffset = rOffset;
}
1366
1440 ne10_float32_t * pSrc,
1441 ne10_float32_t * pDst,
1442 ne10_float32_t * pScratchIn,
1443 ne10_uint32_t blockSize)
1444{
1445
1446 ne10_float32_t *pState = S->pState; /* State pointer */
1447 ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
1448 ne10_float32_t *px; /* Scratch buffer pointer */
1449 ne10_float32_t *py = pState; /* Temporary pointers for state buffer */
1450 ne10_float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
1451 ne10_float32_t *pOut; /* Destination pointer */
1452 ne10_int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
1453 ne10_uint32_t delaySize = S->maxDelay + blockSize; /* state length */
1454 ne10_uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
1455 ne10_int32_t readIndex; /* Read index of the state buffer */
1456 ne10_uint32_t tapCnt, blkCnt; /* loop counters */
1457 ne10_float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
1458
1459
1460
1461 /* BlockSize of Input samples are copied into the state buffer */
1462 /* StateIndex points to the starting position to write in the state buffer */
1463 ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1,
1464 (ne10_int32_t *) pSrc, 1, blockSize);
1465
1466
1467 /* Read Index, from where the state buffer should be read, is calculated. */
1468 readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1469
1470 /* Wraparound of readIndex */
1471 if (readIndex < 0)
1472 {
1473 readIndex += (ne10_int32_t) delaySize;
1474 }
1475
1476 /* Working pointer for state buffer is updated */
1477 py = pState;
1478
1479 /* blockSize samples are read from the state buffer */
1480 ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1481 (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1482 blockSize);
1483
1484 /* Working pointer for the scratch buffer */
1485 px = pb;
1486
1487 /* Working pointer for destination buffer */
1488 pOut = pDst;
1489
1490
1491 /* Run the below code for Cortex-M4 and Cortex-M3 */
1492
1493 /* Loop over the blockSize. Unroll by a factor of 4.
1494 * Compute 4 Multiplications at a time. */
1495 blkCnt = blockSize >> 2u;
1496
1497 while (blkCnt > 0u)
1498 {
1499 /* Perform Multiplications and store in destination buffer */
1500 *pOut++ = *px++ * coeff;
1501 *pOut++ = *px++ * coeff;
1502 *pOut++ = *px++ * coeff;
1503 *pOut++ = *px++ * coeff;
1504
1505 /* Decrement the loop counter */
1506 blkCnt--;
1507 }
1508
1509 /* If the blockSize is not a multiple of 4,
1510 * compute the remaining samples */
1511 blkCnt = blockSize % 0x4u;
1512
1513 while (blkCnt > 0u)
1514 {
1515 /* Perform Multiplications and store in destination buffer */
1516 *pOut++ = *px++ * coeff;
1517
1518 /* Decrement the loop counter */
1519 blkCnt--;
1520 }
1521
1522 /* Load the coefficient value and
1523 * increment the coefficient buffer for the next set of state values */
1524 coeff = *pCoeffs++;
1525
1526 /* Read Index, from where the state buffer should be read, is calculated. */
1527 readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1528
1529 /* Wraparound of readIndex */
1530 if (readIndex < 0)
1531 {
1532 readIndex += (ne10_int32_t) delaySize;
1533 }
1534
1535 /* Loop over the number of taps. */
1536 tapCnt = (ne10_uint32_t) numTaps - 1u;
1537
1538 while (tapCnt > 0u)
1539 {
1540
1541 /* Working pointer for state buffer is updated */
1542 py = pState;
1543
1544 /* blockSize samples are read from the state buffer */
1545 ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1546 (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1547 blockSize);
1548
1549 /* Working pointer for the scratch buffer */
1550 px = pb;
1551
1552 /* Working pointer for destination buffer */
1553 pOut = pDst;
1554
1555 /* Loop over the blockSize. Unroll by a factor of 4.
1556 * Compute 4 MACS at a time. */
1557 blkCnt = blockSize >> 2u;
1558
1559 while (blkCnt > 0u)
1560 {
1561 /* Perform Multiply-Accumulate */
1562 *pOut++ += *px++ * coeff;
1563 *pOut++ += *px++ * coeff;
1564 *pOut++ += *px++ * coeff;
1565 *pOut++ += *px++ * coeff;
1566
1567 /* Decrement the loop counter */
1568 blkCnt--;
1569 }
1570
1571 /* If the blockSize is not a multiple of 4,
1572 * compute the remaining samples */
1573 blkCnt = blockSize % 0x4u;
1574
1575 while (blkCnt > 0u)
1576 {
1577 /* Perform Multiply-Accumulate */
1578 *pOut++ += *px++ * coeff;
1579
1580 /* Decrement the loop counter */
1581 blkCnt--;
1582 }
1583
1584 /* Load the coefficient value and
1585 * increment the coefficient buffer for the next set of state values */
1586 coeff = *pCoeffs++;
1587
1588 /* Read Index, from where the state buffer should be read, is calculated. */
1589 readIndex = ( (ne10_int32_t) S->stateIndex -
1590 (ne10_int32_t) blockSize) - *pTapDelay++;
1591
1592 /* Wraparound of readIndex */
1593 if (readIndex < 0)
1594 {
1595 readIndex += (ne10_int32_t) delaySize;
1596 }
1597
1598 /* Decrement the tap loop counter */
1599 tapCnt--;
1600 }
1601
1602}
//end of FIR_sparse group
void ne10_fir_decimate_float_c(const ne10_fir_decimate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR decimator.
Definition NE10_fir.c:452
void ne10_fir_interpolate_float_c(const ne10_fir_interpolate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR interpolator.
Definition NE10_fir.c:712
void ne10_fir_lattice_float_c(const ne10_fir_lattice_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR lattice filter.
Definition NE10_fir.c:972
void ne10_fir_sparse_float_c(ne10_fir_sparse_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_float32_t *pScratchIn, ne10_uint32_t blockSize)
Processing function for the floating-point sparse FIR filter.
Definition NE10_fir.c:1439
void ne10_fir_float_c(const ne10_fir_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Definition NE10_fir.c:121
Instance structure for the floating-point FIR Decimation.
Definition NE10_types.h:385
ne10_uint8_t M
Decimation Factor.
Definition NE10_types.h:386
ne10_float32_t * pState
Points to the state variable array.
Definition NE10_types.h:389
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition NE10_types.h:388
ne10_uint16_t numTaps
Length of the filter.
Definition NE10_types.h:387
Instance structure for the floating-point FIR filter.
Definition NE10_types.h:365
ne10_float32_t * pState
Points to the state variable array.
Definition NE10_types.h:367
ne10_uint16_t numTaps
Length of the filter.
Definition NE10_types.h:366
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition NE10_types.h:368
Instance structure for the floating-point FIR Interpolation.
Definition NE10_types.h:396
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition NE10_types.h:399
ne10_float32_t * pState
Points to the state variable array.
Definition NE10_types.h:400
ne10_uint16_t phaseLength
Length of each polyphase filter component.
Definition NE10_types.h:398
ne10_uint8_t L
Interpolation Factor.
Definition NE10_types.h:397
Instance structure for the floating point FIR Lattice filter.
Definition NE10_types.h:375
ne10_float32_t * pState
Points to the state variable array.
Definition NE10_types.h:377
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition NE10_types.h:378
ne10_uint16_t numStages
Number of stages of the lattice filter.
Definition NE10_types.h:376
Instance structure for the floating-point FIR Sparse filter.
Definition NE10_types.h:407
ne10_uint16_t numTaps
Length of the filter.
Definition NE10_types.h:408
ne10_uint16_t maxDelay
The largest number of delay line values.
Definition NE10_types.h:412
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition NE10_types.h:411
ne10_uint16_t stateIndex
Index pointer for the state buffer.
Definition NE10_types.h:409
ne10_float32_t * pState
Points to the state variable array.
Definition NE10_types.h:410
ne10_int32_t * pTapDelay
Pointer to the array containing positions of the non-zero tap values.
Definition NE10_types.h:413