Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
macros.h
1/*
2 * Copyright 2011-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the <organization> nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM Limited and Contributors. BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : common/macros.h
30 */
31
32#include "factor.h"
33
34// Macros used in actual implementations
35
37
/*
 * NE10_XC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that instantiates NE10_TEMPLATE_XC_OPERATION_X_C
 * with two arguments: the NE10_CHECKPOINTER_DstSrcCst_OPERATION check
 * (terminated with ';') followed by the caller-supplied loopCode.
 *
 * loopCode: statement(s) forwarded unchanged to the template.
 *
 * The template and the pointer check are not defined in this file
 * (presumably they come from factor.h -- confirm there which identifiers
 * loopCode is allowed to reference).
 */
#define NE10_XC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        loopCode); \
}
43
/*
 * NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares n_cst, a float32x4_t whose four lanes are all initialised
 *      from `cst` (which must be visible at the expansion site), and
 *   2. instantiates NE10_DstSrcCst_OPERATION_FLOAT_NEON with the
 *      NE10_CHECKPOINTER_DstSrcCst_OPERATION check, loopCode1 wrapped in
 *      NE10_DstSrcCst_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 *      NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON.
 *
 * loopCode1: code handed to the main-loop wrapper.
 * loopCode2: code handed to the second-loop wrapper.
 */
#define NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstSrcCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
52
/*
 * NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrcCst_OPERATION_VEC2F_NEON with, in order:
 *   - the NE10_CHECKPOINTER_DstSrcCst_OPERATION check,
 *   - loopCode1 wrapped in NE10_DstSrcCst_MAINLOOP_VEC2F_NEON,
 *   - loopCode2 wrapped in NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON.
 *
 * Unlike the FLOAT variant, no local variables are declared here.
 */
#define NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrcCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
60
/*
 * NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Original author's note: this macro uses interleaving to boost the
 * performance (presumably inside the VEC3F templates it instantiates,
 * which are defined outside this file).
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrcCst_OPERATION_VEC3F_NEON with the
 * NE10_CHECKPOINTER_DstSrcCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstSrcCst_MAINLOOP_VEC3F_NEON and loopCode2 wrapped in
 * NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON.
 */
#define NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrcCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
69
/*
 * NE10_XC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrcCst_OPERATION_VEC4F_NEON with the
 * NE10_CHECKPOINTER_DstSrcCst_OPERATION check and loopCode wrapped in
 * NE10_DstSrcCst_MAINLOOP_VEC4F_NEON. This variant takes a single loop
 * body; no second-loop argument is passed.
 */
#define NE10_XC_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
76
78
/*
 * NE10_MLAC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that instantiates NE10_TEMPLATE_XC_OPERATION_X_C
 * with the NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check (terminated with
 * ';') followed by the caller-supplied loopCode.
 *
 * Same shape as NE10_XC_OPERATION_X_C; only the pointer check differs.
 */
#define NE10_MLAC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        loopCode); \
}
84
/*
 * NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares n_acc (float32x4_t, left uninitialised -- the loop code or
 *      the templates are expected to assign it before reading it),
 *   2. declares n_cst with all four lanes initialised from `cst`, which
 *      must be visible at the expansion site, and
 *   3. instantiates NE10_DstAccSrcCst_OPERATION_FLOAT_NEON with the
 *      NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check, loopCode1 wrapped in
 *      NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 *      NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON.
 */
#define NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstAccSrcCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
94
/*
 * NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_acc (float32x4_t, left
 * uninitialised) and then instantiates
 * NE10_DstAccSrcCst_OPERATION_VEC2F_NEON with the
 * NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON and loopCode2 wrapped in
 * NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON.
 */
#define NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    NE10_DstAccSrcCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
103
/*
 * NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares three uninitialised float32x4_t
 * locals (n_acc1, n_acc2, n_acc3) and then instantiates
 * NE10_DstAccSrcCst_OPERATION_VEC3F_NEON with the
 * NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON and loopCode2 wrapped in
 * NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON.
 */
#define NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc1, n_acc2, n_acc3; \
    NE10_DstAccSrcCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
112
/*
 * NE10_MLAC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that declares n_acc (float32x4_t, left
 * uninitialised) and then instantiates
 * NE10_DstAccSrcCst_OPERATION_VEC4F_NEON with the
 * NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check and loopCode wrapped in
 * NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON. No second-loop argument is passed.
 */
#define NE10_MLAC_OPERATION_VEC4F_NEON(loopCode) { \
    float32x4_t n_acc; \
    NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
120
122
/*
 * NE10_SETC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that instantiates NE10_TEMPLATE_XC_OPERATION_X_C
 * with the NE10_CHECKPOINTER_DstCst_OPERATION check (terminated with ';')
 * followed by the caller-supplied loopCode.
 *
 * Same shape as NE10_XC_OPERATION_X_C; only the pointer check differs.
 */
#define NE10_SETC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        loopCode); \
}
128
/*
 * NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_cst (float32x4_t with all four
 * lanes initialised from `cst`, which must be visible at the expansion
 * site) and then instantiates NE10_DstCst_OPERATION_FLOAT_NEON with the
 * NE10_CHECKPOINTER_DstCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstCst_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 * NE10_DstCst_SECONDLOOP_FLOAT_NEON.
 */
#define NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
137
/*
 * NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that instantiates
 * NE10_DstCst_OPERATION_VEC2F_NEON with the
 * NE10_CHECKPOINTER_DstCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstCst_MAINLOOP_VEC2F_NEON and loopCode2 wrapped in
 * NE10_DstCst_SECONDLOOP_VEC2F_NEON. No locals are declared here.
 */
#define NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
145
/*
 * NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Original author's note: this macro uses interleaving to boost the
 * performance (presumably inside the VEC3F templates it instantiates,
 * which are defined outside this file).
 *
 * Expands to a braced block that instantiates
 * NE10_DstCst_OPERATION_VEC3F_NEON with the
 * NE10_CHECKPOINTER_DstCst_OPERATION check, loopCode1 wrapped in
 * NE10_DstCst_MAINLOOP_VEC3F_NEON and loopCode2 wrapped in
 * NE10_DstCst_SECONDLOOP_VEC3F_NEON.
 */
#define NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
154
/*
 * NE10_SETC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that instantiates
 * NE10_DstCst_OPERATION_VEC4F_NEON with the
 * NE10_CHECKPOINTER_DstCst_OPERATION check and loopCode wrapped in
 * NE10_DstCst_MAINLOOP_VEC4F_NEON. No second-loop argument is passed.
 */
#define NE10_SETC_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
161
163
/*
 * NE10_X_OPERATION_FLOAT_C(loopCode)
 *
 * Expands to a braced block that instantiates NE10_TEMPLATE_XC_OPERATION_X_C
 * with the NE10_CHECKPOINTER_DstSrc1Src2_OPERATION check (terminated with
 * ';') followed by the caller-supplied loopCode.
 *
 * NE10_DOT_OPERATION_X_C in this file is defined as an alias of this macro.
 */
#define NE10_X_OPERATION_FLOAT_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
        loopCode); \
}
169
/*
 * NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_src2 (float32x4_t, left
 * uninitialised) and then instantiates
 * NE10_DstSrc1Src2_OPERATION_FLOAT_NEON with the
 * NE10_CHECKPOINTER_DstSrc1Src2_OPERATION check, loopCode1 wrapped in
 * NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 * NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON.
 */
#define NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_src2; \
    NE10_DstSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
        NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
178
/*
 * NE10_DOT_OPERATION_X_C is an alias of NE10_X_OPERATION_FLOAT_C: invoking
 * it with (loopCode) expands exactly as that macro does.
 */
#define NE10_DOT_OPERATION_X_C NE10_X_OPERATION_FLOAT_C
180
182
/*
 * NE10_ABS_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that instantiates NE10_TEMPLATE_XC_OPERATION_X_C
 * with the NE10_CHECKPOINTER_DstSrc_OPERATION check followed by the
 * caller-supplied loopCode.
 *
 * Several other macros in this file (NE10_ABS_OPERATION_FLOAT_C,
 * NE10_LEN_OPERATION_X_C, NE10_CMATVEC_OPERATION_X_C,
 * NE10_DETMAT_OPERATION_X_C) are aliases of this one.
 *
 * NOTE(review): unlike the other *_X_C wrappers in this file, the check
 * argument is passed without a trailing ';'. Whether that matters depends
 * on how the template and the check macro are written -- confirm against
 * their definitions before normalising.
 */
#define NE10_ABS_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrc_OPERATION, \
        loopCode); \
}
188
/*
 * NE10_ABS_OPERATION_FLOAT_C is an alias of NE10_ABS_OPERATION_X_C:
 * invoking it with (loopCode) expands exactly as that macro does.
 */
#define NE10_ABS_OPERATION_FLOAT_C NE10_ABS_OPERATION_X_C
190
/*
 * NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares a local `cst` of type arm_float_t set to 0.0f (per the
 *      original comment, the value the inputs are compared against),
 *   2. declares n_cst, a float32x4_t whose four lanes are initialised from
 *      that local `cst`, and
 *   3. instantiates NE10_DstSrc_OPERATION_FLOAT_NEON with the
 *      NE10_CHECKPOINTER_DstSrc_OPERATION check, loopCode1 wrapped in
 *      NE10_DstSrc_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 *      NE10_DstSrc_SECONDLOOP_FLOAT_NEON.
 *
 * Because `cst` is declared inside the block, it shadows any `cst` that
 * exists at the expansion site.
 */
#define NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    arm_float_t cst = 0.0f; /* this is used to compare the values against. */ \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstSrc_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrc_OPERATION; , \
        NE10_DstSrc_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrc_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
200
/*
 * NE10_LEN_OPERATION_X_C and NE10_CMATVEC_OPERATION_X_C are aliases of
 * NE10_ABS_OPERATION_X_C: invoking either with (loopCode) expands exactly
 * as that macro does.
 *
 * NE10_LEN_OPERATION_X_C is defined once; an identical second definition
 * added nothing.
 */
#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C

#define NE10_CMATVEC_OPERATION_X_C NE10_ABS_OPERATION_X_C
206
/*
 * NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrc_OPERATION_VEC2F_NEON with, in order:
 *   - the NE10_CHECKPOINTER_DstSrcCst_OPERATION check,
 *   - loopCode1 wrapped in NE10_DstSrc_MAINLOOP_VEC2F_NEON,
 *   - loopCode2 wrapped in NE10_DstSrc_SECONDLOOP_VEC2F_NEON.
 * The arguments are passed without trailing ';'.
 *
 * NOTE(review): the check is the DstSrcCst variant although the template
 * family is DstSrc (NE10_ABS_OPERATION_FLOAT_NEON uses
 * NE10_CHECKPOINTER_DstSrc_OPERATION). Confirm against the check macros'
 * definitions whether the two are interchangeable.
 */
#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrc_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode1), \
        NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode2) \
    ); \
}
214
/*
 * NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrc_OPERATION_VEC3F_NEON with, in order:
 *   - the NE10_CHECKPOINTER_DstSrcCst_OPERATION check,
 *   - loopCode1 wrapped in NE10_DstSrc_MAINLOOP_VEC3F_NEON,
 *   - loopCode2 wrapped in NE10_DstSrc_SECONDLOOP_VEC3F_NEON.
 * The arguments are passed without trailing ';'.
 *
 * NOTE(review): the check is the DstSrcCst variant although the template
 * family is DstSrc -- confirm against the check macros' definitions
 * whether the two are interchangeable.
 */
#define NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrc_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode1), \
        NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode2) \
    ); \
}
222
/*
 * NE10_LEN_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that instantiates
 * NE10_DstSrc_OPERATION_VEC4F_NEON with the
 * NE10_CHECKPOINTER_DstSrcCst_OPERATION check and loopCode wrapped in
 * NE10_DstSrc_MAINLOOP_VEC4F_NEON. The arguments are passed without
 * trailing ';' and no second-loop argument is supplied.
 *
 * NOTE(review): the check is the DstSrcCst variant although the template
 * family is DstSrc -- confirm against the check macros' definitions
 * whether the two are interchangeable.
 */
#define NE10_LEN_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstSrc_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) \
    ); \
}
229
/*
 * NE10_DETMAT_OPERATION_X_C is an alias of NE10_ABS_OPERATION_X_C:
 * invoking it with (loopCode) expands exactly as that macro does.
 */
#define NE10_DETMAT_OPERATION_X_C NE10_ABS_OPERATION_X_C
231
233
/*
 * NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares two uninitialised float32x4_t
 * locals (n_acc and n_src2) and then instantiates
 * NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON with the
 * NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION check, loopCode1 wrapped in
 * NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON and loopCode2 wrapped in
 * NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON.
 */
#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    float32x4_t n_src2; \
    NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; , \
        NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}