VCTR
Loading...
Searching...
No Matches
NeonRegister.h
1/*
2 ==============================================================================
3 DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
5 Copyright 2022- by sonible GmbH.
6
7 This file is part of VCTR - Versatile Container Templates Reconceptualized.
8
9 VCTR is free software: you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License version 3
11 only, as published by the Free Software Foundation.
12
13 VCTR is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Lesser General Public License version 3 for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 version 3 along with VCTR. If not, see <https://www.gnu.org/licenses/>.
20 ==============================================================================
21*/
22
23namespace vctr
24{
25
/** Unspecialised fallback of NeonRegister.

    It has no data members. broadcast discards its argument and returns a
    value-initialised (empty) instance. The element types that carry an actual
    NEON vector are provided as explicit specialisations.
 */
template <class T>
struct NeonRegister
{
    static constexpr NeonRegister broadcast (const T&) { return {}; }
};
31
32#if VCTR_ARM
33
34namespace detail
35{
36
37// clang-format off
38template <CompareOp, class>
39struct NeonCompare {};
40
41template <> struct NeonCompare<CompareOp::less, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vcltq_f32 (a, b); } };
42template <> struct NeonCompare<CompareOp::less, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vcltq_f64 (a, b); } };
43template <> struct NeonCompare<CompareOp::lessOrEqual, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vcleq_f32 (a, b); } };
44template <> struct NeonCompare<CompareOp::lessOrEqual, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vcleq_f64 (a, b); } };
45template <> struct NeonCompare<CompareOp::greater, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vcgtq_f32 (a, b); } };
46template <> struct NeonCompare<CompareOp::greater, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vcgtq_f64 (a, b); } };
47template <> struct NeonCompare<CompareOp::greaterOrEqual, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vcgeq_f32 (a, b); } };
48template <> struct NeonCompare<CompareOp::greaterOrEqual, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vcgeq_f64 (a, b); } };
49template <> struct NeonCompare<CompareOp::equal, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vceqq_f32 (a, b); } };
50template <> struct NeonCompare<CompareOp::equal, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vceqq_f64 (a, b); } };
51template <> struct NeonCompare<CompareOp::notEqual, float> { static auto cmp (float32x4_t a, float32x4_t b) { return vmvnq_u32 (vceqq_f32 (a, b)); } };
52template <> struct NeonCompare<CompareOp::notEqual, double> { static auto cmp (float64x2_t a, float64x2_t b) { return vreinterpretq_u64_u32 (vmvnq_u32 (vreinterpretq_u32_u64 (vceqq_f64 (a, b)))); } };
53// clang-format on
54}
55
56template <>
57struct NeonRegister<float>
58{
59 static constexpr size_t numElements = 4;
60
61 using NativeType = float32x4_t;
62 float32x4_t value;
63
64 //==============================================================================
65 // Loading
66 // clang-format off
67 static NeonRegister load (const float* d) { return { vld1q_f32 (d) }; }
68 static NeonRegister broadcast (float x) { return { vdupq_n_f32 (x) }; }
69
70 //==============================================================================
71 // Storing
72 void store (float* d) const { vst1q_f32 (d, value); }
73
74 //==============================================================================
75 // Generate Compare Masks
76 template <CompareOp op>
77 static NeonRegister compare (NeonRegister a, NeonRegister b) { return { vreinterpretq_f32_u32 (detail::NeonCompare<op, float>::cmp (a.value, b.value)) }; }
78
79 //==============================================================================
80 // Bit Operations
82 static NeonRegister bitwiseAndNot (NeonRegister a, NeonRegister b) { return { vreinterpretq_f32_u32 (vandq_u32 (vreinterpretq_u32_f32 (a.value), vmvnq_u32 (vreinterpretq_u32_f32 (b.value)))) }; }
83 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vreinterpretq_f32_u32 (vandq_u32 (vreinterpretq_u32_f32 (a.value), vreinterpretq_u32_f32 (b.value))) }; }
84 static NeonRegister bitwiseBlend (NeonRegister a, NeonRegister b, NeonRegister mask) { return { vbslq_f32 (vreinterpretq_u32_f32 (mask.value), b.value, a.value) }; }
85
86 //==============================================================================
87 // Math
88 static NeonRegister abs (NeonRegister x) { return { vabsq_f32 (x.value) }; }
89 static NeonRegister floor (NeonRegister x) { return { vrndmq_f32 (x.value) }; }
90 static NeonRegister ceil (NeonRegister x) { return { vrndpq_f32 (x.value) }; }
91 static NeonRegister mul (NeonRegister a, NeonRegister b) { return { vmulq_f32 (a.value, b.value) }; }
92 static NeonRegister div (NeonRegister a, NeonRegister b) { return { vdivq_f32 (a.value, b.value) }; }
93 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_f32 (a.value, b.value) }; }
94 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_f32 (a.value, b.value) }; }
95 static NeonRegister max (NeonRegister a, NeonRegister b) { return { vmaxq_f32 (a.value, b.value) }; }
96 static NeonRegister min (NeonRegister a, NeonRegister b) { return { vminq_f32 (a.value, b.value) }; }
97 static NeonRegister fma (NeonRegister a, NeonRegister b, NeonRegister c) { return { vfmaq_f32 (c.value, a.value, b.value) }; }
98 static NeonRegister fms (NeonRegister a, NeonRegister b, NeonRegister c) { return { vfmsq_f32 (c.value, a.value, b.value) }; }
99
100#if VCTR_APPLE
101 // The Apple Accelerate vfp function collection contains some optimised math functions that can be directly called
102 // on 128 bit float vectors and which are therefore compatible to float32x4_t arguments.
103 static NeonRegister exp (NeonRegister x) { return { vexpf (x.value) }; }
104 static NeonRegister exp2 (NeonRegister x) { return { vexp2f (x.value) }; }
105 static NeonRegister expm1 (NeonRegister x) { return { vexpm1f (x.value) }; }
106 static NeonRegister log (NeonRegister x) { return { vlogf (x.value) }; }
107 static NeonRegister log1p (NeonRegister x) { return { vlog1pf (x.value) }; }
108 static NeonRegister log10 (NeonRegister x) { return { vlog10f (x.value) }; }
109 static NeonRegister logb (NeonRegister x) { return { vlogbf (x.value) }; }
110 static NeonRegister log2 (NeonRegister x) { return { vlog2f (x.value) }; }
111 static NeonRegister sin (NeonRegister x) { return { vsinf (x.value) }; }
112 static NeonRegister cos (NeonRegister x) { return { vcosf (x.value) }; }
113 static NeonRegister tan (NeonRegister x) { return { vtanf (x.value) }; }
114 static NeonRegister sinh (NeonRegister x) { return { vsinhf (x.value) }; }
115 static NeonRegister cosh (NeonRegister x) { return { vcoshf (x.value) }; }
116 static NeonRegister tanh (NeonRegister x) { return { vtanhf (x.value) }; }
117 static NeonRegister asinh (NeonRegister x) { return { vasinhf (x.value) }; }
118 static NeonRegister acosh (NeonRegister x) { return { vacoshf (x.value) }; }
119 static NeonRegister atanh (NeonRegister x) { return { vatanhf (x.value) }; }
120 static NeonRegister pow (NeonRegister x, NeonRegister y) { return { vpowf (x.value, y.value) }; }
121 static NeonRegister pow (NeonRegister x, NeonRegister<int32_t> y);
122#endif
123
124 //==============================================================================
125 // Type conversion
126 static NeonRegister<int32_t> convertToInt (NeonRegister x);
127 static NeonRegister<int32_t> reinterpretAsInt (NeonRegister x);
128 // clang-format on
129};
130
131template <>
132struct NeonRegister<double>
133{
134 static constexpr size_t numElements = 2;
135
136 using NativeType = float64x2_t;
137 float64x2_t value;
138
139 //==============================================================================
140 // Loading
141 // clang-format off
142 static NeonRegister load (const double* d) { return { vld1q_f64 (d) }; }
143 static NeonRegister broadcast (double x) { return { vdupq_n_f64 (x) }; }
144
145 //==============================================================================
146 // Storing
147 void store (double* d) const { vst1q_f64 (d, value); }
148
149 //==============================================================================
150 // Generate Compare Masks
151 template <CompareOp op>
152 static NeonRegister compare (NeonRegister a, NeonRegister b) { return { vreinterpretq_f64_u64 (detail::NeonCompare<op, double>::cmp (a.value, b.value)) }; }
153
154 //==============================================================================
155 // Bit Operations
157 static NeonRegister bitwiseAndNot (NeonRegister a, NeonRegister b) { return { vreinterpretq_f64_u32 (vandq_u32 (vreinterpretq_u32_f64 (a.value), vmvnq_u32 (vreinterpretq_u32_f64 (b.value)))) }; }
158 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vreinterpretq_f64_u32 (vandq_u32 (vreinterpretq_u32_f64 (a.value), vreinterpretq_u32_f64 (b.value))) }; }
159 static NeonRegister bitwiseBlend (NeonRegister a, NeonRegister b, NeonRegister mask) { return { vbslq_f64 (vreinterpretq_u64_f64 (mask.value), b.value, a.value) }; }
160
161 //==============================================================================
162 // Math
163 static NeonRegister abs (NeonRegister x) { return { vabsq_f64 (x.value) }; }
164 static NeonRegister floor (NeonRegister x) { return { vrndmq_f64 (x.value) }; }
165 static NeonRegister ceil (NeonRegister x) { return { vrndpq_f64 (x.value) }; }
166 static NeonRegister mul (NeonRegister a, NeonRegister b) { return { vmulq_f64 (a.value, b.value) }; }
167 static NeonRegister div (NeonRegister a, NeonRegister b) { return { vdivq_f64 (a.value, b.value) }; }
168 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_f64 (a.value, b.value) }; }
169 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_f64 (a.value, b.value) }; }
170 static NeonRegister max (NeonRegister a, NeonRegister b) { return { vmaxq_f64 (a.value, b.value) }; }
171 static NeonRegister min (NeonRegister a, NeonRegister b) { return { vminq_f64 (a.value, b.value) }; }
172 static NeonRegister fma (NeonRegister a, NeonRegister b, NeonRegister c) { return { vfmaq_f64 (c.value, a.value, b.value) }; }
173 static NeonRegister fms (NeonRegister a, NeonRegister b, NeonRegister c) { return { vfmsq_f64 (c.value, a.value, b.value) }; }
174
175 //==============================================================================
176 // Type conversion
177 static NeonRegister<int64_t> convertToInt (NeonRegister x);
178 static NeonRegister<int64_t> reinterpretAsInt (NeonRegister x);
179 // clang-format on
180};
181
182template <>
183struct NeonRegister<int32_t>
184{
185 static constexpr size_t numElements = 4;
186
187 using NativeType = int32x4_t;
188 int32x4_t value;
189
190 //==============================================================================
191 // Loading
192 // clang-format off
193 static NeonRegister load (const int32_t* d) { return { vld1q_s32 (d) }; }
194 static NeonRegister broadcast (int32_t x) { return { vdupq_n_s32 (x) }; }
195
196 //==============================================================================
197 // Storing
198 void store (int32_t* d) const { vst1q_s32 (d, value); }
199
200 //==============================================================================
201 // Bit Operations
202 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vandq_s32 (a.value, b.value) }; }
203 static NeonRegister bitwiseOr (NeonRegister a, NeonRegister b) { return { vorrq_s32 (a.value, b.value) }; }
204
205 //==============================================================================
206 // Math
207 static NeonRegister abs (NeonRegister x) { return { vabsq_s32 (x.value) }; }
208 static NeonRegister mul (NeonRegister a, NeonRegister b) { return { vmulq_s32 (a.value, b.value) }; }
209 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_s32 (a.value, b.value) }; }
210 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_s32 (a.value, b.value) }; }
211 static NeonRegister max (NeonRegister a, NeonRegister b) { return { vmaxq_s32 (a.value, b.value) }; }
212 static NeonRegister min (NeonRegister a, NeonRegister b) { return { vminq_s32 (a.value, b.value) }; }
213
214 //==============================================================================
215 // Type conversion
216 static NeonRegister<float> convertToFp (NeonRegister x) { return { vcvtq_f32_s32 (x.value) }; }
217 static NeonRegister<float> reinterpretAsFp (NeonRegister x) { return { vreinterpretq_f32_s32 (x.value) }; }
218 // clang-format on
219};
220
221template <>
222struct NeonRegister<uint32_t>
223{
224 static constexpr size_t numElements = 4;
225
226 using NativeType = uint32x4_t;
227 uint32x4_t value;
228
229 //==============================================================================
230 // Loading
231 // clang-format off
232 static NeonRegister load (const uint32_t* d) { return { vld1q_u32 (d) }; }
233 static NeonRegister broadcast (uint32_t x) { return { vdupq_n_u32 (x) }; }
234
235 //==============================================================================
236 // Storing
237 void store (uint32_t* d) const { vst1q_u32 (d, value); }
238
239 //==============================================================================
240 // Bit Operations
241 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vandq_u32 (a.value, b.value) }; }
242 static NeonRegister bitwiseOr (NeonRegister a, NeonRegister b) { return { vorrq_u32 (a.value, b.value) }; }
243
244 //==============================================================================
245 // Math
246 static NeonRegister mul (NeonRegister a, NeonRegister b) { return { vmulq_u32 (a.value, b.value) }; }
247 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_u32 (a.value, b.value) }; }
248 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_u32 (a.value, b.value) }; }
249 static NeonRegister max (NeonRegister a, NeonRegister b) { return { vmaxq_u32 (a.value, b.value) }; }
250 static NeonRegister min (NeonRegister a, NeonRegister b) { return { vminq_u32 (a.value, b.value) }; }
251 // clang-format on
252};
253
254template <>
255struct NeonRegister<int64_t>
256{
257 static constexpr size_t numElements = 2;
258
259 using NativeType = int64x2_t;
260 int64x2_t value;
261
262 //==============================================================================
263 // Loading
264 // clang-format off
265 static NeonRegister load (const int64_t* d) { return { vld1q_s64 (d) }; }
266 static NeonRegister broadcast (int64_t x) { return { vdupq_n_s64 (x) }; }
267
268 //==============================================================================
269 // Storing
270 void store (int64_t* d) const { vst1q_s64 (d, value); }
271
272 //==============================================================================
273 // Bit Operations
274 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vandq_s64 (a.value, b.value) }; }
275 static NeonRegister bitwiseOr (NeonRegister a, NeonRegister b) { return { vorrq_s64 (a.value, b.value) }; }
276
277 //==============================================================================
278 // Math
279 static NeonRegister abs (NeonRegister x) { return { vabsq_s64 (x.value) }; }
280 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_s64 (a.value, b.value) }; }
281 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_s64 (a.value, b.value) }; }
282
283 //==============================================================================
284 // Type conversion
285 static NeonRegister<double> convertToFp (NeonRegister x) { return { vcvtq_f64_s64 (x.value) }; }
286 static NeonRegister<double> reinterpretAsFp (NeonRegister x) { return { vreinterpretq_f64_s64 (x.value) }; }
287 // clang-format on
288};
289
290template <>
291struct NeonRegister<uint64_t>
292{
293 static constexpr size_t numElements = 2;
294
295 using NativeType = uint64x2_t;
296 uint64x2_t value;
297
298 //==============================================================================
299 // Loading
300 // clang-format off
301 static NeonRegister load (const uint64_t* d) { return { vld1q_u64 (d) }; }
302 static NeonRegister broadcast (uint64_t x) { return { vdupq_n_u64 (x) }; }
303
304 //==============================================================================
305 // Storing
306 void store (uint64_t* d) const { vst1q_u64 (d, value); }
307
308 //==============================================================================
309 // Bit Operations
310 static NeonRegister bitwiseAnd (NeonRegister a, NeonRegister b) { return { vandq_u64 (a.value, b.value) }; }
311 static NeonRegister bitwiseOr (NeonRegister a, NeonRegister b) { return { vorrq_u64 (a.value, b.value) }; }
312
313 //==============================================================================
314 // Math
315 static NeonRegister add (NeonRegister a, NeonRegister b) { return { vaddq_u64 (a.value, b.value) }; }
316 static NeonRegister sub (NeonRegister a, NeonRegister b) { return { vsubq_u64 (a.value, b.value) }; }
317 // clang-format on
318};
319
320inline NeonRegister<int32_t> NeonRegister<float>::convertToInt (NeonRegister<float> x) { return { vcvtq_s32_f32 (x.value) }; }
321inline NeonRegister<int32_t> NeonRegister<float>::reinterpretAsInt (NeonRegister<float> x) { return { vreinterpretq_s32_f32 (x.value) }; }
322inline NeonRegister<int64_t> NeonRegister<double>::convertToInt (NeonRegister<double> x) { return { vcvtq_s64_f64 (x.value) }; }
323inline NeonRegister<int64_t> NeonRegister<double>::reinterpretAsInt (NeonRegister<double> x) { return { vreinterpretq_s64_f64 (x.value) }; }
324
#if VCTR_APPLE
/** Integer-exponent pow overload: forwards the float bases x and the int32 exponents y to vipowf. */
inline NeonRegister<float> NeonRegister<float>::pow (NeonRegister<float> x, NeonRegister<int32_t> y)
{
    return NeonRegister<float> { vipowf (x.value, y.value) };
}
#endif
328
329#endif
330
331} // namespace vctr
constexpr ExpressionChainBuilder< expressions::Log10 > log10
Computes the logarithm to the base of ten of the source values.
Definition: Log10.h:84
constexpr ExpressionChainBuilder< expressions::Sin > sin
Computes the sine of each source element.
Definition: Sin.h:90
constexpr ExpressionChainBuilder< expressions::Asinh > asinh
Computes the inverse hyperbolic sine of each source element.
Definition: Asinh.h:90
constexpr ExpressionChainBuilder< expressions::Exp > exp
Computes e (Euler's number, 2.7182818...) raised to the power of each source vector element.
Definition: Exp.h:104
constexpr ExpressionChainBuilder< expressions::Acosh > acosh
Computes the inverse hyperbolic cosine of each source element.
Definition: Acosh.h:90
constexpr ExpressionChainBuilder< expressions::Cosh > cosh
Computes the hyperbolic cosine of each source element.
Definition: Cosh.h:90
constexpr ExpressionChainBuilder< expressions::Cos > cos
Computes the cosine of each source element.
Definition: Cos.h:90
constexpr ExpressionChainBuilder< expressions::Tan > tan
Computes the tangent of each source element.
Definition: Tan.h:90
constexpr ExpressionChainBuilder< expressions::Atanh > atanh
Computes the inverse hyperbolic tangent of each source element.
Definition: Atanh.h:90
constexpr auto pow(SrcBaseType &&bases, SrcExpType &&exponents)
Returns an expression that raises the elements in bases element-wise to the power of the elements in ...
Definition: Pow.h:213
constexpr ExpressionChainBuilder< expressions::Max > max
Computes the maximum value of the source values.
Definition: Max.h:198
constexpr ExpressionChainBuilder< expressions::Abs > abs
Computes the absolute value of the source values.
Definition: Abs.h:135
constexpr ExpressionChainBuilder< expressions::Log2 > log2
Computes the logarithm to the base of two of the source values.
Definition: Log2.h:91
constexpr ExpressionChainBuilder< expressions::Sinh > sinh
Computes the hyperbolic sine of each source element.
Definition: Sinh.h:90
constexpr ExpressionChainBuilder< expressions::Min > min
Computes the minimum value of the source values.
Definition: Min.h:198
constexpr ExpressionChainBuilder< expressions::Tanh > tanh
Computes the hyperbolic tangent of each source element.
Definition: Tanh.h:90
The main namespace of the VCTR project.
Definition: Array.h:24
CompareOp
Possible types of (SIMD) compare operations.
Definition: CompareOp.h:41
Definition: NeonRegister.h:28