VCTR
Loading...
Searching...
No Matches
AVXRegister.h
1/*
2 ==============================================================================
3 DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
5 Copyright 2022- by sonible GmbH.
6
7 This file is part of VCTR - Versatile Container Templates Reconceptualized.
8
9 VCTR is free software: you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License version 3
11 only, as published by the Free Software Foundation.
12
13 VCTR is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Lesser General Public License version 3 for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 version 3 along with VCTR. If not, see <https://www.gnu.org/licenses/>.
20 ==============================================================================
21*/
22
23namespace vctr
24{
25
/** Primary AVXRegister template, used for every element type without an explicit
    specialisation.

    It carries no data and offers only a constexpr broadcast that ignores its argument and
    returns a value-initialised instance. Presumably this exists so that generic code which
    names AVXRegister<T>::broadcast still compiles for unsupported types - verify against
    the call sites.
 */
template <class T>
struct AVXRegister
{
    static constexpr AVXRegister broadcast (const T&) { return {}; }
};
31
32#if VCTR_X64
33template <>
34struct AVXRegister<float>
35{
36 static constexpr size_t numElements = 8;
37
38 using NativeType = __m256;
39 __m256 value;
40
41 //==============================================================================
42 // Loading
43 // clang-format off
44 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const float* d) { return { _mm256_loadu_ps (d) }; }
45 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const float* d) { return { _mm256_load_ps (d) }; }
46 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (float x) { return { _mm256_broadcast_ss (&x) }; }
47 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<float> a, SSERegister<float> b) { return { _mm256_set_m128 (a.value, b.value) }; }
48
49 //==============================================================================
50 // Storing
51 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (float* d) const { _mm256_storeu_ps (d, value); }
52 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (float* d) const { _mm256_store_ps (d, value); }
53
54 //==============================================================================
55 // Generate Compare Masks
56 template <CompareOp Op>
57 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister compare (AVXRegister a, AVXRegister b) { return { _mm256_cmp_ps (a.value, b.value, int (Op)) }; }
58
59 //==============================================================================
60 // Bit Operations
62 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAndNot (AVXRegister a, AVXRegister b) { return { _mm256_andnot_ps (b.value, a.value) }; }
63 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) { return { _mm256_and_ps (a.value, b.value) }; }
64 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseBlend (AVXRegister a, AVXRegister b, AVXRegister mask) { return { _mm256_blendv_ps (a.value, b.value, mask.value) }; }
65
66 //==============================================================================
67 // Math
68 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister floor (AVXRegister x) { return { _mm256_floor_ps (x.value) }; }
69 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister ceil (AVXRegister x) { return { _mm256_ceil_ps (x.value) }; }
70 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister mul (AVXRegister a, AVXRegister b) { return { _mm256_mul_ps (a.value, b.value) }; }
71 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_ps (a.value, b.value) }; }
72 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_ps (a.value, b.value) }; }
73 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister div (AVXRegister a, AVXRegister b) { return { _mm256_div_ps (a.value, b.value) }; }
74 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister max (AVXRegister a, AVXRegister b) { return { _mm256_max_ps (a.value, b.value) }; }
75 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister min (AVXRegister a, AVXRegister b) { return { _mm256_min_ps (a.value, b.value) }; }
76 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister fma (AVXRegister a, AVXRegister b, AVXRegister c) { return { _mm256_fmadd_ps (a.value, b.value, c.value) }; }
77 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister fms (AVXRegister a, AVXRegister b, AVXRegister c) { return { _mm256_fnmadd_ps (a.value, b.value, c.value) }; }
78
79#if VCTR_HAS_SVML
80 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister exp (AVXRegister x) { return {_mm256_exp_ps (x.value) }; }
81 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister exp2 (AVXRegister x) { return {_mm256_exp2_ps (x.value) }; }
82 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister expm1 (AVXRegister x) { return {_mm256_expm1_ps (x.value) }; }
83 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log (AVXRegister x) { return {_mm256_log_ps (x.value) }; }
84 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log1p (AVXRegister x) { return {_mm256_log1p_ps (x.value) }; }
85 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log10 (AVXRegister x) { return {_mm256_log10_ps (x.value) }; }
86 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister logb (AVXRegister x) { return {_mm256_logb_ps (x.value) }; }
87 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log2 (AVXRegister x) { return {_mm256_log2_ps (x.value) }; }
88 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister sin (AVXRegister x) { return {_mm256_sin_ps (x.value) }; }
89 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister cos (AVXRegister x) { return {_mm256_cos_ps (x.value) }; }
90 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister tan (AVXRegister x) { return {_mm256_tan_ps (x.value) }; }
91 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister sinh (AVXRegister x) { return {_mm256_sinh_ps (x.value) }; }
92 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister cosh (AVXRegister x) { return {_mm256_cosh_ps (x.value) }; }
93 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister tanh (AVXRegister x) { return {_mm256_tanh_ps (x.value) }; }
94 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister asinh (AVXRegister x) { return {_mm256_asinh_ps (x.value) }; }
95 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister acosh (AVXRegister x) { return {_mm256_acosh_ps (x.value) }; }
96 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister atanh (AVXRegister x) { return {_mm256_atanh_ps (x.value) }; }
97 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister pow (AVXRegister x, AVXRegister y) { return {_mm256_pow_ps (x.value, y.value) }; }
98#endif
99
100 //==============================================================================
101 // Type conversion
102 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<int32_t> convertToInt (AVXRegister x);
103 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<int32_t> reinterpretAsInt (AVXRegister x);
104 // clang-format on
105};
106
107template <>
108struct AVXRegister<double>
109{
110 static constexpr size_t numElements = 4;
111
112 using NativeType = __m256d;
113 __m256d value;
114
115 //==============================================================================
116 // Loading
117 // clang-format off
118 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const double* d) { return { _mm256_loadu_pd (d) }; }
119 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const double* d) { return { _mm256_load_pd (d) }; }
120 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (double x) { return { _mm256_broadcast_sd (&x) }; }
121 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<double> a, SSERegister<double> b) { return { _mm256_set_m128d (a.value, b.value) }; }
122
123 //==============================================================================
124 // Storing
125 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (double* d) const { _mm256_storeu_pd (d, value); }
126 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (double* d) const { _mm256_store_pd (d, value); }
127
128 //==============================================================================
129 // Generate Compare Masks
130 template <CompareOp Op>
131 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister compare (AVXRegister a, AVXRegister b) { return { _mm256_cmp_pd (a.value, b.value, int (Op)) }; }
132
133 //==============================================================================
134 // Bit Operations
136 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAndNot (AVXRegister a, AVXRegister b) { return { _mm256_andnot_pd (b.value, a.value) }; }
137 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) { return { _mm256_and_pd (a.value, b.value) }; }
138 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseBlend (AVXRegister a, AVXRegister b, AVXRegister mask) { return { _mm256_blendv_pd (a.value, b.value, mask.value) }; }
139
140 //==============================================================================
141 // Math
142 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister floor (AVXRegister x) { return { _mm256_floor_pd (x.value) }; }
143 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister ceil (AVXRegister x) { return { _mm256_ceil_pd (x.value) }; }
144 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister mul (AVXRegister a, AVXRegister b) { return { _mm256_mul_pd (a.value, b.value) }; }
145 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_pd (a.value, b.value) }; }
146 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_pd (a.value, b.value) }; }
147 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister div (AVXRegister a, AVXRegister b) { return { _mm256_div_pd (a.value, b.value) }; }
148 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister max (AVXRegister a, AVXRegister b) { return { _mm256_max_pd (a.value, b.value) }; }
149 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister min (AVXRegister a, AVXRegister b) { return { _mm256_min_pd (a.value, b.value) }; }
150 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister fma (AVXRegister a, AVXRegister b, AVXRegister c) { return { _mm256_fmadd_pd (a.value, b.value, c.value) }; }
151 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister fms (AVXRegister a, AVXRegister b, AVXRegister c) { return { _mm256_fnmadd_pd (a.value, b.value, c.value) }; }
152
153#if VCTR_HAS_SVML
154 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister exp (AVXRegister x) { return {_mm256_exp_pd (x.value) }; }
155 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister exp2 (AVXRegister x) { return {_mm256_exp2_pd (x.value) }; }
156 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister expm1 (AVXRegister x) { return {_mm256_expm1_pd (x.value) }; }
157 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log (AVXRegister x) { return {_mm256_log_pd (x.value) }; }
158 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log1p (AVXRegister x) { return {_mm256_log1p_pd (x.value) }; }
159 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log10 (AVXRegister x) { return {_mm256_log10_pd (x.value) }; }
160 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister logb (AVXRegister x) { return {_mm256_logb_pd (x.value) }; }
161 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister log2 (AVXRegister x) { return {_mm256_log2_pd (x.value) }; }
162 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister sin (AVXRegister x) { return {_mm256_sin_pd (x.value) }; }
163 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister cos (AVXRegister x) { return {_mm256_cos_pd (x.value) }; }
164 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister tan (AVXRegister x) { return {_mm256_tan_pd (x.value) }; }
165 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister sinh (AVXRegister x) { return {_mm256_sinh_pd (x.value) }; }
166 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister cosh (AVXRegister x) { return {_mm256_cosh_pd (x.value) }; }
167 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister tanh (AVXRegister x) { return {_mm256_tanh_pd (x.value) }; }
168 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister asinh (AVXRegister x) { return {_mm256_asinh_pd (x.value) }; }
169 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister acosh (AVXRegister x) { return {_mm256_acosh_pd (x.value) }; }
170 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister atanh (AVXRegister x) { return {_mm256_atanh_pd (x.value) }; }
171 VCTR_FORCEDINLINE VCTR_TARGET ("fma") static AVXRegister pow (AVXRegister x, AVXRegister y) { return {_mm256_pow_pd (x.value, y.value) }; }
172#endif
173
174 //==============================================================================
175 // Type conversion
176 VCTR_FORCEDINLINE VCTR_TARGET ("avx512vl") VCTR_TARGET ("avx512dq") static AVXRegister<int64_t> convertToInt (AVXRegister x);
177 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<int64_t> reinterpretAsInt (AVXRegister x);
178 // clang-format on
179};
180
181template <>
182struct AVXRegister<int32_t>
183{
184 static constexpr size_t numElements = 8;
185
186 using NativeType = __m256i;
187 __m256i value;
188
189 //==============================================================================
190 // Loading
191 // clang-format off
192 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const int32_t* d) { return { _mm256_loadu_si256 (reinterpret_cast<const __m256i*> (d)) }; }
193 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const int32_t* d) { return { _mm256_load_si256 (reinterpret_cast<const __m256i*> (d)) }; }
194 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (int32_t x) { return { _mm256_set1_epi32 (x) }; }
195 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<int32_t> a, SSERegister<int32_t> b) { return { _mm256_set_m128i (a.value, b.value) }; }
196
197 //==============================================================================
198 // Storing
199 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (int32_t* d) const { _mm256_storeu_si256 (reinterpret_cast<__m256i*> (d), value); }
200 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (int32_t* d) const { _mm256_store_si256 (reinterpret_cast<__m256i*> (d), value); }
201
202 //==============================================================================
203 // Bit Operations
204 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) { return { _mm256_and_si256 (a.value, b.value) }; }
205 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister bitwiseOr (AVXRegister a, AVXRegister b) { return { _mm256_or_si256 (a.value, b.value) }; }
206 // These are non AVX2 variants that might be used in functions that are not targeted AVX2 at the expense of slightly worse performance
207 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAndLegacy (AVXRegister a, AVXRegister b) { return { _mm256_castps_si256 (_mm256_and_ps (_mm256_castsi256_ps (a.value), _mm256_castsi256_ps (b.value))) }; }
208 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseOrLegacy (AVXRegister a, AVXRegister b) { return { _mm256_castps_si256 (_mm256_or_ps (_mm256_castsi256_ps (a.value), _mm256_castsi256_ps (b.value))) }; }
209
210
211 //==============================================================================
212 // Math
213 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister abs (AVXRegister x) { return { _mm256_abs_epi32 (x.value) }; }
214 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_epi32 (a.value, b.value) }; }
215 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_epi32 (a.value, b.value) }; }
216 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister max (AVXRegister a, AVXRegister b) { return { _mm256_max_epi32 (a.value, b.value) }; }
217 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister min (AVXRegister a, AVXRegister b) { return { _mm256_min_epi32 (a.value, b.value) }; }
218
219 //==============================================================================
220 // Type conversion
221 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<float> convertToFp (AVXRegister x) { return { _mm256_cvtepi32_ps (x.value) }; }
222 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<float> reinterpretAsFp (AVXRegister x) { return { _mm256_castsi256_ps (x.value) }; }
223 // clang-format on
224};
225
226template <>
227struct AVXRegister<uint32_t>
228{
229 static constexpr size_t numElements = 8;
230
231 using NativeType = __m256i;
232 __m256i value;
233
234 //==============================================================================
235 // Loading
236 // clang-format off
237 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const uint32_t* d) { return { _mm256_loadu_si256 (reinterpret_cast<const __m256i*> (d)) }; }
238 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const uint32_t* d) { return { _mm256_load_si256 (reinterpret_cast<const __m256i*> (d)) }; }
239 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (uint32_t x) { return { _mm256_set1_epi32 ((int32_t) x) }; }
240 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<uint32_t> a, SSERegister<uint32_t> b) { return { _mm256_set_m128i (a.value, b.value) }; }
241
242 //==============================================================================
243 // Storing
244 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (uint32_t* d) const { _mm256_storeu_si256 (reinterpret_cast<__m256i*> (d), value); }
245 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (uint32_t* d) const { _mm256_store_si256 (reinterpret_cast<__m256i*> (d), value); }
246
247 //==============================================================================
248 // Bit Operations
249
250 //==============================================================================
251 // Math
252 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_epi32 (a.value, b.value) }; }
253 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_epi32 (a.value, b.value) }; }
254 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister max (AVXRegister a, AVXRegister b) { return { _mm256_max_epu32 (a.value, b.value) }; }
255 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister min (AVXRegister a, AVXRegister b) { return { _mm256_min_epu32 (a.value, b.value) }; }
256 // clang-format on
257};
258
259template <>
260struct AVXRegister<int64_t>
261{
262 static constexpr size_t numElements = 4;
263
264 using NativeType = __m256i;
265 __m256i value;
266
267 //==============================================================================
268 // Loading
269 // clang-format off
270 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const int64_t* d) { return { _mm256_loadu_si256 (reinterpret_cast<const __m256i*> (d)) }; }
271 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const int64_t* d) { return { _mm256_load_si256 (reinterpret_cast<const __m256i*> (d)) }; }
272 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (int64_t x) { return { _mm256_set1_epi64x (x) }; }
273 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<int64_t> a, SSERegister<int64_t> b) { return { _mm256_set_m128i (a.value, b.value) }; }
274
275 //==============================================================================
276 // Storing
277 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (int64_t* d) const { _mm256_storeu_si256 (reinterpret_cast<__m256i*> (d), value); }
278 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (int64_t* d) const { _mm256_store_si256 (reinterpret_cast<__m256i*> (d), value); }
279
280 //==============================================================================
281 // Bit Operations
282 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) { return { _mm256_and_si256 (a.value, b.value) }; }
283 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister bitwiseOr (AVXRegister a, AVXRegister b) { return { _mm256_or_si256 (a.value, b.value) }; }
284 // These are non AVX2 variants that might be used in functions that are not targeted AVX2 at the expense of slightly worse performance
285 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseAndLegacy (AVXRegister a, AVXRegister b) { return { _mm256_castpd_si256 (_mm256_and_pd (_mm256_castsi256_pd (a.value), _mm256_castsi256_pd (b.value))) }; }
286 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister bitwiseOrLegacy (AVXRegister a, AVXRegister b) { return { _mm256_castpd_si256 (_mm256_or_pd (_mm256_castsi256_pd (a.value), _mm256_castsi256_pd (b.value))) }; }
287
288 //==============================================================================
289 // Math
290 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_epi64 (a.value, b.value) }; }
291 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_epi64 (a.value, b.value) }; }
292
293 //==============================================================================
294 // Type conversion
295 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<double> convertToFp (AVXRegister x) { return { _mm256_cvtepi64_pd (x.value) }; }
296 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<double> reinterpretAsFp (AVXRegister x) { return { _mm256_castsi256_pd (x.value) }; }
297 // clang-format on
298};
299
300template <>
301struct AVXRegister<uint64_t>
302{
303 static constexpr size_t numElements = 4;
304
305 using NativeType = __m256i;
306 __m256i value;
307
308 //==============================================================================
309 // Loading
310 // clang-format off
311 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadUnaligned (const uint64_t* d) { return { _mm256_loadu_si256 (reinterpret_cast<const __m256i*> (d)) }; }
312 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister loadAligned (const uint64_t* d) { return { _mm256_load_si256 (reinterpret_cast<const __m256i*> (d)) }; }
313 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister broadcast (uint64_t x) { return { _mm256_set1_epi64x ((int64_t) x) }; }
314 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister fromSSE (SSERegister<uint64_t> a, SSERegister<uint64_t> b) { return { _mm256_set_m128i (a.value, b.value) }; }
315
316 //==============================================================================
317 // Storing
318 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeUnaligned (uint64_t* d) const { _mm256_storeu_si256 (reinterpret_cast<__m256i*> (d), value); }
319 VCTR_FORCEDINLINE VCTR_TARGET ("avx") void storeAligned (uint64_t* d) const { _mm256_store_si256 (reinterpret_cast<__m256i*> (d), value); }
320
321 //==============================================================================
322 // Bit Operations
323
324 //==============================================================================
325 // Math
326 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister add (AVXRegister a, AVXRegister b) { return { _mm256_add_epi64 (a.value, b.value) }; }
327 VCTR_FORCEDINLINE VCTR_TARGET ("avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) { return { _mm256_sub_epi64 (a.value, b.value) }; }
328 // clang-format on
329};
330
331inline AVXRegister<int32_t> AVXRegister<float>::convertToInt (AVXRegister x) { return { _mm256_cvtps_epi32 (x.value) }; }
332inline AVXRegister<int32_t> AVXRegister<float>::reinterpretAsInt (AVXRegister x) { return { _mm256_castps_si256 (x.value) }; }
333inline AVXRegister<int64_t> AVXRegister<double>::convertToInt (AVXRegister x) { return { _mm256_cvtpd_epi64 (x.value) }; }
334inline AVXRegister<int64_t> AVXRegister<double>::reinterpretAsInt (AVXRegister x) { return { _mm256_castpd_si256 (x.value) }; }
335#endif
336
337} // namespace vctr
constexpr ExpressionChainBuilder< expressions::Log10 > log10
Computes the logarithm to the base of ten of the source values.
Definition: Log10.h:84
constexpr ExpressionChainBuilder< expressions::Sin > sin
Computes the sine of each source element.
Definition: Sin.h:90
constexpr ExpressionChainBuilder< expressions::Asinh > asinh
Computes the inverse hyperbolic sine of each source element.
Definition: Asinh.h:90
constexpr ExpressionChainBuilder< expressions::Exp > exp
Computes e (Euler's number, 2.7182818...) raised to the power of each source vector element.
Definition: Exp.h:104
constexpr ExpressionChainBuilder< expressions::Acosh > acosh
Computes the inverse hyperbolic cosine of each source element.
Definition: Acosh.h:90
constexpr ExpressionChainBuilder< expressions::Cosh > cosh
Computes the hyperbolic cosine of each source element.
Definition: Cosh.h:90
constexpr ExpressionChainBuilder< expressions::Cos > cos
Computes the cosine of each source element.
Definition: Cos.h:90
constexpr ExpressionChainBuilder< expressions::Tan > tan
Computes the tangent of each source element.
Definition: Tan.h:90
constexpr ExpressionChainBuilder< expressions::Atanh > atanh
Computes the inverse hyperbolic tangent of each source element.
Definition: Atanh.h:90
constexpr auto pow(SrcBaseType &&bases, SrcExpType &&exponents)
Returns an expression that raises the elements in bases element-wise to the power of the elements in ...
Definition: Pow.h:213
constexpr ExpressionChainBuilder< expressions::Max > max
Computes the maximum value of the source values.
Definition: Max.h:198
constexpr ExpressionChainBuilder< expressions::Abs > abs
Computes the absolute value of the source values.
Definition: Abs.h:135
constexpr ExpressionChainBuilder< expressions::Log2 > log2
Computes the logarithm to the base of two of the source values.
Definition: Log2.h:91
constexpr ExpressionChainBuilder< expressions::Sinh > sinh
Computes the hyperbolic sine of each source element.
Definition: Sinh.h:90
constexpr ExpressionChainBuilder< expressions::Min > min
Computes the minimum value of the source values.
Definition: Min.h:198
constexpr ExpressionChainBuilder< expressions::Tanh > tanh
Computes the hyperbolic tangent of each source element.
Definition: Tanh.h:90
The main namespace of the VCTR project.
Definition: Array.h:24
Definition: AVXRegister.h:28