VCTR
Loading...
Searching...
No Matches
SSERegister.h
1/*
2 ==============================================================================
3 DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
5 Copyright 2022- by sonible GmbH.
6
7 This file is part of VCTR - Versatile Container Templates Reconceptualized.
8
9 VCTR is free software: you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License version 3
11 only, as published by the Free Software Foundation.
12
13 VCTR is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Lesser General Public License version 3 for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 version 3 along with VCTR. If not, see <https://www.gnu.org/licenses/>.
20 ==============================================================================
21*/
22
23namespace vctr
24{
25
/** Primary SSERegister template, used for every element type T that has no
    dedicated specialization.

    It carries no data and only offers a broadcast function that ignores its
    argument and returns an empty, value-initialised instance.
 */
template <class T>
struct SSERegister
{
    /** Returns an empty placeholder register; the passed value is not used. */
    static constexpr SSERegister broadcast (const T&) { return {}; }
};
31
32#if VCTR_X64
33
34template <>
35struct SSERegister<float>
36{
37 static constexpr size_t numElements = 4;
38
39 using NativeType = __m128;
40 __m128 value;
41
42 //==============================================================================
43 // Loading
44 // clang-format off
45 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const float* d) { return { _mm_loadu_ps (d) }; }
46 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const float* d) { return { _mm_load_ps (d) }; }
47 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (float x) { return { _mm_load1_ps (&x) }; }
48
49 //==============================================================================
50 // Storing
51 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (float* d) const { _mm_storeu_ps (d, value); }
52 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (float* d) const { _mm_store_ps (d, value); }
53
54 //==============================================================================
55 // Bit Operations
57 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseAndNot (SSERegister a, SSERegister b) { return { _mm_andnot_ps (b.value, a.value) }; }
58
59 //==============================================================================
60 // Math
61 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister floor (SSERegister x) { return { _mm_floor_ps (x.value) }; }
62 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister ceil (SSERegister x) { return { _mm_ceil_ps (x.value) }; }
63 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister mul (SSERegister a, SSERegister b) { return { _mm_mul_ps (a.value, b.value) }; }
64 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister div (SSERegister a, SSERegister b) { return { _mm_div_ps (a.value, b.value) }; }
65 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_ps (a.value, b.value) }; }
66 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_ps (a.value, b.value) }; }
67 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister max (SSERegister a, SSERegister b) { return { _mm_max_ps (a.value, b.value) }; }
68 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister min (SSERegister a, SSERegister b) { return { _mm_min_ps (a.value, b.value) }; }
69
70#if VCTR_APPLE
71 // The Apple Accelerate vfp function collection contains some optimised math functions that can be directly called
72 // on 128 bit float vectors and which are therefore compatible to __m128 arguments.
73 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp (SSERegister x) { return { vexpf (x.value) }; }
74 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp2 (SSERegister x) { return { vexp2f (x.value) }; }
75 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister expm1 (SSERegister x) { return { vexpm1f (x.value) }; }
76 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log (SSERegister x) { return { vlogf (x.value) }; }
77 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log1p (SSERegister x) { return { vlog1pf (x.value) }; }
78 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log10 (SSERegister x) { return { vlog10f (x.value) }; }
79 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister logb (SSERegister x) { return { vlogbf (x.value) }; }
80 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log2 (SSERegister x) { return { vlog2f (x.value) }; }
81 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sin (SSERegister x) { return { vsinf (x.value) }; }
82 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cos (SSERegister x) { return { vcosf (x.value) }; }
83 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tan (SSERegister x) { return { vtanf (x.value) }; }
84 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sinh (SSERegister x) { return { vsinhf (x.value) }; }
85 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cosh (SSERegister x) { return { vcoshf (x.value) }; }
86 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tanh (SSERegister x) { return { vtanhf (x.value) }; }
87 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister asinh (SSERegister x) { return { vasinhf (x.value) }; }
88 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister acosh (SSERegister x) { return { vacoshf (x.value) }; }
89 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister atanh (SSERegister x) { return { vatanhf (x.value) }; }
90 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister pow (SSERegister x, SSERegister y) { return { vpowf (x.value, y.value) }; }
91 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister pow (SSERegister x, SSERegister<int32_t> y);
92#elif VCTR_HAS_SVML
93 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp (SSERegister x) { return { _mm_exp_ps (x.value) }; }
94 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp2 (SSERegister x) { return { _mm_exp2_ps (x.value) }; }
95 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister expm1 (SSERegister x) { return { _mm_expm1_ps (x.value) }; }
96 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log (SSERegister x) { return { _mm_log_ps (x.value) }; }
97 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log1p (SSERegister x) { return { _mm_log1p_ps (x.value) }; }
98 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log10 (SSERegister x) { return { _mm_log10_ps (x.value) }; }
99 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister logb (SSERegister x) { return { _mm_logb_ps (x.value) }; }
100 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log2 (SSERegister x) { return { _mm_log2_ps (x.value) }; }
101 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sin (SSERegister x) { return { _mm_sin_ps (x.value) }; }
102 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cos (SSERegister x) { return { _mm_cos_ps (x.value) }; }
103 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tan (SSERegister x) { return { _mm_tan_ps (x.value) }; }
104 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sinh (SSERegister x) { return { _mm_sinh_ps (x.value) }; }
105 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cosh (SSERegister x) { return { _mm_cosh_ps (x.value) }; }
106 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tanh (SSERegister x) { return { _mm_tanh_ps (x.value) }; }
107 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister asinh (SSERegister x) { return { _mm_asinh_ps (x.value) }; }
108 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister acosh (SSERegister x) { return { _mm_acosh_ps (x.value) }; }
109 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister atanh (SSERegister x) { return { _mm_atanh_ps (x.value) }; }
110 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister pow (SSERegister x, SSERegister y) { return { _mm_pow_ps (x.value, y.value) }; }
111#endif
112
113 //==============================================================================
114 // Type conversion
115 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<int32_t> convertToInt (SSERegister x);
116 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<int32_t> reinterpretAsInt (SSERegister x);
117 // clang-format on
118};
119
120template <>
121struct SSERegister<double>
122{
123 static constexpr size_t numElements = 2;
124
125 using NativeType = __m128d;
126 __m128d value;
127
128 //==============================================================================
129 // Loading
130 // clang-format off
131 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const double* d) { return { _mm_loadu_pd (d) }; }
132 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const double* d) { return { _mm_load_pd (d) }; }
133 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (double x) { return { _mm_load1_pd (&x) }; }
134
135 //==============================================================================
136 // Storing
137 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (double* d) const { _mm_storeu_pd (d, value); }
138 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (double* d) const { _mm_store_pd (d, value); }
139
140 //==============================================================================
141 // Bit Operations
143 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseAndNot (SSERegister a, SSERegister b) { return { _mm_andnot_pd (b.value, a.value) }; }
144
145 //==============================================================================
146 // Math
147 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister floor (SSERegister x) { return { _mm_floor_pd (x.value) }; }
148 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister ceil (SSERegister x) { return { _mm_ceil_pd (x.value) }; }
149 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister mul (SSERegister a, SSERegister b) { return { _mm_mul_pd (a.value, b.value) }; }
150 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister div (SSERegister a, SSERegister b) { return { _mm_div_pd (a.value, b.value) }; }
151 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_pd (a.value, b.value) }; }
152 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_pd (a.value, b.value) }; }
153 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister max (SSERegister a, SSERegister b) { return { _mm_max_pd (a.value, b.value) }; }
154 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister min (SSERegister a, SSERegister b) { return { _mm_min_pd (a.value, b.value) }; }
155
156#if VCTR_HAS_SVML
157 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp (SSERegister x) { return { _mm_exp_pd (x.value) }; }
158 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister exp2 (SSERegister x) { return { _mm_exp2_pd (x.value) }; }
159 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister expm1 (SSERegister x) { return { _mm_expm1_pd (x.value) }; }
160 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log (SSERegister x) { return { _mm_log_pd (x.value) }; }
161 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log1p (SSERegister x) { return { _mm_log1p_pd (x.value) }; }
162 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log10 (SSERegister x) { return { _mm_log10_pd (x.value) }; }
163 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister logb (SSERegister x) { return { _mm_logb_pd (x.value) }; }
164 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister log2 (SSERegister x) { return { _mm_log2_pd (x.value) }; }
165 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sin (SSERegister x) { return { _mm_sin_pd (x.value) }; }
166 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cos (SSERegister x) { return { _mm_cos_pd (x.value) }; }
167 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tan (SSERegister x) { return { _mm_tan_pd (x.value) }; }
168 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sinh (SSERegister x) { return { _mm_sinh_pd (x.value) }; }
169 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister cosh (SSERegister x) { return { _mm_cosh_pd (x.value) }; }
170 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister tanh (SSERegister x) { return { _mm_tanh_pd (x.value) }; }
171 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister asinh (SSERegister x) { return { _mm_asinh_pd (x.value) }; }
172 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister acosh (SSERegister x) { return { _mm_acosh_pd (x.value) }; }
173 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister atanh (SSERegister x) { return { _mm_atanh_pd (x.value) }; }
174 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister pow (SSERegister x, SSERegister y) { return { _mm_pow_pd (x.value, y.value) }; }
175#endif
176
177 //==============================================================================
178 // Type conversion
179 VCTR_FORCEDINLINE VCTR_TARGET ("avx512vl") VCTR_TARGET ("avx512dq") static SSERegister<int64_t> convertToInt (SSERegister x);
180 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<int64_t> reinterpretAsInt (SSERegister x);
181 // clang-format on
182};
183
184template <>
185struct SSERegister<int32_t>
186{
187 static constexpr size_t numElements = 4;
188
189 using NativeType = __m128i;
190 __m128i value;
191
192 //==============================================================================
193 // Loading
194 // clang-format off
195 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const int32_t* d) { return { _mm_loadu_si128 (reinterpret_cast<const __m128i*> (d)) }; }
196 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const int32_t* d) { return { _mm_load_si128 (reinterpret_cast<const __m128i*> (d)) }; }
197 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (int32_t x) { return { _mm_set1_epi32 (x) }; }
198
199 //==============================================================================
200 // Storing
201 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (int32_t* d) const { _mm_storeu_si128 (reinterpret_cast<__m128i*> (d), value); }
202 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (int32_t* d) const { _mm_store_si128 (reinterpret_cast<__m128i*> (d), value); }
203
204 //==============================================================================
205 // Bit Operations
206 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseAnd (SSERegister a, SSERegister b) { return { _mm_castps_si128 (_mm_and_ps (_mm_castsi128_ps (a.value), _mm_castsi128_ps (b.value))) }; }
207 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseOr (SSERegister a, SSERegister b) { return { _mm_castps_si128 (_mm_or_ps (_mm_castsi128_ps (a.value), _mm_castsi128_ps (b.value))) }; }
208
209 //==============================================================================
210 // Math
211 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister abs (SSERegister x) { return { _mm_abs_epi32 (x.value) }; }
212 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_epi32 (a.value, b.value) }; }
213 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_epi32 (a.value, b.value) }; }
214 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister max (SSERegister a, SSERegister b) { return { _mm_max_epi32 (a.value, b.value) }; }
215 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister min (SSERegister a, SSERegister b) { return { _mm_min_epi32 (a.value, b.value) }; }
216
217 //==============================================================================
218 // Type conversion
219 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<float> convertToFp (SSERegister x) { return { _mm_cvtepi32_ps (x.value) }; }
220 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<float> reinterpretAsFp (SSERegister x) { return { _mm_castsi128_ps (x.value) }; }
221 // clang-format on
222};
223
224template <>
225struct SSERegister<uint32_t>
226{
227 static constexpr size_t numElements = 4;
228
229 using NativeType = __m128i;
230 __m128i value;
231
232 //==============================================================================
233 // Loading
234 // clang-format off
235 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const uint32_t* d) { return { _mm_loadu_si128 (reinterpret_cast<const __m128i*> (d)) }; }
236 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const uint32_t* d) { return { _mm_load_si128 (reinterpret_cast<const __m128i*> (d)) }; }
237 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (uint32_t x) { return { _mm_set1_epi32 ((int32_t) x) }; }
238
239 //==============================================================================
240 // Storing
241 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (uint32_t* d) const { _mm_storeu_si128 (reinterpret_cast<__m128i*> (d), value); }
242 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (uint32_t* d) const { _mm_store_si128 (reinterpret_cast<__m128i*> (d), value); }
243
244 //==============================================================================
245 // Bit Operations
246
247 //==============================================================================
248 // Math
249 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_epi32 (a.value, b.value) }; }
250 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_epi32 (a.value, b.value) }; }
251 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister max (SSERegister a, SSERegister b) { return { _mm_max_epu32 (a.value, b.value) }; }
252 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister min (SSERegister a, SSERegister b) { return { _mm_min_epu32 (a.value, b.value) }; }
253 // clang-format on
254};
255
256template <>
257struct SSERegister<int64_t>
258{
259 static constexpr size_t numElements = 2;
260
261 using NativeType = __m128i;
262 __m128i value;
263
264 //==============================================================================
265 // Loading
266 // clang-format off
267 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const int64_t* d) { return { _mm_loadu_si128 (reinterpret_cast<const __m128i*> (d)) }; }
268 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const int64_t* d) { return { _mm_load_si128 (reinterpret_cast<const __m128i*> (d)) }; }
269 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (int64_t x) { return { _mm_set1_epi64x (x) }; }
270
271 //==============================================================================
272 // Storing
273 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (int64_t* d) const { _mm_storeu_si128 (reinterpret_cast<__m128i*> (d), value); }
274 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (int64_t* d) const { _mm_store_si128 (reinterpret_cast<__m128i*> (d), value); }
275
276 //==============================================================================
277 // Bit Operations
278 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseAnd (SSERegister a, SSERegister b) { return { _mm_castpd_si128 (_mm_and_pd (_mm_castsi128_pd (a.value), _mm_castsi128_pd (b.value))) }; }
279 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister bitwiseOr (SSERegister a, SSERegister b) { return { _mm_castpd_si128 (_mm_or_pd (_mm_castsi128_pd (a.value), _mm_castsi128_pd (b.value))) }; }
280
281 //==============================================================================
282 // Math
283 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_epi64 (a.value, b.value) }; }
284 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_epi64 (a.value, b.value) }; }
285
286 //==============================================================================
287 // Type conversion
288 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<double> convertToFp (SSERegister x) { return { _mm_cvtepi64_pd (x.value) }; }
289 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister<double> reinterpretAsFp (SSERegister x) { return { _mm_castsi128_pd (x.value) }; }
290 // clang-format on
291};
292
293template <>
294struct SSERegister<uint64_t>
295{
296 static constexpr size_t numElements = 2;
297
298 using NativeType = __m128i;
299 __m128i value;
300
301 //==============================================================================
302 // Loading
303 // clang-format off
304 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadUnaligned (const uint64_t* d) { return { _mm_loadu_si128 (reinterpret_cast<const __m128i*> (d)) }; }
305 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister loadAligned (const uint64_t* d) { return { _mm_load_si128 (reinterpret_cast<const __m128i*> (d)) }; }
306 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister broadcast (uint64_t x) { return { _mm_set1_epi64x ((int64_t) x) }; }
307
308 //==============================================================================
309 // Storing
310 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeUnaligned (uint64_t* d) const { _mm_storeu_si128 (reinterpret_cast<__m128i*> (d), value); }
311 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") void storeAligned (uint64_t* d) const { _mm_store_si128 (reinterpret_cast<__m128i*> (d), value); }
312
313 //==============================================================================
314 // Bit Operations
315
316 //==============================================================================
317 // Math
318 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister add (SSERegister a, SSERegister b) { return { _mm_add_epi64 (a.value, b.value) }; }
319 VCTR_FORCEDINLINE VCTR_TARGET ("sse4.1") static SSERegister sub (SSERegister a, SSERegister b) { return { _mm_sub_epi64 (a.value, b.value) }; }
320 // clang-format on
321};
322
323inline SSERegister<int32_t> SSERegister<float>::convertToInt (SSERegister x) { return { _mm_cvtps_epi32 (x.value) }; }
324inline SSERegister<int32_t> SSERegister<float>::reinterpretAsInt (SSERegister x) { return { _mm_castps_si128 (x.value) }; }
325inline SSERegister<int64_t> SSERegister<double>::convertToInt (SSERegister x) { return { _mm_cvtpd_epi64 (x.value) }; }
326inline SSERegister<int64_t> SSERegister<double>::reinterpretAsInt (SSERegister x) { return { _mm_castpd_si128 (x.value) }; }
327
328
#if VCTR_APPLE
/** Apple-only pow overload taking int32_t exponents; forwards both registers to vipowf. */
inline SSERegister<float> SSERegister<float>::pow (SSERegister<float> base, SSERegister<int32_t> exponent)
{
    return { vipowf (base.value, exponent.value) };
}
#endif
332
333#endif
334
335} // namespace vctr
constexpr ExpressionChainBuilder< expressions::Log10 > log10
Computes the logarithm to the base of ten of the source values.
Definition: Log10.h:84
constexpr ExpressionChainBuilder< expressions::Sin > sin
Computes the sine of each source element.
Definition: Sin.h:90
constexpr ExpressionChainBuilder< expressions::Asinh > asinh
Computes the inverse hyperbolic sine of each source element.
Definition: Asinh.h:90
constexpr ExpressionChainBuilder< expressions::Exp > exp
Computes e (Euler's number, 2.7182818...) raised to the power of each source element.
Definition: Exp.h:104
constexpr ExpressionChainBuilder< expressions::Acosh > acosh
Computes the inverse hyperbolic cosine of each source element.
Definition: Acosh.h:90
constexpr ExpressionChainBuilder< expressions::Cosh > cosh
Computes the hyperbolic cosine of each source element.
Definition: Cosh.h:90
constexpr ExpressionChainBuilder< expressions::Cos > cos
Computes the cosine of each source element.
Definition: Cos.h:90
constexpr ExpressionChainBuilder< expressions::Tan > tan
Computes the tangent of each source element.
Definition: Tan.h:90
constexpr ExpressionChainBuilder< expressions::Atanh > atanh
Computes the inverse hyperbolic tangent of each source element.
Definition: Atanh.h:90
constexpr auto pow(SrcBaseType &&bases, SrcExpType &&exponents)
Returns an expression that raises the elements in bases element-wise to the power of the elements in exponents.
Definition: Pow.h:213
constexpr ExpressionChainBuilder< expressions::Max > max
Computes the maximum value of the source values.
Definition: Max.h:198
constexpr ExpressionChainBuilder< expressions::Abs > abs
Computes the absolute value of the source values.
Definition: Abs.h:135
constexpr ExpressionChainBuilder< expressions::Log2 > log2
Computes the logarithm to the base of two of the source values.
Definition: Log2.h:91
constexpr ExpressionChainBuilder< expressions::Sinh > sinh
Computes the hyperbolic sine of each source element.
Definition: Sinh.h:90
constexpr ExpressionChainBuilder< expressions::Min > min
Computes the minimum value of the source values.
Definition: Min.h:198
constexpr ExpressionChainBuilder< expressions::Tanh > tanh
Computes the hyperbolic tangent of each source element.
Definition: Tanh.h:90
The main namespace of the VCTR project.
Definition: Array.h:24
Definition: SSERegister.h:28