29 static constexpr AVXRegister broadcast (
const T&) {
return {}; }
/** Number of float lanes held by one 256 bit AVX register. */
static constexpr size_t numElements = 8;

/** The native intrinsic type wrapped by this register. */
using NativeType = __m256;
44 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static
AVXRegister loadUnaligned (const
float* d) {
return { _mm256_loadu_ps (d) }; }
45 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const
float* d) {
return { _mm256_load_ps (d) }; }
46 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (
float x) {
return { _mm256_broadcast_ss (&x) }; }
47 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<
float> a, SSERegister<
float> b) {
return { _mm256_set_m128 (a.value, b.value) }; }
51 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (
float* d)
const { _mm256_storeu_ps (d, value); }
52 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (
float* d)
const { _mm256_store_ps (d, value); }
56 template <CompareOp Op>
57 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister compare (AVXRegister a, AVXRegister b) {
return { _mm256_cmp_ps (a.value, b.value, int (Op)) }; }
62 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAndNot (AVXRegister a, AVXRegister b) {
return { _mm256_andnot_ps (b.value, a.value) }; }
63 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) {
return { _mm256_and_ps (a.value, b.value) }; }
64 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseBlend (AVXRegister a, AVXRegister b, AVXRegister mask) {
return { _mm256_blendv_ps (a.value, b.value, mask.value) }; }
68 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister floor (AVXRegister x) {
return { _mm256_floor_ps (x.value) }; }
69 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister ceil (AVXRegister x) {
return { _mm256_ceil_ps (x.value) }; }
70 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister mul (AVXRegister a, AVXRegister b) {
return { _mm256_mul_ps (a.value, b.value) }; }
71 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_ps (a.value, b.value) }; }
72 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_ps (a.value, b.value) }; }
73 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister div (AVXRegister a, AVXRegister b) {
return { _mm256_div_ps (a.value, b.value) }; }
74 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister
max (AVXRegister a, AVXRegister b) {
return { _mm256_max_ps (a.value, b.value) }; }
75 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister
min (AVXRegister a, AVXRegister b) {
return { _mm256_min_ps (a.value, b.value) }; }
76 VCTR_FORCEDINLINE VCTR_TARGET (
"fma") static AVXRegister fma (AVXRegister a, AVXRegister b, AVXRegister c) {
return { _mm256_fmadd_ps (a.value, b.value, c.value) }; }
77 VCTR_FORCEDINLINE VCTR_TARGET (
"fma") static AVXRegister fms (AVXRegister a, AVXRegister b, AVXRegister c) {
return { _mm256_fnmadd_ps (a.value, b.value, c.value) }; }
81 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister<int32_t> convertToInt (AVXRegister x);
82 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<int32_t> reinterpretAsInt (AVXRegister x);
87struct AVXRegister<
double>
89 static constexpr size_t numElements = 4;
91 using NativeType = __m256d;
97 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadUnaligned (const
double* d) {
return { _mm256_loadu_pd (d) }; }
98 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const
double* d) {
return { _mm256_load_pd (d) }; }
99 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (
double x) {
return { _mm256_broadcast_sd (&x) }; }
100 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<
double> a, SSERegister<
double> b) {
return { _mm256_set_m128d (a.value, b.value) }; }
104 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (
double* d)
const { _mm256_storeu_pd (d, value); }
105 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (
double* d)
const { _mm256_store_pd (d, value); }
109 template <CompareOp Op>
110 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister compare (AVXRegister a, AVXRegister b) {
return { _mm256_cmp_pd (a.value, b.value, int (Op)) }; }
115 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAndNot (AVXRegister a, AVXRegister b) {
return { _mm256_andnot_pd (b.value, a.value) }; }
116 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) {
return { _mm256_and_pd (a.value, b.value) }; }
117 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseBlend (AVXRegister a, AVXRegister b, AVXRegister mask) {
return { _mm256_blendv_pd (a.value, b.value, mask.value) }; }
121 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister floor (AVXRegister x) {
return { _mm256_floor_pd (x.value) }; }
122 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister ceil (AVXRegister x) {
return { _mm256_ceil_pd (x.value) }; }
123 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister mul (AVXRegister a, AVXRegister b) {
return { _mm256_mul_pd (a.value, b.value) }; }
124 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_pd (a.value, b.value) }; }
125 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_pd (a.value, b.value) }; }
126 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister div (AVXRegister a, AVXRegister b) {
return { _mm256_div_pd (a.value, b.value) }; }
127 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister
max (AVXRegister a, AVXRegister b) {
return { _mm256_max_pd (a.value, b.value) }; }
128 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister
min (AVXRegister a, AVXRegister b) {
return { _mm256_min_pd (a.value, b.value) }; }
129 VCTR_FORCEDINLINE VCTR_TARGET (
"fma") static AVXRegister fma (AVXRegister a, AVXRegister b, AVXRegister c) {
return { _mm256_fmadd_pd (a.value, b.value, c.value) }; }
130 VCTR_FORCEDINLINE VCTR_TARGET (
"fma") static AVXRegister fms (AVXRegister a, AVXRegister b, AVXRegister c) {
return { _mm256_fnmadd_pd (a.value, b.value, c.value) }; }
134 VCTR_FORCEDINLINE VCTR_TARGET (
"avx512vl") VCTR_TARGET ("avx512dq") static AVXRegister<int64_t> convertToInt (AVXRegister x);
135 VCTR_FORCEDINLINE VCTR_TARGET ("avx") static AVXRegister<int64_t> reinterpretAsInt (AVXRegister x);
140struct AVXRegister<int32_t>
142 static constexpr size_t numElements = 8;
144 using NativeType = __m256i;
150 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadUnaligned (const int32_t* d) {
return { _mm256_loadu_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
151 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const int32_t* d) {
return { _mm256_load_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
152 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (int32_t x) {
return { _mm256_set1_epi32 (x) }; }
153 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<int32_t> a, SSERegister<int32_t> b) {
return { _mm256_set_m128i (a.value, b.value) }; }
157 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (int32_t* d)
const { _mm256_storeu_si256 (
reinterpret_cast<__m256i*
> (d), value); }
158 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (int32_t* d)
const { _mm256_store_si256 (
reinterpret_cast<__m256i*
> (d), value); }
162 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) {
return { _mm256_and_si256 (a.value, b.value) }; }
163 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister bitwiseOr (AVXRegister a, AVXRegister b) {
return { _mm256_or_si256 (a.value, b.value) }; }
165 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAndLegacy (AVXRegister a, AVXRegister b) {
return { _mm256_castps_si256 (_mm256_and_ps (_mm256_castsi256_ps (a.value), _mm256_castsi256_ps (b.value))) }; }
166 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseOrLegacy (AVXRegister a, AVXRegister b) {
return { _mm256_castps_si256 (_mm256_or_ps (_mm256_castsi256_ps (a.value), _mm256_castsi256_ps (b.value))) }; }
171 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister
abs (AVXRegister x) {
return { _mm256_abs_epi32 (x.value) }; }
172 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_epi32 (a.value, b.value) }; }
173 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_epi32 (a.value, b.value) }; }
174 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister
max (AVXRegister a, AVXRegister b) {
return { _mm256_max_epi32 (a.value, b.value) }; }
175 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister
min (AVXRegister a, AVXRegister b) {
return { _mm256_min_epi32 (a.value, b.value) }; }
179 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister<
float> convertToFp (AVXRegister x) {
return { _mm256_cvtepi32_ps (x.value) }; }
180 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister<
float> reinterpretAsFp (AVXRegister x) {
return { _mm256_castsi256_ps (x.value) }; }
185struct AVXRegister<uint32_t>
187 static constexpr size_t numElements = 8;
189 using NativeType = __m256i;
195 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadUnaligned (const uint32_t* d) {
return { _mm256_loadu_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
196 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const uint32_t* d) {
return { _mm256_load_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
197 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (uint32_t x) {
return { _mm256_set1_epi32 ((int32_t) x) }; }
198 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<uint32_t> a, SSERegister<uint32_t> b) {
return { _mm256_set_m128i (a.value, b.value) }; }
202 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (uint32_t* d)
const { _mm256_storeu_si256 (
reinterpret_cast<__m256i*
> (d), value); }
203 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (uint32_t* d)
const { _mm256_store_si256 (
reinterpret_cast<__m256i*
> (d), value); }
210 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_epi32 (a.value, b.value) }; }
211 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_epi32 (a.value, b.value) }; }
212 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister max (AVXRegister a, AVXRegister b) {
return { _mm256_max_epu32 (a.value, b.value) }; }
213 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister min (AVXRegister a, AVXRegister b) {
return { _mm256_min_epu32 (a.value, b.value) }; }
218struct AVXRegister<int64_t>
220 static constexpr size_t numElements = 4;
222 using NativeType = __m256i;
228 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadUnaligned (const int64_t* d) {
return { _mm256_loadu_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
229 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const int64_t* d) {
return { _mm256_load_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
230 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (int64_t x) {
return { _mm256_set1_epi64x (x) }; }
231 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<int64_t> a, SSERegister<int64_t> b) {
return { _mm256_set_m128i (a.value, b.value) }; }
235 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (int64_t* d)
const { _mm256_storeu_si256 (
reinterpret_cast<__m256i*
> (d), value); }
236 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (int64_t* d)
const { _mm256_store_si256 (
reinterpret_cast<__m256i*
> (d), value); }
240 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister bitwiseAnd (AVXRegister a, AVXRegister b) {
return { _mm256_and_si256 (a.value, b.value) }; }
241 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister bitwiseOr (AVXRegister a, AVXRegister b) {
return { _mm256_or_si256 (a.value, b.value) }; }
243 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseAndLegacy (AVXRegister a, AVXRegister b) {
return { _mm256_castpd_si256 (_mm256_and_pd (_mm256_castsi256_pd (a.value), _mm256_castsi256_pd (b.value))) }; }
244 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister bitwiseOrLegacy (AVXRegister a, AVXRegister b) {
return { _mm256_castpd_si256 (_mm256_or_pd (_mm256_castsi256_pd (a.value), _mm256_castsi256_pd (b.value))) }; }
248 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_epi64 (a.value, b.value) }; }
249 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_epi64 (a.value, b.value) }; }
253 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister<
double> convertToFp (AVXRegister x) {
return { _mm256_cvtepi64_pd (x.value) }; }
254 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister<
double> reinterpretAsFp (AVXRegister x) {
return { _mm256_castsi256_pd (x.value) }; }
259struct AVXRegister<uint64_t>
261 static constexpr size_t numElements = 4;
263 using NativeType = __m256i;
269 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadUnaligned (const uint64_t* d) {
return { _mm256_loadu_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
270 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister loadAligned (const uint64_t* d) {
return { _mm256_load_si256 (
reinterpret_cast<const __m256i*
> (d)) }; }
271 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister broadcast (uint64_t x) {
return { _mm256_set1_epi64x ((int64_t) x) }; }
272 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") static AVXRegister fromSSE (SSERegister<uint64_t> a, SSERegister<uint64_t> b) {
return { _mm256_set_m128i (a.value, b.value) }; }
276 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeUnaligned (uint64_t* d)
const { _mm256_storeu_si256 (
reinterpret_cast<__m256i*
> (d), value); }
277 VCTR_FORCEDINLINE VCTR_TARGET (
"avx") void storeAligned (uint64_t* d)
const { _mm256_store_si256 (
reinterpret_cast<__m256i*
> (d), value); }
284 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister add (AVXRegister a, AVXRegister b) {
return { _mm256_add_epi64 (a.value, b.value) }; }
285 VCTR_FORCEDINLINE VCTR_TARGET (
"avx2") static AVXRegister sub (AVXRegister a, AVXRegister b) {
return { _mm256_sub_epi64 (a.value, b.value) }; }
289inline AVXRegister<int32_t> AVXRegister<float>::convertToInt (AVXRegister x) {
return { _mm256_cvtps_epi32 (x.value) }; }
290inline AVXRegister<int32_t> AVXRegister<float>::reinterpretAsInt (AVXRegister x) {
return { _mm256_castps_si256 (x.value) }; }
291inline AVXRegister<int64_t> AVXRegister<double>::convertToInt (AVXRegister x) {
return { _mm256_cvtpd_epi64 (x.value) }; }
292inline AVXRegister<int64_t> AVXRegister<double>::reinterpretAsInt (AVXRegister x) {
return { _mm256_castpd_si256 (x.value) }; }
constexpr ExpressionChainBuilder< expressions::Max > max
Computes the maximum value of the source values.
Definition: Max.h:198
constexpr ExpressionChainBuilder< expressions::Abs > abs
Computes the absolute value of the source values.
Definition: Abs.h:135
constexpr ExpressionChainBuilder< expressions::Min > min
Computes the minimum value of the source values.
Definition: Min.h:198
The main namespace of the VCTR project.
Definition: Array.h:24
Definition: AVXRegister.h:28