/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
/* Copyright © 2023-2024 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
* Matthias Kretz <[email protected]>
*/
#ifndef PROTOTYPE_X86_DETAIL_H_
#define PROTOTYPE_X86_DETAIL_H_
#include "simd_meta.h"
#include "vec_detail.h"
#include <cstdint>
#if _GLIBCXX_SIMD_HAVE_SSE
#pragma GCC push_options
// ensure GCC knows about the __builtin_ia32_* calls
#pragma GCC target("sse2", "sse3", "ssse3", "sse4.1", "sse4.2", "avx", "avx2", "bmi", "bmi2")
#pragma GCC pop_options
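// (GCC only accepts a __builtin_ia32_* call if the matching ISA extension is
// enabled for the calling function, via -m flags, a target attribute, or a
// target pragma; e.g. calling __builtin_ia32_movmskps256() without AVX
// enabled is rejected with a "needs isa option" diagnostic.)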
namespace std::__detail
{
struct _MachineFlags
{
uint64_t _M_have_mmx : 1 = _GLIBCXX_SIMD_HAVE_MMX;
uint64_t _M_have_sse : 1 = _GLIBCXX_SIMD_HAVE_SSE;
uint64_t _M_have_sse2 : 1 = _GLIBCXX_SIMD_HAVE_SSE2;
uint64_t _M_have_sse3 : 1 = _GLIBCXX_SIMD_HAVE_SSE3;
uint64_t _M_have_ssse3 : 1 = _GLIBCXX_SIMD_HAVE_SSSE3;
uint64_t _M_have_sse4_1 : 1 = _GLIBCXX_SIMD_HAVE_SSE4_1;
uint64_t _M_have_sse4_2 : 1 = _GLIBCXX_SIMD_HAVE_SSE4_2;
uint64_t _M_have_xop : 1 = _GLIBCXX_SIMD_HAVE_XOP;
uint64_t _M_have_avx : 1 = _GLIBCXX_SIMD_HAVE_AVX;
uint64_t _M_have_avx2 : 1 = _GLIBCXX_SIMD_HAVE_AVX2;
uint64_t _M_have_bmi : 1 = _GLIBCXX_SIMD_HAVE_BMI;
uint64_t _M_have_bmi2 : 1 = _GLIBCXX_SIMD_HAVE_BMI2;
uint64_t _M_have_lzcnt : 1 = _GLIBCXX_SIMD_HAVE_LZCNT;
uint64_t _M_have_sse4a : 1 = _GLIBCXX_SIMD_HAVE_SSE4A;
uint64_t _M_have_fma : 1 = _GLIBCXX_SIMD_HAVE_FMA;
uint64_t _M_have_fma4 : 1 = _GLIBCXX_SIMD_HAVE_FMA4;
uint64_t _M_have_f16c : 1 = _GLIBCXX_SIMD_HAVE_F16C;
uint64_t _M_have_popcnt : 1 = _GLIBCXX_SIMD_HAVE_POPCNT;
uint64_t _M_have_avx512f : 1 = _GLIBCXX_SIMD_HAVE_AVX512F;
uint64_t _M_have_avx512dq : 1 = _GLIBCXX_SIMD_HAVE_AVX512DQ;
uint64_t _M_have_avx512vl : 1 = _GLIBCXX_SIMD_HAVE_AVX512VL;
uint64_t _M_have_avx512bw : 1 = _GLIBCXX_SIMD_HAVE_AVX512BW;
uint64_t _M_have_avx512bitalg : 1 = _GLIBCXX_SIMD_HAVE_AVX512BITALG;
uint64_t _M_have_avx512vbmi : 1 = _GLIBCXX_SIMD_HAVE_AVX512VBMI;
uint64_t _M_have_avx512vbmi2 : 1 = _GLIBCXX_SIMD_HAVE_AVX512VBMI2;
uint64_t _M_have_avx512ifma : 1 = _GLIBCXX_SIMD_HAVE_AVX512IFMA;
uint64_t _M_have_avx512cd : 1 = _GLIBCXX_SIMD_HAVE_AVX512CD;
uint64_t _M_have_avx512vnni : 1 = _GLIBCXX_SIMD_HAVE_AVX512VNNI;
uint64_t _M_have_avx512vpopcntdq : 1 = _GLIBCXX_SIMD_HAVE_AVX512VPOPCNTDQ;
uint64_t _M_have_avx512vp2intersect : 1 = _GLIBCXX_SIMD_HAVE_AVX512VP2INTERSECT;
uint64_t _M_have_avx512fp16 : 1 = _GLIBCXX_SIMD_HAVE_AVX512FP16;
uint64_t _M_padding = 0;
};
static_assert(sizeof(_MachineFlags) == sizeof(uint64_t) * 2);
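// _MachineFlags is a structural type, so a value of it can be passed as a
// non-type template parameter (see __movmsk below, which defaults _Flags to
// the translation unit's flags). A sketch of overriding a single flag, e.g.
// to force the non-BMI2 code path in a test (hypothetical usage):
//   constexpr auto __no_bmi2 = [] {
//     _MachineFlags __f{};
//     __f._M_have_bmi2 = 0;
//     return __f;
//   }();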
template <__vectorizable _Tp>
struct __x86_builtin_int;
template <__vectorizable _Tp>
using __x86_builtin_int_t = typename __x86_builtin_int<_Tp>::type;
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 1)
struct __x86_builtin_int<_Tp>
{ using type = char; };
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 2)
struct __x86_builtin_int<_Tp>
{ using type = short; };
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 4)
struct __x86_builtin_int<_Tp>
{ using type = int; };
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 8)
struct __x86_builtin_int<_Tp>
{ using type = long long; };
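// The mapping depends only on sizeof(_Tp), e.g.:
//   static_assert(is_same_v<__x86_builtin_int_t<float>, int>);
//   static_assert(is_same_v<__x86_builtin_int_t<double>, long long>);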
template <__vectorizable _Tp>
struct __x86_builtin_fp;
template <__vectorizable _Tp>
using __x86_builtin_fp_t = typename __x86_builtin_fp<_Tp>::type;
#ifdef __STDCPP_FLOAT16_T__
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 2)
struct __x86_builtin_fp<_Tp>
{ using type = std::float16_t; };
#endif
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 4)
struct __x86_builtin_fp<_Tp>
{ using type = float; };
template <__vectorizable _Tp>
requires(sizeof(_Tp) == 8)
struct __x86_builtin_fp<_Tp>
{ using type = double; };
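// The analogous size-based mapping to builtin floating-point types, e.g.:
//   static_assert(is_same_v<__x86_builtin_fp_t<int>, float>);       // 4 bytes
//   static_assert(is_same_v<__x86_builtin_fp_t<uint64_t>, double>); // 8 bytes
// There is no 1-byte floating-point type, so __x86_builtin_fp stays undefined
// for 1-byte _Tp.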
template <typename _TV>
using __x86_intrin_t = __vec_builtin_type_bytes<
typename conditional_t<is_floating_point_v<__value_type_of<_TV>>,
__x86_builtin_fp<__value_type_of<_TV>>,
type_identity<long long>>::type,
sizeof(_TV) <= 16 ? 16z : sizeof(_TV)>;
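// __x86_intrin_t thus reproduces the <immintrin.h> layout: floating-point
// vectors keep their element type, every integral vector becomes a long long
// vector, and anything below XMM size is padded to 16 bytes. A sketch:
//   __x86_intrin_t<__vec_builtin_type<float, 4>> // 4 floats, like __m128
//   __x86_intrin_t<__vec_builtin_type<short, 8>> // 2 long longs, like __m128i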
/**
 * Returns __x with a type suitable for Intel intrinsics. If __x is smaller than a full XMM
 * register, a zero-padded 16-byte object is returned.
 */
template <__vec_builtin _TV>
_GLIBCXX_SIMD_INTRINSIC constexpr __x86_intrin_t<_TV>
__to_x86_intrin(_TV __x)
{
static_assert(sizeof(_TV) <= 64);
using _RV = __x86_intrin_t<_TV>;
if constexpr (sizeof(_TV) < 16)
{
using _Up = __make_signed_int_t<_TV>;
__vec_builtin_type_bytes<_Up, 16> __tmp = {__builtin_bit_cast(_Up, __x)};
return reinterpret_cast<_RV>(__tmp);
}
else if constexpr (is_same_v<_TV, _RV>)
return __x;
else
return reinterpret_cast<_RV>(__x);
}
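// Usage sketch (hypothetical values): an 8-byte vector is zero-extended into
// an XMM-sized object before it can be passed to an intrinsic:
//   __vec_builtin_type<float, 2> __v = {1.f, 2.f};
//   auto __w = __to_x86_intrin(__v); // 16-byte float vector {1, 2, 0, 0}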
_GLIBCXX_SIMD_INTRINSIC int
__movmsk(__vec_builtin_sizeof<8, 16> auto __x) noexcept
{ return __builtin_ia32_movmskpd(reinterpret_cast<__v2double>(__x)); }
_GLIBCXX_SIMD_INTRINSIC int
__movmsk(__vec_builtin_sizeof<8, 32> auto __x) noexcept
{ return __builtin_ia32_movmskpd256(reinterpret_cast<__v4double>(__x)); }
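// The __movmsk overloads mirror the movmskpd/movmskps/pmovmskb semantics:
// bit i of the result is the sign (MSB) of lane i, e.g.
//   __movmsk(__v2double{-1., 1.}) == 0b01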
_GLIBCXX_SIMD_INTRINSIC int
__movmsk(__vec_builtin_sizeof<4, 8> auto __x) noexcept
{
#if defined __x86_64__ and defined __BMI2__
return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
0x80000000'80000000ULL);
#else
using _Float2 [[gnu::vector_size(8)]] = float;
const _Float2 __tmp = __builtin_bit_cast(_Float2, __x);
return __builtin_ia32_movmskps(__builtin_shufflevector(__tmp, _Float2(), 0, 1, 2, 3));
#endif
}
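// The pext path above gathers the two sign bits directly: the mask
// 0x80000000'80000000 selects bit 31 (lane 0's MSB) and bit 63 (lane 1's
// MSB), which pext compresses into result bits 0 and 1. E.g. {-1.f, 1.f}
// bit-casts to 0x3f800000'bf800000, and pext yields 0b01.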
_GLIBCXX_SIMD_INTRINSIC int
__movmsk(__vec_builtin_sizeof<4, 16> auto __x) noexcept
{ return __builtin_ia32_movmskps(reinterpret_cast<__v4float>(__x)); }
_GLIBCXX_SIMD_INTRINSIC int
__movmsk(__vec_builtin_sizeof<4, 32> auto __x) noexcept
{ return __builtin_ia32_movmskps256(reinterpret_cast<__v8float>(__x)); }
template <__vec_builtin _TV, auto _Flags = _MachineFlags()>
requires (sizeof(__value_type_of<_TV>) <= 2)
_GLIBCXX_SIMD_ALWAYS_INLINE inline int
__movmsk(_TV __x) noexcept
{
static_assert(__width_of<decltype(__x)> > 1);
if constexpr (sizeof(__x) == 32)
return __builtin_ia32_pmovmskb256(reinterpret_cast<__v32char>(__x));
else if constexpr (sizeof(__x) == 16)
return __builtin_ia32_pmovmskb128(reinterpret_cast<__v16char>(__x));
else if constexpr (sizeof(__x) == 8)
{
using _Int2 [[gnu::vector_size(8)]] = int;
const _Int2 __tmp = __builtin_bit_cast(_Int2, __x);
return __builtin_ia32_pmovmskb128(
reinterpret_cast<__v16char>(
__builtin_shufflevector(__tmp, _Int2(), 0, 1, 2, 3)));
}
else if constexpr (sizeof(__x) == 4)
{
if constexpr (_Flags._M_have_bmi2)
return __builtin_ia32_pext_si(__builtin_bit_cast(unsigned int, __x), 0x80808080u);
using _Int1 [[gnu::vector_size(4)]] = int;
const _Int1 __tmp = __builtin_bit_cast(_Int1, __x);
return __builtin_ia32_pmovmskb128(
reinterpret_cast<__v16char>(
__builtin_shufflevector(__tmp, _Int1(), 0, 1, 1, 1)));
}
else if constexpr (sizeof(__x) == 2)
{
auto __bits = __builtin_bit_cast(unsigned short, __x);
if constexpr (_Flags._M_have_bmi2)
return __builtin_ia32_pext_si(__bits, 0x00008080u);
else
return ((__bits >> 7) & 1) | ((__bits & 0x8000) >> 14);
}
else
__assert_unreachable<decltype(__x)>();
}
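// Note that pmovmskb works at byte granularity, so for 2-byte elements this
// overload returns two (identical, for all-set or all-clear mask lanes) bits
// per lane; e.g. eight lanes of short(-1) yield 0xffff. The scalar tail for
// sizeof == 2 matches that: result bit 0 is byte 0's MSB (bit 7), result
// bit 1 is byte 1's MSB (bit 15).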
// Calling the andnot builtins inhibits some optimizations, whereas GCC seems perfectly able to
// choose andn instructions on its own without any help.
#if 0 // not defined __clang__
// overload __vec_andnot from detail.h
template <__vec_builtin _TV>
requires (sizeof(_TV) >= 16)
_GLIBCXX_SIMD_INTRINSIC constexpr _TV
__vec_andnot(_TV __a, _TV __b)
{
using _Tp = __value_type_of<_TV>;
using _UV = __vec_builtin_type<__make_unsigned_int_t<_Tp>, __width_of<_TV>>;
if (__builtin_is_constant_evaluated()
or (__builtin_constant_p(__a) and __builtin_constant_p(__b)))
return reinterpret_cast<_TV>(~reinterpret_cast<_UV>(__a) & reinterpret_cast<_UV>(__b));
else
return reinterpret_cast<_TV>([&] [[__gnu__::__always_inline__]] {
if constexpr (sizeof(_TV) == 16 and is_same_v<_Tp, float>)
return __builtin_ia32_andnps(__a, __b);
else if constexpr (sizeof(_TV) == 16 and is_same_v<_Tp, double>)
return __builtin_ia32_andnpd(__a, __b);
else if constexpr (sizeof(_TV) == 16 and is_integral_v<_Tp>)
return __builtin_ia32_pandn128(reinterpret_cast<__v2llong>(__a),
reinterpret_cast<__v2llong>(__b));
else if constexpr (sizeof(_TV) == 32 and is_same_v<_Tp, float>)
return __builtin_ia32_andnps256(__a, __b);
else if constexpr (sizeof(_TV) == 32 and is_same_v<_Tp, double>)
return __builtin_ia32_andnpd256(__a, __b);
else if constexpr (sizeof(_TV) == 32 and is_integral_v<_Tp> and __have_avx2)
return __builtin_ia32_andnotsi256(reinterpret_cast<__v4llong>(__a),
reinterpret_cast<__v4llong>(__b));
else if constexpr (sizeof(_TV) == 32 and is_integral_v<_Tp>)
return __builtin_ia32_andnpd256(reinterpret_cast<__v4double>(__a),
reinterpret_cast<__v4double>(__b));
else if constexpr (sizeof(_TV) == 64 and is_same_v<_Tp, float> and __have_avx512dq)
return __builtin_ia32_andnps512_mask(__a, __b, __v16float{}, -1);
else if constexpr (sizeof(_TV) == 64 and is_same_v<_Tp, double> and __have_avx512dq)
return __builtin_ia32_andnpd512_mask(__a, __b, __v8double{}, -1);
else if constexpr (sizeof(_TV) == 64)
return __builtin_ia32_pandnd512_mask(
reinterpret_cast<__v16int>(__a), reinterpret_cast<__v16int>(__b),
__v16int{}, -1);
}());
}
#endif // not __clang__
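// A minimal sketch of the generic __vec_andnot pattern (from detail.h) that
// stays enabled instead, and that GCC turns into andnps/andnpd/(v)pandn:
//   ~reinterpret_cast<_UV>(__a) & reinterpret_cast<_UV>(__b)
// i.e. plain "not-a and b" on the unsigned-integer view of the operands.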
}
#endif // _GLIBCXX_SIMD_HAVE_SSE
#endif // PROTOTYPE_X86_DETAIL_H_