46#define HWY_NEON_BUILD_TPL_1
47#define HWY_NEON_BUILD_TPL_2
48#define HWY_NEON_BUILD_TPL_3
52#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
53#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
54#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
57#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
58#define HWY_NEON_BUILD_PARAM_2(type, size) \
59 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
60#define HWY_NEON_BUILD_PARAM_3(type, size) \
61 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
62 const Vec128<type##_t, size> c
66#define HWY_NEON_BUILD_ARG_1 a.raw
67#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
68#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
77#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
83#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
84 HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
85 HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
86 name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
87 return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
88 HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
98#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
99 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
100 HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
101 HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
102 HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
103 HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
106#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
107 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
108 HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
109 HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
110 HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
111 HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
114#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
115 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
116 HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
117 HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
118 HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
121#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
122 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
123 HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
124 HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
125 HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
128#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
129 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
130 HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
131 HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
134#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
135 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
136 HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
137 HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
140#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
141 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
142 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
145#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
146 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
147 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
150#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
151 HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
152 HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
153 HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
157#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
158 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
159 HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
161#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
166#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
167 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
168 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
172#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
173 HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
174 HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
175 HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
178#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
179 HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
180 HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
181 HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
184#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
185 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
186 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
189#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
190 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
191 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
194#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
195 HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
196 HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
199#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
200 HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
201 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
203#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
204 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
205 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
206 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
209#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args) \
210 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
211 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
212 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
213 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
214 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
215 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
216 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
217 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
221#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
222#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
223#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
224#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
225#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
226#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
227#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
228#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
229#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
230#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
231#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
232#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
233#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
234#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
235#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
236#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
237#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
238#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
239#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
240#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
241#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
242#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
243#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
244#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
245#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
246#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
247#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
248#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
249#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
250#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
251#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
252#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
253#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
254#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
255#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
256#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
257#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
258#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
259#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
260#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
261#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
262#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
263#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
264#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
265#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
266#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
267#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
268#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
269#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
270#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
271#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
272#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
273#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
274#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
275#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
276#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
282template <
typename T,
size_t N>
284template <
typename T,
size_t N>
286template <
typename T,
size_t N>
592template <
typename T,
size_t N>
654 using type = float64x2_t;
717 using type = float64x1_t;
777template <
typename T,
size_t N = 16 /
sizeof(T)>
783 static constexpr size_t kPrivateN =
N;
793 return *
this = (*
this * other);
796 return *
this = (*
this / other);
799 return *
this = (*
this + other);
802 return *
this = (*
this - other);
805 return *
this = (*
this & other);
808 return *
this = (*
this | other);
811 return *
this = (*
this ^ other);
824template <
typename T,
size_t N = 16 /
sizeof(T)>
853#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
854#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
855 Vec128<uint8_t, size * sizeof(type##_t)>
856#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
857#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
882#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
883#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
884#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
885#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
895template <
size_t N, HWY_IF_LE64(
int8_t, N)>
900template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
905template <
size_t N, HWY_IF_LE64(
int16_t, N)>
910template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
915template <
size_t N, HWY_IF_LE64(
int32_t, N)>
920template <
size_t N, HWY_IF_LE64(
float, N)>
996template <
typename T,
size_t N,
typename FromT>
998 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
1005#define HWY_NEON_BUILD_TPL_HWY_SET1
1006#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
1007#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
1008 Simd<type##_t, size, 0> , const type##_t t
1009#define HWY_NEON_BUILD_ARG_HWY_SET1 t
1013#undef HWY_NEON_BUILD_TPL_HWY_SET1
1014#undef HWY_NEON_BUILD_RET_HWY_SET1
1015#undef HWY_NEON_BUILD_PARAM_HWY_SET1
1016#undef HWY_NEON_BUILD_ARG_HWY_SET1
1019template <
typename T,
size_t N>
1034#if HWY_COMPILER_GCC_ACTUAL
1039template <
typename T,
size_t N>
1048template <
typename T,
size_t N,
typename T2>
1051 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
1055 return Load(
d, lanes);
1061#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1062#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1063#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1064#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1068#undef HWY_NEON_BUILD_TPL_HWY_GET
1069#undef HWY_NEON_BUILD_RET_HWY_GET
1070#undef HWY_NEON_BUILD_PARAM_HWY_GET
1071#undef HWY_NEON_BUILD_ARG_HWY_GET
1077 return detail::GetLane<0>(
v);
1084template <
typename T>
1088 return detail::GetLane<0>(
v);
1091template <
typename T>
1093#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1094 if (__builtin_constant_p(i)) {
1097 return detail::GetLane<0>(
v);
1099 return detail::GetLane<1>(
v);
1103 alignas(16) T lanes[2];
1108template <
typename T>
1110#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1111 if (__builtin_constant_p(i)) {
1114 return detail::GetLane<0>(
v);
1116 return detail::GetLane<1>(
v);
1118 return detail::GetLane<2>(
v);
1120 return detail::GetLane<3>(
v);
1124 alignas(16) T lanes[4];
1129template <
typename T>
1131#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1132 if (__builtin_constant_p(i)) {
1135 return detail::GetLane<0>(
v);
1137 return detail::GetLane<1>(
v);
1139 return detail::GetLane<2>(
v);
1141 return detail::GetLane<3>(
v);
1143 return detail::GetLane<4>(
v);
1145 return detail::GetLane<5>(
v);
1147 return detail::GetLane<6>(
v);
1149 return detail::GetLane<7>(
v);
1153 alignas(16) T lanes[8];
1158template <
typename T>
1160#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1161 if (__builtin_constant_p(i)) {
1164 return detail::GetLane<0>(
v);
1166 return detail::GetLane<1>(
v);
1168 return detail::GetLane<2>(
v);
1170 return detail::GetLane<3>(
v);
1172 return detail::GetLane<4>(
v);
1174 return detail::GetLane<5>(
v);
1176 return detail::GetLane<6>(
v);
1178 return detail::GetLane<7>(
v);
1180 return detail::GetLane<8>(
v);
1182 return detail::GetLane<9>(
v);
1184 return detail::GetLane<10>(
v);
1186 return detail::GetLane<11>(
v);
1188 return detail::GetLane<12>(
v);
1190 return detail::GetLane<13>(
v);
1192 return detail::GetLane<14>(
v);
1194 return detail::GetLane<15>(
v);
1198 alignas(16) T lanes[16];
1206#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1207#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1208#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1209 Vec128<type##_t, size> v, type##_t t
1210#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1214#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1215#undef HWY_NEON_BUILD_RET_HWY_INSERT
1216#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1217#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1224template <
typename T>
1231template <
typename T>
1233#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1234 if (__builtin_constant_p(i)) {
1237 return detail::InsertLane<0>(
v, t);
1239 return detail::InsertLane<1>(
v, t);
1244 alignas(16) T lanes[2];
1247 return Load(
d, lanes);
1250template <
typename T>
1252#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1253 if (__builtin_constant_p(i)) {
1256 return detail::InsertLane<0>(
v, t);
1258 return detail::InsertLane<1>(
v, t);
1260 return detail::InsertLane<2>(
v, t);
1262 return detail::InsertLane<3>(
v, t);
1267 alignas(16) T lanes[4];
1270 return Load(
d, lanes);
1273template <
typename T>
1275#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1276 if (__builtin_constant_p(i)) {
1279 return detail::InsertLane<0>(
v, t);
1281 return detail::InsertLane<1>(
v, t);
1283 return detail::InsertLane<2>(
v, t);
1285 return detail::InsertLane<3>(
v, t);
1287 return detail::InsertLane<4>(
v, t);
1289 return detail::InsertLane<5>(
v, t);
1291 return detail::InsertLane<6>(
v, t);
1293 return detail::InsertLane<7>(
v, t);
1298 alignas(16) T lanes[8];
1301 return Load(
d, lanes);
1304template <
typename T>
1306#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1307 if (__builtin_constant_p(i)) {
1310 return detail::InsertLane<0>(
v, t);
1312 return detail::InsertLane<1>(
v, t);
1314 return detail::InsertLane<2>(
v, t);
1316 return detail::InsertLane<3>(
v, t);
1318 return detail::InsertLane<4>(
v, t);
1320 return detail::InsertLane<5>(
v, t);
1322 return detail::InsertLane<6>(
v, t);
1324 return detail::InsertLane<7>(
v, t);
1326 return detail::InsertLane<8>(
v, t);
1328 return detail::InsertLane<9>(
v, t);
1330 return detail::InsertLane<10>(
v, t);
1332 return detail::InsertLane<11>(
v, t);
1334 return detail::InsertLane<12>(
v, t);
1336 return detail::InsertLane<13>(
v, t);
1338 return detail::InsertLane<14>(
v, t);
1340 return detail::InsertLane<15>(
v, t);
1345 alignas(16) T lanes[16];
1348 return Load(
d, lanes);
1424#pragma push_macro("HWY_NEON_DEF_FUNCTION")
1425#undef HWY_NEON_DEF_FUNCTION
1426#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1427 template <int kBits> \
1428 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1429 return kBits == 0 ? v \
1430 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1431 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1439#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
1443template <
int kBits,
size_t N>
1445 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1446 if (kBits == 0)
return v;
1450template <
int kBits,
size_t N>
1452 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1453 if (kBits == 0)
return v;
1466template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1476template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1486template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1505template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1515template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1525template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1547template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1559template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1571template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1593template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1603template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1613template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1630template <
typename T,
size_t N>
1632 return v << Set(Simd<T, N, 0>(),
static_cast<T
>(bits));
1634template <
typename T,
size_t N>
1651template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1656template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1672template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1677template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1686 int32x4_t rlo = vmull_s16(vget_low_s16(a.
raw), vget_low_s16(b.
raw));
1688 int32x4_t rhi = vmull_high_s16(a.
raw, b.
raw);
1690 int32x4_t rhi = vmull_s16(vget_high_s16(a.
raw), vget_high_s16(b.
raw));
1693 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1697 uint32x4_t rlo = vmull_u16(vget_low_u16(a.
raw), vget_low_u16(b.
raw));
1699 uint32x4_t rhi = vmull_high_u16(a.
raw, b.
raw);
1701 uint32x4_t rhi = vmull_u16(vget_high_u16(a.
raw), vget_high_u16(b.
raw));
1704 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1707template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1710 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.
raw, b.
raw));
1713template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1716 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.
raw, b.
raw));
1723template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1776template <
size_t N, HWY_IF_LE64(
float, N)>
1785#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1786template <
size_t N, HWY_IF_LE64(
float, N)>
1788 const Vec128<float, N> x,
1789 const Vec128<float, N> add) {
1790 return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1792HWY_API Vec128<float>
MulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1793 const Vec128<float> add) {
1794 return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1802 return mul * x + add;
1807HWY_API Vec64<double>
MulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1808 const Vec64<double> add) {
1809 return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1811HWY_API Vec128<double>
MulAdd(
const Vec128<double> mul,
const Vec128<double> x,
1812 const Vec128<double> add) {
1813 return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1818#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1819template <
size_t N, HWY_IF_LE64(
float, N)>
1821 const Vec128<float, N> x,
1822 const Vec128<float, N> add) {
1823 return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1825HWY_API Vec128<float>
NegMulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1826 const Vec128<float> add) {
1827 return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1835 return add - mul * x;
1840HWY_API Vec64<double>
NegMulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1841 const Vec64<double> add) {
1842 return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1845 const Vec128<double> x,
1846 const Vec128<double> add) {
1847 return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1869HWY_API Vec128<double, N>
MulSub(
const Vec128<double, N> mul,
1870 const Vec128<double, N> x,
1871 const Vec128<double, N> sub) {
1876 const Vec128<double, N> x,
1877 const Vec128<double, N> sub) {
1920 const auto root =
v * recip;
1930template <
typename T>
1936template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1940 using V8 =
decltype(
Zero(d8));
1963template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1966 return detail::reversed_andnot(mask, not_mask);
1970template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1972 const Vec128<T, N> mask) {
1973 const DFromV<
decltype(mask)>
d;
1975 VFromD<
decltype(du)> ret =
1976 detail::reversed_andnot(
BitCast(du, mask),
BitCast(du, not_mask));
2005#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
2011HWY_API Vec128<T,
N>
Xor3(Vec128<T,
N> x1, Vec128<T,
N> x2, Vec128<T,
N> x3) {
2012 return Xor(x1,
Xor(x2, x3));
2015template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2016HWY_API Vec128<T, N>
Xor3(
const Vec128<T, N> x1,
const Vec128<T, N> x2,
2017 const Vec128<T, N> x3) {
2024template <
typename T,
size_t N>
2026 return Xor(x1,
Xor(x2, x3));
2032template <
typename T,
size_t N>
2034 return Or(o1,
Or(o2, o3));
2039template <
typename T,
size_t N>
2041 return Or(o,
And(a1, a2));
2046template <
typename T,
size_t N>
2054template <
typename T,
size_t N>
2059template <
typename T,
size_t N>
2064template <
typename T,
size_t N>
2071#ifdef HWY_NATIVE_POPCNT
2072#undef HWY_NATIVE_POPCNT
2074#define HWY_NATIVE_POPCNT
2079template <
typename T>
2084template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2092template <
typename T>
2095 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2098template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2102 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2106template <
typename T>
2109 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2110 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2112template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2116 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2120template <
typename T>
2123 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2124 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2126template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2130 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2131 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2136template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
2160template <
size_t N, HWY_IF_LE64(
int8_t, N)>
2164template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2168template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2172template <
size_t N, HWY_IF_LE64(
float, N)>
2178HWY_API Vec128<double>
Abs(
const Vec128<double>
v) {
2179 return Vec128<double>(vabsq_f64(
v.raw));
2183 return Vec64<double>(vabs_f64(
v.raw));
2189template <
typename T,
size_t N>
2192 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2197template <
typename T,
size_t N>
2200 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2206template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
2216template <
typename T,
size_t N>
2222template <
typename T,
size_t N>
2229template <
typename TFrom,
typename TTo,
size_t N>
2231 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
2237#define HWY_NEON_BUILD_TPL_HWY_IF
2238#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2239#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2240 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2241 const Vec128<type##_t, size> no
2242#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2246#undef HWY_NEON_BUILD_TPL_HWY_IF
2247#undef HWY_NEON_BUILD_RET_HWY_IF
2248#undef HWY_NEON_BUILD_PARAM_HWY_IF
2249#undef HWY_NEON_BUILD_ARG_HWY_IF
2252template <
typename T,
size_t N>
2259template <
typename T,
size_t N>
2265template <
typename T,
size_t N>
2268 static_assert(IsSigned<T>(),
"Only works for signed/float");
2276template <
typename T,
size_t N>
2279 return Max(zero,
v);
2284template <
typename T,
size_t N>
2289template <
typename T,
size_t N>
2295template <
typename T,
size_t N>
2301template <
typename T,
size_t N>
2307template <
typename T,
size_t N>
2313template <
typename T,
size_t N>
2345#define HWY_NEON_BUILD_TPL_HWY_COMPARE
2346#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2347#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2348 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2349#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
2373#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2374#undef HWY_NEON_BUILD_RET_HWY_COMPARE
2375#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2376#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2384 const Vec128<int64_t, N> b) {
2385 const Simd<int32_t, N * 2, 0> d32;
2386 const Simd<int64_t, N, 0> d64;
2394 const Vec128<uint64_t, N> b) {
2395 const Simd<uint32_t, N * 2, 0> d32;
2396 const Simd<uint64_t, N, 0> d64;
2403 const Vec128<int64_t> b) {
2404 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2408 const Vec64<int64_t> b) {
2409 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2415 const Vec128<uint64_t, N> b) {
2416 const DFromV<
decltype(a)> du;
2418 const Vec128<uint64_t, N> msb =
AndNot(a, b) |
AndNot(a ^ b, a - b);
2427#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2428#undef HWY_NEON_DEF_FUNCTION
2432#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2433 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2434 Vec128<type##_t, size> b) { \
2435 return Not(a == b); \
2440#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2444template <
typename T,
size_t N>
2448template <
typename T,
size_t N>
2455template <
typename T,
size_t N>
2463#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2464#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2465#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2466 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2467#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2479 return (
v & bit) == bit;
2484 return (
v & bit) == bit;
2488#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2489#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2490#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2491#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2518 const
Vec128<uint64_t,
N> b) {
2522 const DFromV<
decltype(a)> du;
2556 const
Vec128<uint64_t,
N> b) {
2560 const DFromV<
decltype(a)> du;
2630 return Vec128<double>(vld1q_f64(unaligned));
2675 return Vec64<double>(vld1_f64(p));
2694template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
2698 CopyBytes<4>(p, &buf);
2715template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2719 CopyBytes<2>(p, &buf);
2740 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2747 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2752template <
typename T,
size_t N>
2757template <
typename T,
size_t N>
2764template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2774 vst1q_u8(unaligned,
v.raw);
2778 vst1q_u16(unaligned,
v.raw);
2782 vst1q_u32(unaligned,
v.raw);
2786 vst1q_u64(unaligned,
v.raw);
2790 vst1q_s8(unaligned,
v.raw);
2794 vst1q_s16(unaligned,
v.raw);
2798 vst1q_s32(unaligned,
v.raw);
2802 vst1q_s64(unaligned,
v.raw);
2806 vst1q_f32(unaligned,
v.raw);
2811 vst1q_f64(unaligned,
v.raw);
2864 vst1_lane_u32(p,
v.raw, 0);
2868 vst1_lane_s32(p,
v.raw, 0);
2872 vst1_lane_f32(p,
v.raw, 0);
2875template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
2879 CopyBytes<4>(&buf, p);
2886 vst1_lane_u16(p,
v.raw, 0);
2890 vst1_lane_s16(p,
v.raw, 0);
2893template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2897 CopyBytes<2>(&buf, p);
2904 vst1_lane_u8(p,
v.raw, 0);
2908 vst1_lane_s8(p,
v.raw, 0);
2916 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2923 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2928#if HWY_COMPILER_GCC_ACTUAL
2933template <
typename T,
size_t N>
2940template <
typename T,
size_t N>
2945 const auto blended =
2954template <
typename T,
size_t N>
2971 uint16x8_t a = vmovl_u8(
v.raw);
2986 uint16x8_t a = vmovl_u8(
v.raw);
2994template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
2999template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3002 uint16x8_t a = vmovl_u8(
v.raw);
3010template <
size_t N, HWY_IF_LE64(u
int64_t, N)>
3015template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3020template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3023 uint16x8_t a = vmovl_u8(
v.raw);
3024 uint32x4_t b = vmovl_u16(vget_low_u16(a));
3027template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3030 uint32x4_t a = vmovl_u16(
v.raw);
3041 int16x8_t a = vmovl_s8(
v.raw);
3062 int16x8_t a = vmovl_s8(
v.raw);
3063 int32x4_t b = vmovl_s16(vget_low_s16(a));
3080 const Vec128<float16_t, 4>
v) {
3081 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3082 return Vec128<float>(f32);
3086 const Vec128<float16_t, N>
v) {
3087 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3088 return Vec128<float, N>(vget_low_f32(f32));
3100 const auto sign = ShiftRight<15>(bits16);
3101 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3102 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3103 const auto subnormal =
3105 Set(df32, 1.0f / 16384 / 1024));
3107 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3108 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3109 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3110 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3111 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3119 const Vec64<float>
v) {
3120 return Vec128<double>(vcvt_f64_f32(
v.raw));
3124 const Vec32<float>
v) {
3125 return Vec64<double>(vget_low_f64(vcvt_f64_f32(
v.raw)));
3129 const Vec64<int32_t>
v) {
3130 const int64x2_t i64 = vmovl_s32(
v.raw);
3131 return Vec128<double>(vcvtq_f64_s64(i64));
3135 const Vec32<int32_t>
v) {
3136 const int64x1_t i64 = vget_low_s64(vmovl_s32(
v.raw));
3137 return Vec64<double>(vcvt_f64_s64(i64));
3155 const uint16x4_t a = vqmovun_s32(
v.raw);
3164 const int16x4_t a = vqmovn_s32(
v.raw);
3173template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3178template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3183template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3186 const uint16x4_t a = vqmovun_s32(vcombine_s32(
v.raw,
v.raw));
3189template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3194template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3197 const int16x4_t a = vqmovn_s32(vcombine_s32(
v.raw,
v.raw));
3200template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3209 const Vec128<float>
v) {
3210 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(
v.raw))};
3214 const Vec128<float, N>
v) {
3215 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(
v.raw,
v.raw));
3216 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3225 const Rebind<uint32_t,
decltype(du16)> du;
3227 const auto bits32 =
BitCast(du,
v);
3228 const auto sign = ShiftRight<31>(bits32);
3229 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3230 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3232 const auto k15 =
Set(di, 15);
3233 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3234 const auto is_tiny = exp <
Set(di, -24);
3236 const auto is_subnormal = exp <
Set(di, -14);
3237 const auto biased_exp16 =
3239 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3240 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3241 (mantissa32 >> (
Set(du, 13) + sub_exp));
3243 ShiftRight<13>(mantissa32));
3245 const auto sign16 = ShiftLeft<15>(sign);
3246 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3256 const Rebind<int32_t,
decltype(dbf16)> di32;
3257 const Rebind<uint32_t,
decltype(dbf16)> du32;
3258 const Rebind<uint16_t,
decltype(dbf16)> du16;
3259 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3266 return Vec64<float>(vcvt_f32_f64(
v.raw));
3269 return Vec32<float>(vcvt_f32_f64(vcombine_f64(
v.raw,
v.raw)));
3273 const Vec128<double>
v) {
3274 const int64x2_t i64 = vcvtq_s64_f64(
v.raw);
3275 return Vec64<int32_t>(vqmovn_s64(i64));
3278 const Vec64<double>
v) {
3279 const int64x1_t i64 = vcvt_s64_f64(
v.raw);
3281 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3282 return Vec32<int32_t>(vqmovn_s64(i64x2));
3289 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3292template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3295 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3310 uint16x8_t c = vcombine_u16(a.
raw, b.
raw);
3319 int16x8_t c = vcombine_s16(a.
raw, b.
raw);
3331template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3341template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3352template <
size_t N, HWY_IF_LE64(
float, N)>
3361 const Vec128<int64_t>
v) {
3362 return Vec128<double>(vcvtq_f64_s64(
v.raw));
3365 const Vec64<int64_t>
v) {
3366 return Vec64<double>(vcvt_f64_s64(
v.raw));
3370 const Vec128<uint64_t>
v) {
3371 return Vec128<double>(vcvtq_f64_u64(
v.raw));
3374 const Vec64<uint64_t>
v) {
3375 return Vec64<double>(vcvt_f64_u64(
v.raw));
3380 const Vec128<double>
v) {
3381 return Vec128<int64_t>(vcvtq_s64_f64(
v.raw));
3384 const Vec64<double>
v) {
3385 return Vec64<int64_t>(vcvt_s64_f64(
v.raw));
3430 const auto int_f =
ConvertTo(df, integer);
3445 const auto added = large +
v;
3446 const auto rounded = added - large;
3458 const auto int_f =
ConvertTo(df, integer);
3472 const auto int_f =
ConvertTo(df, integer);
3487 return Vec128<int32_t>(vcvtnq_s32_f32(
v.raw));
3489template <
size_t N, HWY_IF_LE64(
float, N)>
3491 return Vec128<int32_t, N>(vcvtn_s32_f32(
v.raw));
3505template <
typename T,
size_t N>
3510template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3520template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3529 const VFromD<
decltype(di)> exp =
3531 return RebindMask(
d, Lt(exp,
Set(di, hwy::MaxExponentField<T>())));
3539template <
typename T,
size_t N, HWY_IF_LE64(u
int8_t, N)>
3573 return Vec64<double>(vget_low_f64(
v.raw));
3582template <
typename T,
size_t N>
3591template <
int kBytes,
typename T,
class V128 = Vec128<T>>
3593 static_assert(0 < kBytes && kBytes < 16,
"kBytes must be in [1, 15]");
3595 uint8x16_t v8 = vextq_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3600template <
int kBytes,
typename T>
3602 static_assert(0 < kBytes && kBytes < 8,
"kBytes must be in [1, 7]");
3604 uint8x8_t v8 = vext_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3616template <
int kBytes>
3626 template <
class T,
size_t N, HWY_IF_LE64(T, N)>
3630 const auto zero64 =
Zero(d64);
3631 const decltype(zero64) v64(
v.raw);
3633 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3638 template <
class T,
size_t N>
3645 template <
class T,
size_t N>
3651template <
int kBytes>
3653 template <
class T,
size_t N>
3657 if (
N *
sizeof(T) < 8) {
3658 constexpr size_t kReg =
N *
sizeof(T) == 16 ? 16 : 8;
3659 const Simd<T, kReg /
sizeof(T), 0> dreg;
3663 return CombineShiftRightBytes<kBytes>(
d,
Zero(
d),
v);
3668 template <
class T,
size_t N>
3675 template <
class T,
size_t N>
3683template <
int kBytes,
typename T,
size_t N>
3689template <
int kBytes,
typename T,
size_t N>
3694template <
int kLanes,
typename T,
size_t N>
3700template <
int kLanes,
typename T,
size_t N>
3706template <
int kBytes,
typename T,
size_t N>
3712template <
int kLanes,
typename T,
size_t N>
3719template <
int kBytes,
typename T,
size_t N, HWY_IF_LE32(T, N)>
3722 constexpr size_t kSize =
N *
sizeof(T);
3723 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3727 using V64 =
VFromD<
decltype(d_full8)>;
3728 const V64 hi64(
BitCast(d8, hi).raw);
3776 const Vec128<double>
v) {
3777 return Vec64<double>(vget_high_f64(
v.raw));
3784 const Twice<
decltype(duh)> du;
3789template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3796 return Vec128<T, (
N + 1) / 2>(upper.raw);
3805 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3806 return Vec128<uint16_t>(vdupq_laneq_u16(
v.raw, kLane));
3808template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3810 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3811 return Vec128<uint16_t, N>(vdup_lane_u16(
v.raw, kLane));
3815 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3816 return Vec128<uint32_t>(vdupq_laneq_u32(
v.raw, kLane));
3818template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3820 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3821 return Vec128<uint32_t, N>(vdup_lane_u32(
v.raw, kLane));
3825 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3826 return Vec128<uint64_t>(vdupq_laneq_u64(
v.raw, kLane));
3833 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3834 return Vec128<int16_t>(vdupq_laneq_s16(
v.raw, kLane));
3836template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3838 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3839 return Vec128<int16_t, N>(vdup_lane_s16(
v.raw, kLane));
3843 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3844 return Vec128<int32_t>(vdupq_laneq_s32(
v.raw, kLane));
3846template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3848 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3849 return Vec128<int32_t, N>(vdup_lane_s32(
v.raw, kLane));
3853 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3854 return Vec128<int64_t>(vdupq_laneq_s64(
v.raw, kLane));
3861 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3862 return Vec128<float>(vdupq_laneq_f32(
v.raw, kLane));
3864template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3866 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3867 return Vec128<float, N>(vdup_lane_f32(
v.raw, kLane));
3871 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3872 return Vec128<double>(vdupq_laneq_f64(
v.raw, kLane));
3876 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3886 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3889template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3891 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3896 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3899template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3901 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3906 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3914 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3917template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3919 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3924 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3927template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3929 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3934 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3942 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3945template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3947 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3955 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3960 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3967template <
typename T,
size_t N>
3972template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3974 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3975#if HWY_IS_DEBUG_BUILD
3976 const Rebind<TI,
decltype(
d)> di;
3978 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(
N)))));
3982 using V8 =
VFromD<
decltype(d8)>;
3986 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
3987 if (
sizeof(T) == 4) {
3988 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
3989 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3990 const V8 lane_indices =
3992 const V8 byte_indices =
3994 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3995 0, 1, 2, 3, 0, 1, 2, 3};
3996 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
3999 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
4000 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
4001 const V8 lane_indices =
4003 const V8 byte_indices =
4005 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
4006 0, 1, 2, 3, 4, 5, 6, 7};
4007 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
4012template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
4014 const Rebind<TI,
decltype(
d)> di;
4018template <
typename T,
size_t N>
4029template <
typename T>
4035template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4040template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4046template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4052template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4060template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4065template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4071template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
4076template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4082template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4089template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4094template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4100template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4105template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4112template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4117template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4130template <
typename T>
4134template <
typename T>
4140template <
typename T>
4146template <
typename T>
4152template <
typename T>
4168 const Vec128<uint64_t> b) {
4169 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4172 const Vec128<int64_t> b) {
4173 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4176 const Vec128<double> b) {
4177 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4196template <
size_t N, HWY_IF_LE64(
float, N)>
4203template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4209template <
typename T,
size_t N,
class V = Vec128<T, N>>
4224 const Vec128<uint64_t> b) {
4225 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4228 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4231 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4255template <
typename T,
size_t N, HWY_IF_GE64(T, N),
class V = Vec128<T, N>>
4261template <
typename T,
size_t N, HWY_IF_LE32(T, N),
class V = Vec128<T, N>>
4263 const Half<
decltype(
d)> d2;
4271template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4275template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4280template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4293 const Rebind<uint32_t,
decltype(df32)> du32;
4294 using VU32 =
VFromD<
decltype(du32)>;
4295 const VU32 odd =
Set(du32, 0xFFFF0000u);
4297 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
4299 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
4391HWY_API Vec128<double>
Combine(Full128<double> , Vec64<double> hi,
4393 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4398template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4414 return Add(sum0, sum1);
4424 const Half<
decltype(
d)> d64;
4447template <
typename T,
size_t N>
4455template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
// Macro pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_TRN: that macro pastes `args` onto the HWY_NEON_BUILD_TPL_,
// _RET_, _PARAM_ and _ARG_ prefixes.
// No template prefix is emitted for this variant.
4470#define HWY_NEON_BUILD_TPL_HWY_TRN
// Return type: the NEON "x2" aggregate for the given lane type and count,
// e.g. (uint8, 16) expands to uint8x16x2_t.
4471#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
// Parameters: two raw vectors `a` and `b` of type Raw128<TYPE##_t, size>::type
// (not Vec128 wrappers, unlike the _1/_2/_3 variants).
4474#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4475 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
// Arguments forwarded to the intrinsic: the raw vectors, passed through as-is.
4476#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4498template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4506 using VU =
VFromD<
decltype(du)>;
4508 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4516template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4525template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4533 using VU =
VFromD<
decltype(du)>;
4535 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4543template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4550template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4553 constexpr size_t kSize =
N *
sizeof(T);
4555 const Full64<uint8_t> d8x8;
4556 const Full64<T> d64;
4557 using V8x8 =
VFromD<
decltype(d8x8)>;
4558 const V8x8 hi8x8(
BitCast(d8, hi).raw);
4563 return Vec128<T, N>(
BitCast(d64, r).raw);
4569template <
typename T,
size_t N>
4584template <typename T,
size_t N,
4592template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4595 const Twice<
decltype(
d)> d2;
4606template <
typename T>
4615template <
typename T,
size_t N,
4623template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4626 const Twice<
decltype(
d)> d2;
4637template <
typename T>
4645template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4648 return detail::InterleaveEven(
v,
v);
4650 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[0]);
4654template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4661template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4664 return detail::InterleaveOdd(
v,
v);
4666 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[1]);
4670template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4677template <
typename T,
size_t N>
4681 alignas(16)
constexpr uint8_t kBytes[16] = {
4682 ((0 /
sizeof(T)) & 1) ? 0 : 0xFF, ((1 /
sizeof(T)) & 1) ? 0 : 0xFF,
4683 ((2 /
sizeof(T)) & 1) ? 0 : 0xFF, ((3 /
sizeof(T)) & 1) ? 0 : 0xFF,
4684 ((4 /
sizeof(T)) & 1) ? 0 : 0xFF, ((5 /
sizeof(T)) & 1) ? 0 : 0xFF,
4685 ((6 /
sizeof(T)) & 1) ? 0 : 0xFF, ((7 /
sizeof(T)) & 1) ? 0 : 0xFF,
4686 ((8 /
sizeof(T)) & 1) ? 0 : 0xFF, ((9 /
sizeof(T)) & 1) ? 0 : 0xFF,
4687 ((10 /
sizeof(T)) & 1) ? 0 : 0xFF, ((11 /
sizeof(T)) & 1) ? 0 : 0xFF,
4688 ((12 /
sizeof(T)) & 1) ? 0 : 0xFF, ((13 /
sizeof(T)) & 1) ? 0 : 0xFF,
4689 ((14 /
sizeof(T)) & 1) ? 0 : 0xFF, ((15 /
sizeof(T)) & 1) ? 0 : 0xFF,
4696template <
typename T,
size_t N>
4703template <
typename T,
size_t N>
4711template <
typename T>
4722 const Repartition<uint32_t,
decltype(dbf16)> du32;
4735 return Combine(d16, a16, b16);
4755#if defined(__ARM_FEATURE_AES) || \
4756 (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
4759#ifdef HWY_NATIVE_AES
4760#undef HWY_NATIVE_AES
4762#define HWY_NATIVE_AES
4766 Vec128<uint8_t> round_key) {
4771 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4776 Vec128<uint8_t> round_key) {
4777 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4781 return Vec128<uint64_t>((uint64x2_t)vmull_p64(
GetLane(a),
GetLane(b)));
4785 return Vec128<uint64_t>(
4786 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4796 const Rebind<uint16_t,
decltype(df32)> du16;
4840template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4850template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4859template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4877 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4884 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4893 return Vec128<int64_t, (
N + 1) / 2>(
4894 vget_low_s64(vmull_s32(a_packed, b_packed)));
4902 return Vec128<uint64_t, (
N + 1) / 2>(
4903 vget_low_u64(vmull_u32(a_packed, b_packed)));
4908 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 0), vgetq_lane_u64(b.
raw, 0), &hi);
4914 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 1), vgetq_lane_u64(b.
raw, 1), &hi);
4921template <
typename T,
typename TI>
4930 uint8x16_t table0 =
BitCast(d8, bytes).raw;
4932 table.val[0] = vget_low_u8(table0);
4933 table.val[1] = vget_high_u8(table0);
4934 uint8x16_t idx =
BitCast(d8, from).raw;
4935 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4936 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4942template <
typename T,
typename TI,
size_t NI, HWY_IF_LE64(TI, NI)>
4947 const auto idx_full =
Combine(d_full, from64, from64);
4953template <
typename T,
size_t N,
typename TI, HWY_IF_LE64(T, N)>
4961template <
typename T,
size_t N,
typename TI,
size_t NI,
HWY_IF_LE64(T,
N),
4967 const Repartition<uint8_t,
decltype(d_idx)> d_idx8;
4970 const auto from8 =
BitCast(d_idx8, from);
4971 const VFromD<
decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4976template <
class V,
class VI>
4983template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
4987 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4989 alignas(16) T lanes[
N];
4992 alignas(16) Offset offset_lanes[
N];
4993 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
4995 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
4996 for (
size_t i = 0; i <
N; ++i) {
4997 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
5001template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
5004 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
5006 alignas(16) T lanes[
N];
5009 alignas(16) Index index_lanes[
N];
5010 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
5012 for (
size_t i = 0; i <
N; ++i) {
5013 base[index_lanes[i]] = lanes[i];
5019template <
typename T,
size_t N,
typename Offset>
5023 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
5025 alignas(16) Offset offset_lanes[
N];
5026 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
5028 alignas(16) T lanes[
N];
5029 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
5030 for (
size_t i = 0; i <
N; ++i) {
5031 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
5033 return Load(
d, lanes);
5036template <
typename T,
size_t N,
typename Index>
5040 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
5042 alignas(16) Index index_lanes[
N];
5043 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
5045 alignas(16) T lanes[
N];
5046 for (
size_t i = 0; i <
N; ++i) {
5047 lanes[i] = base[index_lanes[i]];
5049 return Load(
d, lanes);
5057template <
typename T>
5062template <
typename T>
5067template <
typename T>
5075#define HWY_NEON_BUILD_RET_REDUCTION(type, size) Vec128<type##_t, size>
5076#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix, dup) \
5077 HWY_API HWY_NEON_BUILD_RET_REDUCTION(type, size) \
5078 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5079 return HWY_NEON_BUILD_RET_REDUCTION( \
5080 type, size)(dup##suffix(HWY_NEON_EVAL(prefix##infix##suffix, v.raw))); \
// Instantiates HWY_NEON_DEF_REDUCTION once per lane type/count:
// - u8/u16/u32, s8/s16/s32 and f32 each get a 64-bit-vector variant
//   (intrinsic `prefix`, result broadcast via vdup_n_) and a 128-bit-vector
//   variant (intrinsic `prefix##q`, result broadcast via vdupq_n_);
// - f64 only gets the 2-lane 128-bit variant.
// 64-bit integer lanes are not covered by this list.
5083#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5084 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8, vdup_n_) \
5085 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8, vdupq_n_) \
5086 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16, vdup_n_) \
5087 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16, vdupq_n_) \
5088 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32, vdup_n_) \
5089 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32, vdupq_n_) \
5090 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8, vdup_n_) \
5091 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8, vdupq_n_) \
5092 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16, vdup_n_) \
5093 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16, vdupq_n_) \
5094 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32, vdup_n_) \
5095 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32, vdupq_n_) \
5096 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32, vdup_n_) \
5097 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32, vdupq_n_) \
5098 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64, vdupq_n_)
// Generates the MinOfLanes overloads; the macro pastes the prefix with `_` and
// the type suffix, so these call the vminv_* / vminvq_* intrinsics.
5100HWY_NEON_DEF_REDUCTION_CORE_TYPES(
MinOfLanes, vminv)
// Generates the MaxOfLanes overloads from the vmaxv_* / vmaxvq_* intrinsics.
5101HWY_NEON_DEF_REDUCTION_CORE_TYPES(
MaxOfLanes, vmaxv)
// Everything HWY_NEON_DEF_REDUCTION_CORE_TYPES generates, plus the 2-lane
// u64 and s64 variants (intrinsic `prefix##q`, result broadcast via vdupq_n_).
5104#define HWY_NEON_DEF_REDUCTION_ALL_TYPES(name, prefix) \
5105 HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5106 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64, vdupq_n_) \
5107 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64, vdupq_n_)
// Generates the SumOfLanes overloads from the vaddv_* / vaddvq_* intrinsics,
// including the 64-bit integer lane types.
5109HWY_NEON_DEF_REDUCTION_ALL_TYPES(
SumOfLanes, vaddv)
5111#undef HWY_NEON_DEF_REDUCTION_ALL_TYPES
5112#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
5113#undef HWY_NEON_DEF_REDUCTION
5114#undef HWY_NEON_BUILD_RET_REDUCTION
// Constraints used as extra template parameters by the generic reduction
// templates further down (`template <typename T, HWY_IF_SUM_REDUCTION(T)>`).
// NOTE(review): the second argument of HWY_IF_LANE_SIZE_ONE_OF looks like a
// bitmask indexed by lane size in bytes (1 << 2 -> 2-byte lanes,
// 1 << 8 -> 8-byte lanes); its definition is not visible here - confirm.
// These two macros are defined again later in the file with different masks,
// presumably in another preprocessor branch - confirm.
5117#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 2)
5118#define HWY_IF_MINMAX_REDUCTION(T) \
5119 HWY_IF_LANE_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
5123template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
5128template <
typename T>
5133template <
typename T>
5142 uint32x4x2_t v0 = vuzpq_u32(
v.raw,
v.raw);
5143 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
5144 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
5149 int32x4x2_t v0 = vuzpq_s32(
v.raw,
v.raw);
5150 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
5151 int32x4x2_t v1 = vuzpq_s32(c0, c0);
5156 float32x4x2_t v0 = vuzpq_f32(
v.raw,
v.raw);
5157 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
5158 float32x4x2_t v1 = vuzpq_f32(c0, c0);
5170template <
typename T>
5176 return Min(v20_31_20_31, v31_20_31_20);
5178template <
typename T>
5184 return Max(v20_31_20_31, v31_20_31_20);
// Spells the raw NEON vector type for `size` lanes of `type`,
// e.g. (uint16, 4) expands to uint16x4_t.
5187#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
// Vec128 wrapper type returned by the pairwise reduction functions.
5188#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
5189#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
5190 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5191 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5192 HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
5193 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5194 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5195 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5196 type, size)(HWY_NEON_EVAL(vdup##_lane_##suffix, tmp, 0)); \
5198#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
5200 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5201 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5202 HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
5203 tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
5204 vget_low_##suffix(v.raw)); \
5205 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5206 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5207 if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5208 tmp = vdup_lane_##suffix(tmp, 0); \
5209 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5210 type, size)(HWY_NEON_EVAL(vcombine_##suffix, tmp, tmp)); \
// Instantiates the pairwise reductions for 8- and 16-bit lanes:
// - 64-bit vectors (u16x4, u8x8, s16x4, s8x8) via
//   HWY_NEON_DEF_PAIRWISE_REDUCTION;
// - 128-bit vectors (u16x8, u8x16, s16x8, s8x16) via the WIDE variant, whose
//   third argument (`half`) is the lane count of one 64-bit half.
5213#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
5214 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
5215 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
5216 HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
5217 HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
5218 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
5219 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
5220 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
5221 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8)
5227#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
5228#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
5229#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
5230#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
5231#undef HWY_NEON_BUILD_TYPE_T
5233template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5239 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5244template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5250 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5251 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5257template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5263 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5268template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5274 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5275 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5281template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5287 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5292template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5298 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5299 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
// Second set of definitions for the reduction constraints; an earlier pair in
// this file uses different masks.
// NOTE(review): presumably this pair belongs to a different preprocessor
// branch than the earlier one (the branch directives are not visible here),
// and a mask of 0 presumably matches no lane size - confirm against
// HWY_IF_LANE_SIZE_ONE_OF.
5306#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 0)
5307#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 8)
5312template <
typename T, HWY_IF_SUM_REDUCTION(T)>
5317template <
typename T, HWY_IF_MINMAX_REDUCTION(T)>
5322template <
typename T, HWY_IF_MINMAX_REDUCTION(T)>
5328#undef HWY_IF_SUM_REDUCTION
5329#undef HWY_IF_MINMAX_REDUCTION
5333template <
typename T,
size_t N>
5337template <
typename T,
size_t N>
5341template <
typename T,
size_t N>
5353template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5358template <
typename T>
5363template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
5368 const auto vmask_bits =
Set64(du, mask_bits);
5371 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5372 1, 1, 1, 1, 1, 1, 1, 1};
5375 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5376 1, 2, 4, 8, 16, 32, 64, 128};
5380template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
5383 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5384 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
5388template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
5391 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5392 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
5396template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
5399 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
5406template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
5409 uint64_t mask_bits = 0;
5420template <
typename T>
5428template <
typename T>
5431 const Twice<
decltype(
d)> d2;
5437template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5442 constexpr size_t kBytes =
sizeof(T) *
N;
5443 return nib & ((1ull << (kBytes * 4)) - 1);
5446template <
typename T>
5449 alignas(16)
constexpr uint8_t kSliceLanes[16] = {
5450 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5458 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.
raw, values.
raw));
5459 const uint8x8_t x4 = vpadd_u8(x2, x2);
5460 const uint8x8_t x8 = vpadd_u8(x4, x4);
5461 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5464 const uint16x8_t x2 = vpaddlq_u8(values.
raw);
5465 const uint32x4_t x4 = vpaddlq_u16(x2);
5466 const uint64x2_t x8 = vpaddlq_u32(x4);
5467 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5471template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5476 alignas(8)
constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5477 0x10, 0x20, 0x40, 0x80};
5484 return vaddv_u8(values.
raw);
5486 const uint16x4_t x2 = vpaddl_u8(values.
raw);
5487 const uint32x2_t x4 = vpaddl_u16(x2);
5488 const uint64x1_t x8 = vpaddl_u32(x4);
5489 return vget_lane_u64(x8, 0);
5493template <
typename T>
5496 alignas(16)
constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5497 0x10, 0x20, 0x40, 0x80};
5503 return vaddvq_u16(values.
raw);
5505 const uint32x4_t x2 = vpaddlq_u16(values.
raw);
5506 const uint64x2_t x4 = vpaddlq_u32(x2);
5507 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5511template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5516 alignas(8)
constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5522 return vaddv_u16(values.
raw);
5524 const uint32x2_t x2 = vpaddl_u16(values.
raw);
5525 const uint64x1_t x4 = vpaddl_u32(x2);
5526 return vget_lane_u64(x4, 0);
5530template <
typename T>
5533 alignas(16)
constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5539 return vaddvq_u32(values.
raw);
5541 const uint64x2_t x2 = vpaddlq_u32(values.
raw);
5542 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5546template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5551 alignas(8)
constexpr uint32_t kSliceLanes[2] = {1, 2};
5557 return vaddv_u32(values.
raw);
5559 const uint64x1_t x2 = vpaddl_u32(values.
raw);
5560 return vget_lane_u64(x2, 0);
5564template <
typename T>
5566 alignas(16)
constexpr uint64_t kSliceLanes[2] = {1, 2};
5572 return vaddvq_u64(values.
raw);
5574 return vgetq_lane_u64(values.
raw, 0) + vgetq_lane_u64(values.
raw, 1);
5578template <
typename T>
5584 return vget_lane_u64(values.
raw, 0);
5588template <
typename T,
size_t N>
5590 return ((
N *
sizeof(T)) >= 8) ? bits : (bits & ((1ull <<
N) - 1));
5593template <
typename T,
size_t N>
5608template <
typename T>
5611 const int8x16_t ones =
5615 return static_cast<size_t>(vaddvq_s8(ones));
5617 const int16x8_t x2 = vpaddlq_s8(ones);
5618 const int32x4_t x4 = vpaddlq_s16(x2);
5619 const int64x2_t x8 = vpaddlq_s32(x4);
5620 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5623template <
typename T>
5626 const int16x8_t ones =
5630 return static_cast<size_t>(vaddvq_s16(ones));
5632 const int32x4_t x2 = vpaddlq_s16(ones);
5633 const int64x2_t x4 = vpaddlq_s32(x2);
5634 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5638template <
typename T>
5641 const int32x4_t ones =
5645 return static_cast<size_t>(vaddvq_s32(ones));
5647 const int64x2_t x2 = vpaddlq_s32(ones);
5648 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5652template <
typename T>
5656 const int64x2_t ones =
5658 return static_cast<size_t>(vaddvq_s64(ones));
5662 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5663 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5670template <
typename T>
5676template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5678 constexpr int kDiv = 4 *
sizeof(T);
5682template <
typename T,
size_t N>
5686 constexpr size_t kDiv = 4 *
sizeof(T);
5690template <
typename T,
size_t N>
5694 if (nib == 0)
return -1;
5695 constexpr int kDiv = 4 *
sizeof(T);
5700template <
typename T,
size_t N>
5704 const size_t kNumBytes = (
N + 7) / 8;
5705 CopyBytes<kNumBytes>(&mask_bits, bits);
5709template <
typename T,
size_t N>
5715template <
typename T>
5720template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5722 constexpr size_t kBytes =
sizeof(T) *
N;
5728template <
typename T>
5730 enum { value = (
sizeof(T) != 1) };
5737 const uint8_t* bytes) {
5739 vld1q_dup_u64(
reinterpret_cast<const uint64_t*
>(bytes))));
5743template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
5745 const uint8_t* bytes) {
5746 return Load(
d, bytes);
5749template <
typename T,
size_t N>
5751 const uint64_t mask_bits) {
5765 alignas(16)
constexpr uint8_t table[256 * 8] = {
5767 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5768 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5769 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5770 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5771 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5772 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5773 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5774 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5775 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5776 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5777 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5778 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5779 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5780 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5781 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5782 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5783 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5784 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5785 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5786 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5787 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5788 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5789 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5790 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5791 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5792 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5793 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5794 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5795 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5796 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5797 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5798 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5799 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5800 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5801 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5802 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5803 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5804 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5805 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5806 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5807 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5808 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5809 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5810 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5811 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5812 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5813 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5814 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5815 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5816 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5817 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5818 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5819 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5820 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5821 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5822 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5823 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5824 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5825 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5826 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5827 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5828 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5829 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5830 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5831 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5832 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5833 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5834 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5835 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5836 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5837 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5838 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5839 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5840 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5841 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5842 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5843 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5844 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5845 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5846 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5847 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5848 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5849 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5850 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5851 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5852 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5853 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5854 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5855 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5856 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5857 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5858 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5859 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5860 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5861 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5862 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5863 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5864 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5865 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5866 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5867 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5868 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5869 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5870 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5871 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5872 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5873 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5874 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5875 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5876 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5877 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5878 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5879 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5880 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5881 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5882 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5883 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5884 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5885 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5886 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5887 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5888 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5889 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5890 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5891 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5892 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5893 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5894 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5901template <
typename T,
size_t N>
5903 const uint64_t mask_bits) {
5917 alignas(16)
constexpr uint8_t table[256 * 8] = {
5919 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5920 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5921 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5922 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5923 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5924 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5925 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5926 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5927 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5928 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5929 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5930 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5931 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5932 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5933 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5934 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5935 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5936 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5937 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5938 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5939 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5940 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5941 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5942 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5943 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5944 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5945 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5946 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5947 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5948 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5949 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5950 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5951 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5952 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5953 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5954 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5955 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5956 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5957 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5958 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5959 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5960 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5961 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5962 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5963 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5964 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5965 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5966 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5967 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5968 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5969 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5970 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5971 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5972 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5973 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5974 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5975 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5976 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5977 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5978 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5979 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5980 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5981 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5982 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5983 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5984 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5985 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5986 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5987 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5988 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5989 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5990 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5991 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5992 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5993 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5994 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5995 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5996 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5997 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5998 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5999 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
6000 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
6001 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
6002 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
6003 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
6004 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
6005 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
6006 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
6007 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
6008 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
6009 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
6010 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
6011 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
6012 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
6013 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
6014 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
6015 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
6016 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
6017 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
6018 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
6019 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
6020 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
6021 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
6022 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
6023 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
6024 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
6025 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
6026 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
6027 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
6028 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
6029 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
6030 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
6031 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
6032 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
6033 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
6034 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
6035 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
6036 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
6037 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
6038 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
6039 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
6040 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
6041 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
6042 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
6043 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
6044 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
6045 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
6046 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6053template <
typename T,
size_t N>
6055 const uint64_t mask_bits) {
6059 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
6061 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6062 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6063 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
6064 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6065 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
6066 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
6067 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
6068 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6069 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6070 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
6071 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
6072 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6073 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6074 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
6075 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
6076 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6079 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6082template <
typename T,
size_t N>
6084 const uint64_t mask_bits) {
6088 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
6090 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6091 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6092 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6093 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6094 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6095 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6096 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6097 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6098 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6099 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6100 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6101 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6102 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6103 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6107 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6110#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
6112template <
typename T,
size_t N>
6114 const uint64_t mask_bits) {
6118 alignas(16)
constexpr uint8_t u8_indices[64] = {
6120 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6121 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6122 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6123 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6125 const Simd<T, N, 0>
d;
6127 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6130template <
typename T,
size_t N>
6132 const uint64_t mask_bits) {
6136 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
6138 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6139 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6140 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6141 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6143 const Simd<T, N, 0>
d;
6145 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6152template <
typename T,
size_t N>
6155 detail::IdxFromBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
6161template <
typename T,
size_t N>
6164 detail::IdxFromNotBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
6173template <
typename T>
6179template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
6191template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6197template <
typename T>
6203template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6215template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6219 if (
N < 16 /
sizeof(T)) {
6233template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6236 uint64_t mask_bits = 0;
6237 constexpr size_t kNumBytes = (
N + 7) / 8;
6238 CopyBytes<kNumBytes>(bits, &mask_bits);
6240 mask_bits &= (1ull <<
N) - 1;
6247template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6256template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6261 using TU =
TFromD<
decltype(du)>;
6263 const size_t count =
PopCount(mask_bits);
6272template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6276 uint64_t mask_bits = 0;
6277 constexpr size_t kNumBytes = (
N + 7) / 8;
6278 CopyBytes<kNumBytes>(bits, &mask_bits);
6280 mask_bits &= (1ull <<
N) - 1;
6290#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6291#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6293#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
6297#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6298#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
6301#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
6302#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6305#define HWY_IF_LOAD_INT(T, N) \
6306 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6307#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
6308 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6309 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6310 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6311 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6312 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6318#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6319 decltype(Tuple2<type##_t, size>().raw)
6321#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6322 const type##_t *from, Tuple2<type##_t, size>
6324#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6325#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6327#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6328 decltype(Tuple3<type##_t, size>().raw)
6329#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6330 const type##_t *from, Tuple3<type##_t, size>
6332#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6333#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6335#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6336 decltype(Tuple4<type##_t, size>().raw)
6337#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6338 const type##_t *from, Tuple4<type##_t, size>
6340#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6341#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6343#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
6344#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6345#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
6348template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6358template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6363 alignas(16) T buf[2 * 8 /
sizeof(T)] = {};
6364 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
6366 v0 = Vec128<T, N>(raw.val[0]);
6367 v1 = Vec128<T, N>(raw.val[1]);
6372template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6374 Vec128<T>& v0, Vec128<T>& v1) {
6375 const Half<
decltype(
d)> dh;
6376 VFromD<
decltype(dh)> v00, v10, v01, v11;
6386template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6397template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6400 Vec128<T, N>& v1, Vec128<T, N>& v2) {
6402 alignas(16) T buf[3 * 8 /
sizeof(T)] = {};
6403 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
6405 v0 = Vec128<T, N>(raw.val[0]);
6406 v1 = Vec128<T, N>(raw.val[1]);
6407 v2 = Vec128<T, N>(raw.val[2]);
6412template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6414 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6415 const Half<
decltype(
d)> dh;
6416 VFromD<
decltype(dh)> v00, v10, v20, v01, v11, v21;
6427template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6440template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6443 Vec128<T, N>& v1, Vec128<T, N>& v2,
6445 alignas(16) T buf[4 * 8 /
sizeof(T)] = {};
6446 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6448 v0 = Vec128<T, N>(raw.val[0]);
6449 v1 = Vec128<T, N>(raw.val[1]);
6450 v2 = Vec128<T, N>(raw.val[2]);
6451 v3 = Vec128<T, N>(raw.val[3]);
6456template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6458 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6460 const Half<
decltype(
d)> dh;
6461 VFromD<
decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6471#undef HWY_IF_LOAD_INT
6476#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6477#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6478#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6481#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6482#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6485#define HWY_IF_STORE_INT(T, N) \
6486 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6487#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6488 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6489 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6490 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6491 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6492 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6495#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6496 Tuple2<type##_t, size> tup, type##_t *to
6498#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6500#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6501 Tuple3<type##_t, size> tup, type##_t *to
6503#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6505#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6506 Tuple4<type##_t, size> tup, type##_t *to
6508#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6510#undef HWY_NEON_DEF_FUNCTION_STORE_INT
6511#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6512#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6513#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6516template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6525template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6529 alignas(16) T buf[2 * 8 /
sizeof(T)];
6530 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6532 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6537template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6540 const Half<
decltype(
d)> dh;
6548template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6557template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6559 const Vec128<T, N> v2, Simd<T, N, 0> ,
6561 alignas(16) T buf[3 * 8 /
sizeof(T)];
6562 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6564 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6569template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6571 const Vec128<T> v2, Full128<T>
d,
6573 const Half<
decltype(
d)> dh;
6583template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6593template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6595 const Vec128<T, N> v2,
const Vec128<T, N> v3,
6598 alignas(16) T buf[4 * 8 /
sizeof(T)];
6599 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6601 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6606template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6608 const Vec128<T> v2,
const Vec128<T> v3,
6610 const Half<
decltype(
d)> dh;
6618#undef HWY_IF_STORE_INT
6622template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6625 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6650template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6659template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6662 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6667template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6676template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6679 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6684template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6774#undef HWY_NEON_BUILD_ARG_1
6775#undef HWY_NEON_BUILD_ARG_2
6776#undef HWY_NEON_BUILD_ARG_3
6777#undef HWY_NEON_BUILD_PARAM_1
6778#undef HWY_NEON_BUILD_PARAM_2
6779#undef HWY_NEON_BUILD_PARAM_3
6780#undef HWY_NEON_BUILD_RET_1
6781#undef HWY_NEON_BUILD_RET_2
6782#undef HWY_NEON_BUILD_RET_3
6783#undef HWY_NEON_BUILD_TPL_1
6784#undef HWY_NEON_BUILD_TPL_2
6785#undef HWY_NEON_BUILD_TPL_3
6786#undef HWY_NEON_DEF_FUNCTION
6787#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6788#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6789#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6790#undef HWY_NEON_DEF_FUNCTION_FULL_UI
6791#undef HWY_NEON_DEF_FUNCTION_INT_16
6792#undef HWY_NEON_DEF_FUNCTION_INT_32
6793#undef HWY_NEON_DEF_FUNCTION_INT_8
6794#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6795#undef HWY_NEON_DEF_FUNCTION_INTS
6796#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6797#undef HWY_NEON_DEF_FUNCTION_TPL
6798#undef HWY_NEON_DEF_FUNCTION_UIF81632
6799#undef HWY_NEON_DEF_FUNCTION_UINT_16
6800#undef HWY_NEON_DEF_FUNCTION_UINT_32
6801#undef HWY_NEON_DEF_FUNCTION_UINT_8
6802#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6803#undef HWY_NEON_DEF_FUNCTION_UINTS
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:166
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:189
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:199
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:145
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:140
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6487
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:98
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:194
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)
Definition: arm_neon-inl.h:209
#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix)
Definition: arm_neon-inl.h:5213
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:128
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:172
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition: arm_neon-inl.h:2432
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:121
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition: arm_neon-inl.h:203
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:184
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:106
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:134
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6307
#define HWY_IF_FLOAT(T)
Definition: base.h:417
#define HWY_RESTRICT
Definition: base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:78
#define HWY_IF_LE64(T, N)
Definition: base.h:407
#define HWY_API
Definition: base.h:129
#define HWY_MIN(a, b)
Definition: base.h:134
#define HWY_IF_NOT_FLOAT(T)
Definition: base.h:418
#define HWY_INLINE
Definition: base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:79
#define HWY_DASSERT(condition)
Definition: base.h:238
#define HWY_ASSERT(condition)
Definition: base.h:192
#define HWY_IF_UNSIGNED(T)
Definition: base.h:414
Definition: arm_neon-inl.h:825
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:830
Mask128(const Mask128 &)=default
Mask128 & operator=(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:833
Raw raw
Definition: arm_neon-inl.h:835
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:827
Definition: arm_neon-inl.h:778
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:785
T PrivateT
Definition: arm_neon-inl.h:782
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:788
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:795
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:779
Raw raw
Definition: arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:801
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:810
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:807
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:792
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:804
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:798
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2413
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:718
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5447
HWY_INLINE Vec128< T, N > Abs(SignedTag, Vec128< T, N > a)
Definition: emu128-inl.h:633
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1570
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1748
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3661
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:535
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:815
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2612
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:5736
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3286
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition: arm_neon-inl.h:5354
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5902
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3023
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:6153
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1844
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3275
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition: arm_neon-inl.h:5421
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:888
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:663
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:4543
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:5063
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:889
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:5058
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:6162
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:852
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5609
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3418
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:861
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:671
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:670
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5750
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:5068
HWY_INLINE Vec128< T > ZeroExtendVector(hwy::NonFloatTag, Full128< T >, Vec64< T > lo)
Definition: x86_128-inl.h:4567
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5589
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4235
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1613
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:5364
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2604
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1406
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1899
d
Definition: rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2445
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5716
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:6349
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4131
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6584
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:1403
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition: arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition: arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4453
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6677
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:597
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2758
typename D::Twice Twice
Definition: ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2941
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2753
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1413
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4922
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4417
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition: arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:69
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition: arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition: arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:6234
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1761
HWY_API Vec128< T, 2 > ConcatEven(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4638
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1085
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition: arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4256
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:6387
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2260
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1076
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2765
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3425
typename D::Half Half
Definition: ops/shared-inl.h:227
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3327
N
Definition: rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5683
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:608
HWY_API Vec128< T, 2 > ConcatOdd(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4607
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3885
const vfloat64m1_t v
Definition: rvv-inl.h:1998
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3713
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6549
typename D::T TFromD
Definition: ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4977
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1861
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition: base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:806
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:924
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:607
constexpr float MantissaEnd< float >()
Definition: base.h:758
double float64_t
Definition: base.h:303
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:383
float float32_t
Definition: base.h:302
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:865
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:595
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:5729
Definition: arm_neon-inl.h:3968
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3969
Definition: ops/shared-inl.h:52
uint16x4_t type
Definition: arm_neon-inl.h:706
uint16x8_t type
Definition: arm_neon-inl.h:643
uint16x4_t type
Definition: arm_neon-inl.h:701
uint16x8_t type
Definition: arm_neon-inl.h:638
float32x2_t type
Definition: arm_neon-inl.h:711
float32x4_t type
Definition: arm_neon-inl.h:648
int16x4_t type
Definition: arm_neon-inl.h:686
int16x8_t type
Definition: arm_neon-inl.h:623
int32x2_t type
Definition: arm_neon-inl.h:691
int32x4_t type
Definition: arm_neon-inl.h:628
int64x1_t type
Definition: arm_neon-inl.h:696
int64x2_t type
Definition: arm_neon-inl.h:633
int8x16_t type
Definition: arm_neon-inl.h:618
int8x8_t type
Definition: arm_neon-inl.h:681
uint16x4_t type
Definition: arm_neon-inl.h:666
uint16x8_t type
Definition: arm_neon-inl.h:603
uint32x2_t type
Definition: arm_neon-inl.h:671
uint32x4_t type
Definition: arm_neon-inl.h:608
uint64x1_t type
Definition: arm_neon-inl.h:676
uint64x2_t type
Definition: arm_neon-inl.h:613
uint8x16_t type
Definition: arm_neon-inl.h:598
uint8x8_t type
Definition: arm_neon-inl.h:661
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:61
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3639
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3646
Definition: arm_neon-inl.h:3617
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3627
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:3620
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3669
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3676
Definition: arm_neon-inl.h:3652
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3654
uint16x8x2_t raw
Definition: arm_neon-inl.h:364
uint16x4x2_t raw
Definition: arm_neon-inl.h:368
uint16x8x2_t raw
Definition: arm_neon-inl.h:356
uint16x4x2_t raw
Definition: arm_neon-inl.h:360
float32x4x2_t raw
Definition: arm_neon-inl.h:373
float32x2x2_t raw
Definition: arm_neon-inl.h:377
int16x8x2_t raw
Definition: arm_neon-inl.h:315
int16x4x2_t raw
Definition: arm_neon-inl.h:319
int32x4x2_t raw
Definition: arm_neon-inl.h:331
int32x2x2_t raw
Definition: arm_neon-inl.h:335
int64x2x2_t raw
Definition: arm_neon-inl.h:347
int64x1x2_t raw
Definition: arm_neon-inl.h:351
int8x16x2_t raw
Definition: arm_neon-inl.h:299
int8x8x2_t raw
Definition: arm_neon-inl.h:303
uint16x8x2_t raw
Definition: arm_neon-inl.h:307
uint16x4x2_t raw
Definition: arm_neon-inl.h:311
uint32x4x2_t raw
Definition: arm_neon-inl.h:323
uint32x2x2_t raw
Definition: arm_neon-inl.h:327
uint64x2x2_t raw
Definition: arm_neon-inl.h:339
uint64x1x2_t raw
Definition: arm_neon-inl.h:343
uint8x16x2_t raw
Definition: arm_neon-inl.h:291
uint8x8x2_t raw
Definition: arm_neon-inl.h:295
Definition: arm_neon-inl.h:283
uint16x8x3_t raw
Definition: arm_neon-inl.h:465
uint16x4x3_t raw
Definition: arm_neon-inl.h:469
uint16x8x3_t raw
Definition: arm_neon-inl.h:457
uint16x4x3_t raw
Definition: arm_neon-inl.h:461
float32x4x3_t raw
Definition: arm_neon-inl.h:474
float32x2x3_t raw
Definition: arm_neon-inl.h:478
int16x8x3_t raw
Definition: arm_neon-inl.h:416
int16x4x3_t raw
Definition: arm_neon-inl.h:420
int32x4x3_t raw
Definition: arm_neon-inl.h:432
int32x2x3_t raw
Definition: arm_neon-inl.h:436
int64x2x3_t raw
Definition: arm_neon-inl.h:448
int64x1x3_t raw
Definition: arm_neon-inl.h:452
int8x16x3_t raw
Definition: arm_neon-inl.h:400
int8x8x3_t raw
Definition: arm_neon-inl.h:404
uint16x8x3_t raw
Definition: arm_neon-inl.h:408
uint16x4x3_t raw
Definition: arm_neon-inl.h:412
uint32x4x3_t raw
Definition: arm_neon-inl.h:424
uint32x2x3_t raw
Definition: arm_neon-inl.h:428
uint64x2x3_t raw
Definition: arm_neon-inl.h:440
uint64x1x3_t raw
Definition: arm_neon-inl.h:444
uint8x16x3_t raw
Definition: arm_neon-inl.h:392
uint8x8x3_t raw
Definition: arm_neon-inl.h:396
Definition: arm_neon-inl.h:285
uint16x8x4_t raw
Definition: arm_neon-inl.h:566
uint16x4x4_t raw
Definition: arm_neon-inl.h:570
uint16x8x4_t raw
Definition: arm_neon-inl.h:558
uint16x4x4_t raw
Definition: arm_neon-inl.h:562
float32x4x4_t raw
Definition: arm_neon-inl.h:575
float32x2x4_t raw
Definition: arm_neon-inl.h:579
int16x8x4_t raw
Definition: arm_neon-inl.h:517
int16x4x4_t raw
Definition: arm_neon-inl.h:521
int32x4x4_t raw
Definition: arm_neon-inl.h:533
int32x2x4_t raw
Definition: arm_neon-inl.h:537
int64x2x4_t raw
Definition: arm_neon-inl.h:549
int64x1x4_t raw
Definition: arm_neon-inl.h:553
int8x16x4_t raw
Definition: arm_neon-inl.h:501
int8x8x4_t raw
Definition: arm_neon-inl.h:505
uint16x8x4_t raw
Definition: arm_neon-inl.h:509
uint16x4x4_t raw
Definition: arm_neon-inl.h:513
uint32x4x4_t raw
Definition: arm_neon-inl.h:525
uint32x2x4_t raw
Definition: arm_neon-inl.h:529
uint64x2x4_t raw
Definition: arm_neon-inl.h:541
uint64x1x4_t raw
Definition: arm_neon-inl.h:545
uint8x16x4_t raw
Definition: arm_neon-inl.h:493
uint8x8x4_t raw
Definition: arm_neon-inl.h:497
Definition: arm_neon-inl.h:287