45 return *
this = (*
this * other);
48 return *
this = (*
this / other);
51 return *
this = (*
this + other);
54 return *
this = (*
this - other);
57 return *
this = (*
this & other);
60 return *
this = (*
this | other);
63 return *
this = (*
this ^ other);
88using TFromV =
typename V::PrivateT;
92template <
typename T,
typename FromT>
94 static_assert(
sizeof(T) <=
sizeof(FromT),
"Promoting is undefined");
96 CopyBytes<sizeof(FromT)>(&
v.raw, &to);
107template <
typename T,
typename T2>
109 return Vec1<T>(
static_cast<T
>(t));
117template <
typename T,
typename T2>
119 return Vec1<T>(
static_cast<T
>(first));
189 return Xor(x1,
Xor(x2, x3));
196 return Or(o1,
Or(o2, o3));
203 return Or(o,
And(a1, a2));
217 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
224 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
238#ifdef HWY_NATIVE_POPCNT
239#undef HWY_NATIVE_POPCNT
241#define HWY_NATIVE_POPCNT
251template <
typename TFrom,
typename TTo>
253 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
288 return mask.
bits ? yes : no;
303 return v.raw < 0 ? yes : no;
352template <
int kBits,
typename T>
354 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
359template <
int kBits,
typename T>
361 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
362#if __cplusplus >= 202002L
365 return Vec1<T>(
static_cast<T
>(
v.raw >> kBits));
372 const TU shifted =
static_cast<TU
>(
BitCast(du,
v).raw >> kBits);
374 const size_t sign_shift =
375 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - kBits);
376 const TU upper =
static_cast<TU
>(sign << sign_shift);
379 return Vec1<T>(
static_cast<T
>(
v.raw >> kBits));
391 template <
typename T>
393 return Or(ShiftRight<kBits>(
v),
ShiftLeft<
sizeof(T) * 8 - kBits>(
v));
399 template <
typename T>
407template <
int kBits,
typename T>
409 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
423#if __cplusplus >= 202002L
426 return Vec1<T>(
static_cast<T
>(
v.raw >> bits));
433 const TU shifted =
static_cast<TU
>(
BitCast(du,
v).raw >> bits);
435 const size_t sign_shift =
436 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - bits);
437 const TU upper =
static_cast<TU
>(sign << sign_shift);
440 return Vec1<T>(
static_cast<T
>(
v.raw >> bits));
462 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw);
463 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw);
464 return Vec1<T>(
static_cast<T
>((a64 + b64) &
static_cast<uint64_t
>(~T(0))));
475 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw);
476 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw);
477 return Vec1<T>(
static_cast<T
>((a64 - b64) &
static_cast<uint64_t
>(~T(0))));
564 return (i >= 0 || i == hwy::LimitsMin<T>()) ? a :
Vec1<T>(
static_cast<T
>(-i));
568 CopyBytes<sizeof(i)>(&a.
raw, &i);
570 CopyBytes<sizeof(i)>(&i, &a.
raw);
575 CopyBytes<sizeof(i)>(&a.
raw, &i);
576 i &= 0x7FFFFFFFFFFFFFFFL;
577 CopyBytes<sizeof(i)>(&i, &a.
raw);
586static inline float Abs(
float f) {
588 CopyBytes<4>(&f, &i);
590 CopyBytes<4>(&i, &f);
593static inline double Abs(
double f) {
595 CopyBytes<8>(&f, &i);
596 i &= 0x7FFFFFFFFFFFFFFFull;
597 CopyBytes<8>(&i, &f);
603 CopyBytes<4>(&f, &i);
604 return (i >> 31) != 0;
608 CopyBytes<8>(&f, &i);
609 return (i >> 63) != 0;
614template <
typename T, HWY_IF_NOT_FLOAT(T)>
619template <
typename T, HWY_IF_FLOAT(T)>
620HWY_API Vec1<T>
Min(
const Vec1<T> a,
const Vec1<T> b) {
621 if (isnan(a.raw))
return b;
622 if (isnan(b.raw))
return a;
623 return Vec1<T>(
HWY_MIN(a.raw, b.raw));
626template <
typename T, HWY_IF_NOT_FLOAT(T)>
631template <
typename T, HWY_IF_FLOAT(T)>
632HWY_API Vec1<T>
Max(
const Vec1<T> a,
const Vec1<T> b) {
633 if (isnan(a.raw))
return b;
634 if (isnan(b.raw))
return a;
635 return Vec1<T>(
HWY_MAX(a.raw, b.raw));
640template <
typename T, HWY_IF_FLOAT(T)>
645template <
typename T, HWY_IF_NOT_FLOAT(T)>
647 return Zero(Sisd<T>()) -
v;
652template <
typename T, HWY_IF_FLOAT(T)>
657template <
typename T, HWY_IF_SIGNED(T)>
659 return Vec1<T>(
static_cast<T
>(
static_cast<uint64_t
>(a.raw) *
660 static_cast<uint64_t
>(b.raw)));
663template <
typename T, HWY_IF_UNSIGNED(T)>
665 return Vec1<T>(
static_cast<T
>(
static_cast<uint64_t
>(a.raw) *
666 static_cast<uint64_t
>(b.raw)));
683 (
static_cast<uint32_t
>(a.
raw) *
static_cast<uint32_t
>(b.
raw)) >> 16));
692 const int64_t a64 = a.
raw;
696 const uint64_t a64 = a.
raw;
718 return mul * x + add;
724 return add - mul * x;
729 return mul * x - sub;
735 return Neg(mul) * x - sub;
743 const float half = f * 0.5f;
747 bits = 0x5F3759DF - (bits >> 1);
755#if HWY_COMPILER_GCC && defined(HWY_NO_LIBCXX)
762#if HWY_COMPILER_GCC && defined(HWY_NO_LIBCXX)
774 if (!(
Abs(
v).raw < MantissaEnd<T>())) {
777 const T bias =
v.raw < T(0.0) ? T(-0.5) : T(0.5);
778 const TI rounded =
static_cast<TI
>(
v.raw + bias);
781 if ((rounded & 1) &&
detail::Abs(
static_cast<T
>(rounded) -
v.raw) == T(0.5)) {
782 return Vec1<T>(
static_cast<T
>(rounded - (
v.raw < T(0) ? -1 : 1)));
784 return Vec1<T>(
static_cast<T
>(rounded));
792 const T abs =
Abs(
v).raw;
795 if (!(abs < MantissaEnd<T>())) {
797 if (!(abs <=
static_cast<T
>(LimitsMax<TI>()))) {
798 return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
802 const T bias =
v.raw < T(0.0) ? T(-0.5) : T(0.5);
803 const TI rounded =
static_cast<TI
>(
v.raw + bias);
806 if ((rounded & 1) &&
detail::Abs(
static_cast<T
>(rounded) -
v.raw) == T(0.5)) {
807 return Vec1<TI>(rounded - (is_sign ? -1 : 1));
815 if (!(
Abs(
v).raw <= MantissaEnd<T>())) {
818 const TI truncated =
static_cast<TI
>(
v.raw);
820 return Vec1<T>(
static_cast<T
>(truncated));
823template <
typename Float,
typename Bits,
int kMantissaBits,
int kExponentBits,
826 const Bits kExponentMask = (1ull << kExponentBits) - 1;
827 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
828 const Bits kBias = kExponentMask / 2;
831 const bool positive = f > Float(0.0);
837 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
839 if (exponent >= kMantissaBits)
return v;
841 if (exponent < 0)
return positive ? V(1) : V(-0.0);
843 const Bits mantissa_mask = kMantissaMask >> exponent;
845 if ((bits & mantissa_mask) == 0)
return v;
848 if (positive) bits += (kMantissaMask + 1) >> exponent;
849 bits &= ~mantissa_mask;
855template <
typename Float,
typename Bits,
int kMantissaBits,
int kExponentBits,
858 const Bits kExponentMask = (1ull << kExponentBits) - 1;
859 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
860 const Bits kBias = kExponentMask / 2;
863 const bool negative = f < Float(0.0);
869 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
871 if (exponent >= kMantissaBits)
return v;
873 if (exponent < 0)
return V(negative ? Float(-1.0) : Float(0.0));
875 const Bits mantissa_mask = kMantissaMask >> exponent;
877 if ((bits & mantissa_mask) == 0)
return v;
880 if (negative) bits += (kMantissaMask + 1) >> exponent;
881 bits &= ~mantissa_mask;
889 return Ceiling<float, uint32_t, 23, 8>(
v);
892 return Ceiling<double, uint64_t, 52, 11>(
v);
897 return Floor<float, uint32_t, 23, 8>(
v);
900 return Floor<double, uint64_t, 52, 11>(
v);
917 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
918 return (
v & bit) == bit;
964 return RebindMask(
d, (vu + vu) ==
Set(du, 0xFFE0000000000000ull));
1001template <
typename T>
1003 return Load(
d, aligned);
1008template <
typename T>
1014template <
typename T>
1019template <
typename T>
1022 if (!m.
bits)
return;
1029#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1030#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1032#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1035template <
typename T>
1038 v0 =
LoadU(
d, unaligned + 0);
1039 v1 =
LoadU(
d, unaligned + 1);
1042template <
typename T>
1045 v0 =
LoadU(
d, unaligned + 0);
1046 v1 =
LoadU(
d, unaligned + 1);
1047 v2 =
LoadU(
d, unaligned + 2);
1050template <
typename T>
1054 v0 =
LoadU(
d, unaligned + 0);
1055 v1 =
LoadU(
d, unaligned + 1);
1056 v2 =
LoadU(
d, unaligned + 2);
1057 v3 =
LoadU(
d, unaligned + 3);
1062template <
typename T>
1069template <
typename T>
1078template <
typename T>
1090template <
typename T>
1097template <
typename T,
typename Offset>
1100 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1101 uint8_t*
const base8 =
reinterpret_cast<uint8_t*
>(base) + offset.
raw;
1102 return Store(
v,
d,
reinterpret_cast<T*
>(base8));
1105template <
typename T,
typename Index>
1108 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1114template <
typename T,
typename Offset>
1117 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1118 const intptr_t addr =
1119 reinterpret_cast<intptr_t
>(base) +
static_cast<intptr_t
>(offset.
raw);
1120 return Load(
d,
reinterpret_cast<const T*
>(addr));
1123template <
typename T,
typename Index>
1126 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1135template <
typename FromT,
typename ToT>
1137 static_assert(
sizeof(ToT) >
sizeof(FromT),
"Not promoting");
1146 if (
IsInf(from).bits ||
1155 if (
IsInf(from).bits ||
1156 Abs(from).raw >
static_cast<double>(HighestValue<int32_t>())) {
1158 : HighestValue<int32_t>());
1163template <
typename FromT,
typename ToT>
1165 static_assert(!IsFloat<FromT>(),
"FromT=double are handled above");
1166 static_assert(
sizeof(ToT) <
sizeof(FromT),
"Not demoting");
1176 const uint32_t sign =
static_cast<uint32_t
>(bits16 >> 15);
1177 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1178 const uint32_t mantissa = bits16 & 0x3FF;
1181 if (biased_exp == 0) {
1182 const float subnormal =
1183 (1.0f / 16384) * (
static_cast<float>(mantissa) * (1.0f / 1024));
1184 return Vec1<float>(sign ? -subnormal : subnormal);
1188 const uint32_t biased_exp32 = biased_exp + (127 - 15);
1189 const uint32_t mantissa32 = mantissa << (23 - 10);
1190 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1204 const uint32_t sign = bits32 >> 31;
1205 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1206 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1208 const int32_t exp =
HWY_MIN(
static_cast<int32_t
>(biased_exp32) - 127, 15);
1213 const uint16_t zero = 0;
1218 uint32_t biased_exp16, mantissa16;
1223 const uint32_t sub_exp =
static_cast<uint32_t
>(-14 - exp);
1225 mantissa16 =
static_cast<uint32_t
>((1u << (10 - sub_exp)) +
1226 (mantissa32 >> (13 + sub_exp)));
1229 biased_exp16 =
static_cast<uint32_t
>(exp + 15);
1230 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1231 mantissa16 = mantissa32 >> 13;
1235 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1237 const uint16_t narrowed =
static_cast<uint16_t
>(bits16);
1246template <
typename FromT,
typename ToT, HWY_IF_FLOAT(FromT)>
1248 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1251 const double f =
static_cast<double>(from.
raw);
1252 if (
IsInf(from).bits ||
1255 : LimitsMax<ToT>());
1260template <
typename FromT,
typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1262 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1264 return Vec1<ToT>(
static_cast<ToT
>(from.raw));
1306template <
typename T>
1311template <
typename T>
1318template <
typename T>
1323template <
typename T>
1330template <
typename T>
1338template <
typename T>
1344template <
typename T>
1349template <
typename T>
1356template <
typename T>
1364template <
typename T>
1369template <
typename T,
typename TI>
1371 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane size");
1376template <
typename T,
typename TI>
1381template <
typename T>
1389template <
typename T>
1396template <
typename T>
1402template <
typename T>
1407template <
typename T>
1412template <
typename T>
1422template <
int kLane,
typename T>
1424 static_assert(kLane == 0,
"Scalar only has one lane");
1430template <
typename T,
typename TI>
1432 uint8_t in_bytes[
sizeof(T)];
1433 uint8_t idx_bytes[
sizeof(T)];
1434 uint8_t out_bytes[
sizeof(T)];
1435 CopyBytes<sizeof(T)>(&in, &in_bytes);
1436 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1437 for (
size_t i = 0; i <
sizeof(T); ++i) {
1438 out_bytes[i] = in_bytes[idx_bytes[i]];
1441 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1445template <
typename T,
typename TI>
1447 uint8_t in_bytes[
sizeof(T)];
1448 uint8_t idx_bytes[
sizeof(T)];
1449 uint8_t out_bytes[
sizeof(T)];
1450 CopyBytes<sizeof(T)>(&in, &in_bytes);
1451 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1452 for (
size_t i = 0; i <
sizeof(T); ++i) {
1453 out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1456 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1483template <
typename T,
typename TW = MakeW
ide<T>,
class VW = Vec1<TW>>
1485 return VW(
static_cast<TW
>((TW{b.
raw} << (
sizeof(T) * 8)) + a.
raw));
1490template <
typename T>
1492 return mask.
bits == 0;
1495template <
typename T>
1497 return mask.
bits != 0;
1501template <
typename T>
1508template <
typename T>
1514template <
typename T>
1516 return mask.
bits == 0 ? 0 : 1;
1519template <
typename T>
1521 return mask.
bits == 0 ? -1 : 0;
1524template <
typename T>
1531template <
typename T>
1532struct CompressIsPartition {
1536template <
typename T>
1542template <
typename T>
1549template <
typename T>
1557template <
typename T>
1560 if (!mask.
bits)
return 0;
1566template <
typename T>
1572template <
typename T>
1600template <
typename TW>
1609template <
typename T>
1613template <
typename T>
1617template <
typename T>
#define HWY_MAX(a, b)
Definition: base.h:135
#define HWY_RESTRICT
Definition: base.h:64
#define HWY_API
Definition: base.h:129
#define HWY_MIN(a, b)
Definition: base.h:134
#define HWY_INLINE
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:238
Definition: scalar-inl.h:71
Raw bits
Definition: scalar-inl.h:81
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:72
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:75
HWY_INLINE Vec128< T, N > Abs(SignedTag, Vec128< T, N > a)
Definition: emu128-inl.h:633
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:889
static bool SignBit(float f)
Definition: scalar-inl.h:601
d
Definition: rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2230
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4272
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5716
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:6349
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6584
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1684
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3436
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5691
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition: arm_neon-inl.h:2025
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition: arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2517
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2217
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5037
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2060
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4872
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2758
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1163
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2477
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2753
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1413
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2772
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4704
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4061
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:322
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition: arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition: arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2277
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2266
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5710
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1040
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:386
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition: arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1635
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:5020
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:6387
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2260
V Ceiling(const V v)
Definition: scalar-inl.h:825
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1986
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1720
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5338
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6248
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3327
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:580
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1225
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3885
const vfloat64m1_t v
Definition: rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1773
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6549
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1861
Definition: aligned_allocator.h:27
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:975
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:983
constexpr float HighestValue< float >()
Definition: base.h:688
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition: base.h:961
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:865
constexpr float LowestValue< float >()
Definition: base.h:675
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:595
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5730
Definition: scalar-inl.h:1365
MakeSigned< T > raw
Definition: scalar-inl.h:1366
Definition: ops/shared-inl.h:52
Definition: scalar-inl.h:35
T raw
Definition: scalar-inl.h:66
static constexpr size_t kPrivateN
Definition: scalar-inl.h:37
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:44
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:62
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:42
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:56
T PrivateT
Definition: scalar-inl.h:36
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:53
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:50
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:59
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:47
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:400
Definition: emu128-inl.h:422
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:392