rvv-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// Enables the overload if Pow2 is in [min, max].
#define HWY_RVV_IF_POW2_IN(D, min, max) \
  hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr

template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}

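// Illustrative check (a direct consequence of the formula above): for
// uint16_t at LMUL=1 (kPow2=0), the divisor is 16*8/8 = 16, i.e. masks are
// vbool16_t, and halving LMUL (kPow2=-1) doubles MLEN to 32:
//   static_assert(MLenFromD(ScalableTag<uint16_t, 0>()) == 16, "");
//   static_assert(MLenFromD(ScalableTag<uint16_t, -1>()) == 32, "");
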
// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP) \
  X_MACRO(32, 0, 32, NAME, OP) \
  X_MACRO(16, 0, 16, NAME, OP) \
  X_MACRO(8, 0, 8, NAME, OP) \
  X_MACRO(8, 1, 4, NAME, OP) \
  X_MACRO(8, 2, 2, NAME, OP) \
  X_MACRO(8, 3, 1, NAME, OP)

// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP

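// Worked example of one table row below (nothing new, just reading off the
// args): the SEW=16, LMUL=m2 row passes BASE=uint, CHAR=u, SEW=16, SEWD=32,
// SEWH=8, LMUL=m2, LMULD=m4, LMULH=m1, SHIFT=1, MLEN=8, so HWY_RVV_V yields
// vuint16m2_t and the corresponding mask type is vbool8_t.
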
// LMULS = _TRUNC: truncatable (not the smallest LMUL)
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _LE2: <= 2
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)

// LMULS = _EXT: not the largest LMUL
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)

// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 one
// less than the smallest supported value, with all other parameters (vector
// type etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.

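// Sketch of the effect (assumes Half<> from shared-inl.h): for u64, kPow2=-1
// is a virtual LMUL of 1/2 that still uses vuint64m1_t registers:
//   const ScalableTag<uint64_t, 0> d;  // LMUL=1
//   const Half<decltype(d)> dh;        // kPow2=-1: virtual LMUL
//   // Lanes(dh) == Lanes(d) / 2, yet vectors of dh are also vuint64m1_t.
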
#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)

// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)

// SEW for float:
#if HWY_HAVE_FLOAT16
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)

// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

}  // namespace detail

391
392// Until we have full intrinsic support for fractional LMUL, mixed-precision
393// code can use LMUL 1..8 (adequate unless they need many registers).
394#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
395 MLEN, NAME, OP) \
396 template <> \
397 struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
398 using Lane = HWY_RVV_T(BASE, SEW); \
399 using type = ScalableTag<Lane, SHIFT>; \
400 };
401
403#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    size_t actual = v##OP##SEW##LMUL(); \
    /* Common case of full vectors: avoid any extra instructions. */ \
    /* actual includes LMUL, so do not shift again. */ \
    if (detail::IsFull(d)) return actual; \
    /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */ \
    /* intrinsics). In this case the actual LMUL is 1/4, so divide by */ \
    /* another factor of two. */ \
    if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1; \
    return HWY_MIN(actual, N); \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
#undef HWY_RVV_LANES

template <size_t N, int kPow2>
HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag */) {
  return Lanes(Simd<uint16_t, N, kPow2>());
}

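// Example usage (illustrative): Lanes is a runtime value, so loops are
// written in terms of it rather than a compile-time constant:
//   const ScalableTag<float> d;   // f32, LMUL=1
//   const size_t n = Lanes(d);    // e.g. 4 when VLEN=128
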
// ------------------------------ Common x-macros

// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

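// For example, once instantiated for u32/m1 with NAME=Add, OP=add (see the
// ARITHMETIC section below), HWY_RVV_RETV_ARGVV expands to roughly:
//   HWY_API vuint32m1_t Add(vuint32m1_t a, vuint32m1_t b) {
//     return vadd_vv_u32m1(a, b, HWY_RVV_AVL(32, 0));
//   }
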
// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
    return vm##OP##_m_b##MLEN(m, ~0ull); \
  }

// ================================================== INIT

// ------------------------------ Set

#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
    return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET

// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <size_t N, int kPow2>
decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
                                                 bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

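// Example (illustrative): VFromD names the vector type belonging to a tag.
//   const ScalableTag<int32_t> d;
//   VFromD<decltype(d)> v = Set(d, 7);  // vint32m1_t, all lanes = 7
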
// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
    return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC

// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
           HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT

// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
           HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v; \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT

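// Sketch of the two helpers above (assuming u32): Trunc keeps the lower half
// of a register group, Ext re-widens it (the upper part is unspecified):
//   vuint32m2_t v2 = /* ... */;              // LMUL=2
//   vuint32m1_t lo = detail::Trunc(v2);      // lower m1 half
//   const ScalableTag<uint32_t, 1> d2;       // kPow2=1 => LMUL=2
//   vuint32m2_t wide = detail::Ext(d2, lo);  // upper half undefined
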
// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vuint8##LMUL##_t v) { \
    return v; \
  } \
  template <size_t N> \
  HWY_API vuint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v; \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vint8##LMUL##_t v) { \
    return vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API vint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
  }

// Additional versions for virtual LMUL using LMULH for byte vectors.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF

template <size_t N, int kPow2>
HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
}

namespace detail {

template <class V, class DU = RebindToUnsigned<DFromV<V>>>
HWY_INLINE VFromD<DU> BitCastToUnsigned(V v) {
  return BitCast(DU(), v);
}

}  // namespace detail

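// Example (illustrative): reinterpret f32 lanes as their u32 bit patterns.
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;  // u32, same LMUL
//   auto bits = BitCast(du, Set(df, 1.0f));   // lanes hold 0x3F800000
//   auto f = BitCast(df, bits);               // round-trips to 1.0f
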
// ------------------------------ Iota

namespace detail {

#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA

template <class D, class DU = RebindToUnsigned<D>>
HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
  return BitCastToUnsigned(Iota0(DU()));
}

}  // namespace detail

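// Example (illustrative): detail::Iota0 returns {0, 1, 2, ...} in unsigned
// lanes; callers add offsets or cast as needed (AddS is defined below):
//   const ScalableTag<uint32_t> du;
//   auto ramp = detail::Iota0(du);        // 0, 1, 2, ...
//   auto from3 = detail::AddS(ramp, 3u);  // 3, 4, 5, ...
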
// ================================================== LOGICAL

// ------------------------------ Not

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}

// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Or

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ AndNot
template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}

// ------------------------------ Xor3
template <class V>
HWY_API V Xor3(V x1, V x2, V x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ CopySign

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)

// ------------------------------ Sub
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)

// ------------------------------ SaturatedAdd

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)

// ------------------------------ SaturatedSub

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

// ------------------------------ AverageRound

// TODO(janwas): check vxrm rounding mode
HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)

// ------------------------------ ShiftLeft[Same]

// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT)); \
  } \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)

// ------------------------------ ShiftRight[Same]

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

#undef HWY_RVV_SHIFT

// ------------------------------ SumsOf8 (ShiftRight, Add)
template <class VU8>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}

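// Scalar equivalent of SumsOf8 for one group of eight bytes (a sketch to
// clarify the shifted-add network above, not part of the API):
//   uint64_t SumsOf8Scalar(const uint8_t b[8]) {
//     uint64_t sum = 0;
//     for (int i = 0; i < 8; ++i) sum += b[i];  // one u64 sum per 8 lanes
//     return sum;
//   }
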
// ------------------------------ RotateRight
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}

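// Example (illustrative): for u32 lanes, RotateRight<8> maps 0x12345678 to
// 0x78123456:
//   const ScalableTag<uint32_t> d;
//   auto r = RotateRight<8>(Set(d, 0x12345678u));
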
// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)

#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)

// ------------------------------ Shr

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)

#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV

// ------------------------------ Min

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)

// ------------------------------ Max

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)

// ------------------------------ Mul

HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)

// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif

// ------------------------------ MulHigh

// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
// Used by MulEven; vwmul does not work for m8.
namespace detail {
HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)

// ------------------------------ MulFixedPoint15
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)

// ------------------------------ Div
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)

// ------------------------------ ApproximateReciprocal
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)

// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)

// ------------------------------ ApproximateReciprocalSqrt
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)

// ------------------------------ MulAdd
// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
           HWY_RVV_V(BASE, SEW, LMUL) add) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)

// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)

// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)

// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

#undef HWY_RVV_FMA

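// Example (illustrative): MulAdd computes mul * x + add per lane and maps to
// a single vfmacc instruction:
//   const ScalableTag<float> d;
//   auto y = MulAdd(Set(d, 2.0f), Set(d, 3.0f), Set(d, 1.0f));  // 7.0f
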
// ================================================== COMPARE

// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW 8 / LMUL 1 = 1/8th
// of all bits; SEW 8 / LMUL 4 = half of all bits.

// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b, \
                                                  HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
}  // namespace detail

// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
}  // namespace detail

// ------------------------------ Lt
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)

namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
}  // namespace detail

// ------------------------------ Le
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)

#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS

// ------------------------------ Gt/Ge

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}

template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}

// ------------------------------ Not
// NOLINTNEXTLINE
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not)

// ------------------------------ And

// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
    return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)

// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andnot)

// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)

// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)

// ------------------------------ ExclusiveNeither
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)

#undef HWY_RVV_RETM_ARGMM

// ------------------------------ IfThenElse
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
           HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)

#undef HWY_RVV_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse

#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_##CHAR##SEW##LMUL(no, 0, m, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)

#undef HWY_RVV_IF_THEN_ZERO_ELSE

// ------------------------------ MaskFromVec

template <class V>
HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
  return detail::NeS(v, 0);
}

template <class D>
using MFromD = decltype(MaskFromVec(Zero(D())));

template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}

// ------------------------------ VecFromMask

namespace detail {
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) { \
    return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1, \
                                         HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL)
#undef HWY_RVV_VEC_FROM_MASK
}  // namespace detail

template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return detail::SubS(Zero(d), mask);
}

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ IfVecThenElse (MaskFromVec)

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ ZeroIfNegative
template <class V>
HWY_API V ZeroIfNegative(const V v) {
  return IfThenZeroElse(detail::LtS(v, 0), v);
}

// ------------------------------ BroadcastSignBit
template <class V>
HWY_API V BroadcastSignBit(const V v) {
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
template <class V>
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;

  MFromD<decltype(d)> m =
      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(m, yes, no);
}

// ------------------------------ FindFirstTrue

#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vfirst_m_b##MLEN(m, Lanes(d)); \
  } \
  template <class D> \
  HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return static_cast<size_t>(vfirst_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
#undef HWY_RVV_FIND_FIRST_TRUE

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return FindFirstTrue(d, m) < 0;
}

// ------------------------------ AllTrue

#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE

// ------------------------------ CountTrue

#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vcpop_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE

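// Example (illustrative) combining the mask queries above:
//   const ScalableTag<int32_t> d;
//   auto m = Eq(Zero(d), Zero(d));          // all-true mask
//   bool all = AllTrue(d, m);               // true
//   size_t num = CountTrue(d, m);           // == Lanes(d)
//   intptr_t first = FindFirstTrue(d, m);   // 0; -1 when no lane is true
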
// ================================================== MEMORY

// ------------------------------ Load

#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
#undef HWY_RVV_LOAD

// There is no native BF16, treat as uint16_t.
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
                                              const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ LoadU

// RVV only requires lane alignment, not natural alignment of the entire vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
#undef HWY_RVV_MASKED_LOAD

// ------------------------------ Store

#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
#undef HWY_RVV_STORE

// ------------------------------ BlendedStore

#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
#undef HWY_RVV_BLENDED_STORE

namespace detail {

#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \
  }
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
#undef HWY_RVV_STOREN

}  // namespace detail

// ------------------------------ StoreU

// RVV only requires lane alignment, not natural alignment of the entire vector.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ Stream
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ ScatterOffset

#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
                    HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
#undef HWY_RVV_SCATTER

// ------------------------------ ScatterIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
}

// ------------------------------ GatherOffset

#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
           HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
#undef HWY_RVV_GATHER

// ------------------------------ GatherIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<3>(index));
}

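// Example (illustrative): GatherIndex loads base[index[i]] per lane; indices
// are element counts, converted to byte offsets via ShiftLeft<2> (32-bit) or
// ShiftLeft<3> (64-bit):
//   const ScalableTag<float> d;
//   const RebindToSigned<decltype(d)> di;
//   const float* base = /* ... */;
//   auto v = GatherIndex(d, base, Set(di, 3));  // all lanes = base[3]
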
// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
#undef HWY_RVV_LOAD2

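// Example (illustrative): de-interleaving two streams {r0,g0,r1,g1,...}:
//   const ScalableTag<uint8_t> d;
//   const uint8_t* ptr = /* ... */;
//   VFromD<decltype(d)> r, g;
//   LoadInterleaved2(d, ptr, r, g);  // r = {r0,r1,...}, g = {g0,g1,...}
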
// ------------------------------ LoadInterleaved3

#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v2) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
#undef HWY_RVV_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned, \
      HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \
      HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned, \
                                        Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
#undef HWY_RVV_LOAD4

// ------------------------------ StoreInterleaved2

#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) v1, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT)
#undef HWY_RVV_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT)
#undef HWY_RVV_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
#undef HWY_RVV_STORE4

1540// ================================================== CONVERT
1541
1542// ------------------------------ PromoteTo
1543
1544// SEW is for the input so we can use F16 (no-op if not supported).
1545#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1546 SHIFT, MLEN, NAME, OP) \
1547 template <size_t N> \
1548 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
1549 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1550 return OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
1551 }
1552
1553HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1554HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1555HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1556HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1557HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1558HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1559HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1560HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1561#undef HWY_RVV_PROMOTE
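// Usage sketch (illustration only; `src` is a hypothetical pointer to at
// least Lanes(d8) readable bytes): the destination tag has the same lane
// count but twice the lane width, which is what SHIFT + 1 / LMULD express.
//   const ScalableTag<uint8_t, -1> d8;         // u8, LMUL = 1/2
//   const Rebind<uint16_t, decltype(d8)> d16;  // u16, LMUL = 1, same lanes
//   const auto wide = PromoteTo(d16, Load(d8, src));  // zero-extends lanes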
1562
1563// The above X-macro cannot handle 4x promotion nor type switching.
1564// TODO(janwas): use BASE2 arg to allow the latter.
1565#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
1566 SHIFT, ADD) \
1567 template <size_t N> \
1568 HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
1569 PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
1570 HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
1571 return OP##CHAR##BITS##LMUL(v, Lanes(d)); \
1572 }
1573
1574#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1575 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
1576 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
1577 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
1578 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
1579 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
1580
1581#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1582 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
1583 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
1584 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
1585 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
1586 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
1587
1588HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
1589HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
1590
1591// i32 to f64
1592HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
1593
1594#undef HWY_RVV_PROMOTE_X4
1595#undef HWY_RVV_PROMOTE_X2
1596#undef HWY_RVV_PROMOTE
1597
1598// Unsigned to signed: cast for unsigned promote.
1599template <size_t N, int kPow2>
1600HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
1601 VFromD<Rebind<uint8_t, decltype(d)>> v)
1602 -> VFromD<decltype(d)> {
1603 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1604}
1605
1606template <size_t N, int kPow2>
1607HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1608 VFromD<Rebind<uint8_t, decltype(d)>> v)
1609 -> VFromD<decltype(d)> {
1610 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1611}
1612
1613template <size_t N, int kPow2>
1614HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1615 VFromD<Rebind<uint16_t, decltype(d)>> v)
1616 -> VFromD<decltype(d)> {
1617 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1618}
1619
1620template <size_t N, int kPow2>
1621HWY_API auto PromoteTo(Simd<float, N, kPow2> d,
1622 VFromD<Rebind<bfloat16_t, decltype(d)>> v)
1623 -> VFromD<decltype(d)> {
1624 const RebindToSigned<decltype(d)> di32;
1625 const Rebind<uint16_t, decltype(d)> du16;
1626 return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
1627}
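// Scalar sketch of the same widening (BF16BitsToF32 is a hypothetical
// helper, not part of Highway): bfloat16 is the upper half of binary32, so
// moving the 16 stored bits into the upper half reconstructs the value
// exactly.
//   float BF16BitsToF32(uint16_t bits) {
//     const uint32_t u = static_cast<uint32_t>(bits) << 16;
//     float f;
//     std::memcpy(&f, &u, sizeof(f));  // requires <cstring>
//     return f;
//   }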
1628
1629// ------------------------------ DemoteTo U
1630
1631// SEW is for the source so we can use _DEMOTE.
1632#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1633 MLEN, NAME, OP) \
1634 template <size_t N> \
1635 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1636 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1637 return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \
1638 } \
1639 template <size_t N> \
1640 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16( \
1641 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1642 return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
1643 }
1644
1645// Unsigned -> unsigned (also used for bf16)
1646namespace detail {
1647HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1648HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1649} // namespace detail
1650
1651// SEW is for the source so we can use _DEMOTE.
1652#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1653 SHIFT, MLEN, NAME, OP) \
1654 template <size_t N> \
1655 HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
1656 HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1657 /* First clamp negative numbers to zero to match x86 packus. */ \
1658 return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \
1659 }
1660HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1661HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1662#undef HWY_RVV_DEMOTE_I_TO_U
1663
1664template <size_t N>
1665HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
1666 return vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, Lanes(d));
1667}
1668template <size_t N>
1669HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
1670 return vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, Lanes(d));
1671}
1672template <size_t N>
1673HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
1674 return vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, Lanes(d));
1675}
1676template <size_t N>
1677HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
1678 return vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, Lanes(d));
1679}
1680template <size_t N>
1681HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
1682 return vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, Lanes(d));
1683}
1684
1685HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
1686 const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
1687 return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl);
1688}
1689HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
1690 const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
1691 return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl);
1692}
1693HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
1694 const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
1695 return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl);
1696}
1697HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
1698 const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
1699 return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
1700}
1701HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
1702 const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
1703 return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
1704}
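// vnclipu halves the lane width after right-shifting by the given amount
// (0 here) and saturates to the destination range; two applications take
// u32 to u8. Scalar sketch of one lane (U8FromU32Scalar is a hypothetical
// helper, not part of Highway):
//   uint8_t U8FromU32Scalar(uint32_t x) {
//     const uint16_t x16 = x > 0xFFFFu ? 0xFFFFu : static_cast<uint16_t>(x);
//     return x16 > 0xFFu ? 0xFFu : static_cast<uint8_t>(x16);
//   }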
1705
1706// ------------------------------ Truncations
1707
1708template <size_t N>
1709HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
1710 const VFromD<Simd<uint64_t, N, 0>> v) {
1711 const size_t avl = Lanes(d);
1712 const vuint64m1_t v1 = vand(v, 0xFF, avl);
1713 const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
1714 const vuint16mf4_t v3 = vnclipu_wx_u16mf4(v2, 0, avl);
1715 return vnclipu_wx_u8mf8(v3, 0, avl);
1716}
1717
1718template <size_t N>
1719HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
1720 const VFromD<Simd<uint64_t, N, 1>> v) {
1721 const size_t avl = Lanes(d);
1722 const vuint64m2_t v1 = vand(v, 0xFF, avl);
1723 const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
1724 const vuint16mf2_t v3 = vnclipu_wx_u16mf2(v2, 0, avl);
1725 return vnclipu_wx_u8mf4(v3, 0, avl);
1726}
1727
1728template <size_t N>
1729HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
1730 const VFromD<Simd<uint64_t, N, 2>> v) {
1731 const size_t avl = Lanes(d);
1732 const vuint64m4_t v1 = vand(v, 0xFF, avl);
1733 const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
1734 const vuint16m1_t v3 = vnclipu_wx_u16m1(v2, 0, avl);
1735 return vnclipu_wx_u8mf2(v3, 0, avl);
1736}
1737
1738template <size_t N>
1739HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
1740 const VFromD<Simd<uint64_t, N, 3>> v) {
1741 const size_t avl = Lanes(d);
1742 const vuint64m8_t v1 = vand(v, 0xFF, avl);
1743 const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
1744 const vuint16m2_t v3 = vnclipu_wx_u16m2(v2, 0, avl);
1745 return vnclipu_wx_u8m1(v3, 0, avl);
1746}
1747
1748template <size_t N>
1749HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
1750 const VFromD<Simd<uint64_t, N, 0>> v) {
1751 const size_t avl = Lanes(d);
1752 const vuint64m1_t v1 = vand(v, 0xFFFF, avl);
1753 const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
1754 return vnclipu_wx_u16mf4(v2, 0, avl);
1755}
1756
1757template <size_t N>
1758HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
1759 const VFromD<Simd<uint64_t, N, 1>> v) {
1760 const size_t avl = Lanes(d);
1761 const vuint64m2_t v1 = vand(v, 0xFFFF, avl);
1762 const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
1763 return vnclipu_wx_u16mf2(v2, 0, avl);
1764}
1765
1766template <size_t N>
1767HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
1768 const VFromD<Simd<uint64_t, N, 2>> v) {
1769 const size_t avl = Lanes(d);
1770 const vuint64m4_t v1 = vand(v, 0xFFFF, avl);
1771 const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
1772 return vnclipu_wx_u16m1(v2, 0, avl);
1773}
1774
1775template <size_t N>
1776HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
1777 const VFromD<Simd<uint64_t, N, 3>> v) {
1778 const size_t avl = Lanes(d);
1779 const vuint64m8_t v1 = vand(v, 0xFFFF, avl);
1780 const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
1781 return vnclipu_wx_u16m2(v2, 0, avl);
1782}
1783
1784template <size_t N>
1785HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
1786 const VFromD<Simd<uint64_t, N, 0>> v) {
1787 const size_t avl = Lanes(d);
1788 const vuint64m1_t v1 = vand(v, 0xFFFFFFFFu, avl);
1789 return vnclipu_wx_u32mf2(v1, 0, avl);
1790}
1791
1792template <size_t N>
1793HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
1794 const VFromD<Simd<uint64_t, N, 1>> v) {
1795 const size_t avl = Lanes(d);
1796 const vuint64m2_t v1 = vand(v, 0xFFFFFFFFu, avl);
1797 return vnclipu_wx_u32m1(v1, 0, avl);
1798}
1799
1800template <size_t N>
1801HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
1802 const VFromD<Simd<uint64_t, N, 2>> v) {
1803 const size_t avl = Lanes(d);
1804 const vuint64m4_t v1 = vand(v, 0xFFFFFFFFu, avl);
1805 return vnclipu_wx_u32m2(v1, 0, avl);
1806}
1807
1808template <size_t N>
1809HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
1810 const VFromD<Simd<uint64_t, N, 3>> v) {
1811 const size_t avl = Lanes(d);
1812 const vuint64m8_t v1 = vand(v, 0xFFFFFFFFu, avl);
1813 return vnclipu_wx_u32m4(v1, 0, avl);
1814}
1815
1816template <size_t N>
1817HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
1818 const VFromD<Simd<uint32_t, N, -1>> v) {
1819 const size_t avl = Lanes(d);
1820 const vuint32mf2_t v1 = vand(v, 0xFF, avl);
1821 const vuint16mf4_t v2 = vnclipu_wx_u16mf4(v1, 0, avl);
1822 return vnclipu_wx_u8mf8(v2, 0, avl);
1823}
1824
1825template <size_t N>
1826HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
1827 const VFromD<Simd<uint32_t, N, 0>> v) {
1828 const size_t avl = Lanes(d);
1829 const vuint32m1_t v1 = vand(v, 0xFF, avl);
1830 const vuint16mf2_t v2 = vnclipu_wx_u16mf2(v1, 0, avl);
1831 return vnclipu_wx_u8mf4(v2, 0, avl);
1832}
1833
1834template <size_t N>
1835HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
1836 const VFromD<Simd<uint32_t, N, 1>> v) {
1837 const size_t avl = Lanes(d);
1838 const vuint32m2_t v1 = vand(v, 0xFF, avl);
1839 const vuint16m1_t v2 = vnclipu_wx_u16m1(v1, 0, avl);
1840 return vnclipu_wx_u8mf2(v2, 0, avl);
1841}
1842
1843template <size_t N>
1844HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
1845 const VFromD<Simd<uint32_t, N, 2>> v) {
1846 const size_t avl = Lanes(d);
1847 const vuint32m4_t v1 = vand(v, 0xFF, avl);
1848 const vuint16m2_t v2 = vnclipu_wx_u16m2(v1, 0, avl);
1849 return vnclipu_wx_u8m1(v2, 0, avl);
1850}
1851
1852template <size_t N>
1853HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
1854 const VFromD<Simd<uint32_t, N, 3>> v) {
1855 const size_t avl = Lanes(d);
1856 const vuint32m8_t v1 = vand(v, 0xFF, avl);
1857 const vuint16m4_t v2 = vnclipu_wx_u16m4(v1, 0, avl);
1858 return vnclipu_wx_u8m2(v2, 0, avl);
1859}
1860
1861template <size_t N>
1862HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
1863 const VFromD<Simd<uint32_t, N, -1>> v) {
1864 const size_t avl = Lanes(d);
1865 const vuint32mf2_t v1 = vand(v, 0xFFFF, avl);
1866 return vnclipu_wx_u16mf4(v1, 0, avl);
1867}
1868
1869template <size_t N>
1870HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
1871 const VFromD<Simd<uint32_t, N, 0>> v) {
1872 const size_t avl = Lanes(d);
1873 const vuint32m1_t v1 = vand(v, 0xFFFF, avl);
1874 return vnclipu_wx_u16mf2(v1, 0, avl);
1875}
1876
1877template <size_t N>
1878HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
1879 const VFromD<Simd<uint32_t, N, 1>> v) {
1880 const size_t avl = Lanes(d);
1881 const vuint32m2_t v1 = vand(v, 0xFFFF, avl);
1882 return vnclipu_wx_u16m1(v1, 0, avl);
1883}
1884
1885template <size_t N>
1886HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
1887 const VFromD<Simd<uint32_t, N, 2>> v) {
1888 const size_t avl = Lanes(d);
1889 const vuint32m4_t v1 = vand(v, 0xFFFF, avl);
1890 return vnclipu_wx_u16m2(v1, 0, avl);
1891}
1892
1893template <size_t N>
1894HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d,
1895 const VFromD<Simd<uint32_t, N, 3>> v) {
1896 const size_t avl = Lanes(d);
1897 const vuint32m8_t v1 = vand(v, 0xFFFF, avl);
1898 return vnclipu_wx_u16m4(v1, 0, avl);
1899}
1900
1901template <size_t N>
1902HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
1903 const VFromD<Simd<uint16_t, N, -2>> v) {
1904 const size_t avl = Lanes(d);
1905 const vuint16mf4_t v1 = vand(v, 0xFF, avl);
1906 return vnclipu_wx_u8mf8(v1, 0, avl);
1907}
1908
1909template <size_t N>
1910HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
1911 const VFromD<Simd<uint16_t, N, -1>> v) {
1912 const size_t avl = Lanes(d);
1913 const vuint16mf2_t v1 = vand(v, 0xFF, avl);
1914 return vnclipu_wx_u8mf4(v1, 0, avl);
1915}
1916
1917template <size_t N>
1918HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
1919 const VFromD<Simd<uint16_t, N, 0>> v) {
1920 const size_t avl = Lanes(d);
1921 const vuint16m1_t v1 = vand(v, 0xFF, avl);
1922 return vnclipu_wx_u8mf2(v1, 0, avl);
1923}
1924
1925template <size_t N>
1926HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
1927 const VFromD<Simd<uint16_t, N, 1>> v) {
1928 const size_t avl = Lanes(d);
1929 const vuint16m2_t v1 = vand(v, 0xFF, avl);
1930 return vnclipu_wx_u8m1(v1, 0, avl);
1931}
1932
1933template <size_t N>
1934HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
1935 const VFromD<Simd<uint16_t, N, 2>> v) {
1936 const size_t avl = Lanes(d);
1937 const vuint16m4_t v1 = vand(v, 0xFF, avl);
1938 return vnclipu_wx_u8m2(v1, 0, avl);
1939}
1940
1941template <size_t N>
1942HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d,
1943 const VFromD<Simd<uint16_t, N, 3>> v) {
1944 const size_t avl = Lanes(d);
1945 const vuint16m8_t v1 = vand(v, 0xFF, avl);
1946 return vnclipu_wx_u8m4(v1, 0, avl);
1947}
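// Unlike DemoteTo, TruncateTo keeps only the low bits (reduction modulo 2^n)
// rather than saturating; the vand mask above ensures the saturating vnclipu
// never actually clamps. Scalar equivalent of one u64 -> u8 lane (Truncate8
// is a hypothetical helper, not part of Highway):
//   uint8_t Truncate8(uint64_t x) { return static_cast<uint8_t>(x & 0xFF); }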
1948
1949// ------------------------------ DemoteTo I
1950
1951HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1952HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1953
1954template <size_t N>
1955HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
1956 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
1957}
1958template <size_t N>
1959HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
1960 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
1961}
1962template <size_t N>
1963HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
1964 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
1965}
1966template <size_t N>
1967HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
1968 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
1969}
1970template <size_t N>
1971HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
1972 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
1973}
1974
1975#undef HWY_RVV_DEMOTE
1976
1977// ------------------------------ DemoteTo F
1978
1979// SEW is for the source so we can use _DEMOTE.
1980#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1981 SHIFT, MLEN, NAME, OP) \
1982 template <size_t N> \
1983 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1984 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1985 return OP##SEWH##LMULH(v, Lanes(d)); \
1986 }
1987
1988#if HWY_HAVE_FLOAT16
1989HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1990 _DEMOTE_VIRT)
1991#endif
1992HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1993 _DEMOTE_VIRT)
1994#undef HWY_RVV_DEMOTE_F
1995
1996// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
1997template <size_t N>
1998HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
1999 return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
2000}
2001template <size_t N>
2002HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
2003 return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
2004}
2005template <size_t N>
2006HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
2007 return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
2008}
2009template <size_t N>
2010HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
2011 return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
2012}
2013template <size_t N>
2014HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
2015 return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
2016}
2017
2018template <size_t N, int kPow2>
2019HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
2020 Simd<bfloat16_t, N, kPow2> d, VFromD<Rebind<float, decltype(d)>> v) {
2021 const RebindToUnsigned<decltype(d)> du16;
2022 const Rebind<uint32_t, decltype(d)> du32;
2023 return detail::DemoteToShr16(du16, BitCast(du32, v));
2024}
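// Scalar sketch of this demotion (F32ToBF16Bits is a hypothetical helper,
// not part of Highway): keep the upper 16 bits of the f32 representation,
// i.e. truncate the mantissa (round toward zero in precision).
//   uint16_t F32ToBF16Bits(float f) {
//     uint32_t u;
//     std::memcpy(&u, &f, sizeof(u));  // requires <cstring>
//     return static_cast<uint16_t>(u >> 16);
//   }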
2025
2026// ------------------------------ ConvertTo F
2027
2028#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2029 SHIFT, MLEN, NAME, OP) \
2030 template <size_t N> \
2031 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2032 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
2033 return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
2034 } \
2035 template <size_t N> \
2036 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2037 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\
2038 return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
2039 } \
2040 /* Truncates (rounds toward zero). */ \
2041 template <size_t N> \
2042 HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
2043 HWY_RVV_V(BASE, SEW, LMUL) v) { \
2044 return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
2045 } \
2046// API only requires f32 but we provide f64 for internal use.
2047HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
2048#undef HWY_RVV_CONVERT
2049
2050// Uses default rounding mode. Must be separate because there is no D arg.
2051#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2052 SHIFT, MLEN, NAME, OP) \
2053 HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2054 return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
2055 }
2056HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
2057#undef HWY_RVV_NEAREST
2058
2059// ================================================== COMBINE
2060
2061namespace detail {
2062
2063// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
2064// offsets are implicitly relative to the start of their 128-bit block.
2065template <typename T, size_t N, int kPow2>
2066HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {
2067 size_t lpb = 16 / sizeof(T);
2068 if (IsFull(d)) return lpb;
2069 // Also honor the user-specified (constexpr) N limit.
2070 lpb = HWY_MIN(lpb, N);
2071 // No fraction, we're done.
2072 if (kPow2 >= 0) return lpb;
2073 // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that.
2074 return HWY_MIN(lpb, Lanes(d));
2075}
2076
2077template <class D, class V>
2078HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
2079 using T = MakeUnsigned<TFromD<D>>;
2080 return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
2081}
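// Worked example: for 32-bit lanes, LanesPerBlock is 4, so iota0 =
// [0 1 2 3 4 5 6 7] AND ~3 yields [0 0 0 0 4 4 4 4], i.e. the index of the
// first lane of each 128-bit block.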
2082
2083template <size_t kLanes, class D>
2084HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
2085 const RebindToUnsigned<D> du;
2086 const RebindToSigned<D> di;
2087 using TU = TFromD<decltype(du)>;
2088 const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
2089 return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
2090}
2091
2092// vector = f(vector, vector, size_t)
2093#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2094 MLEN, NAME, OP) \
2095 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2096 NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
2097 size_t lanes) { \
2098 return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
2099 HWY_RVV_AVL(SEW, SHIFT)); \
2100 }
2101
2102HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL)
2103HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL)
2104
2105#undef HWY_RVV_SLIDE
2106
2107} // namespace detail
2108
2109// ------------------------------ ConcatUpperLower
2110template <class D, class V>
2111HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
2112 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2113}
2114
2115// ------------------------------ ConcatLowerLower
2116template <class D, class V>
2117HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
2118 return detail::SlideUp(lo, hi, Lanes(d) / 2);
2119}
2120
2121// ------------------------------ ConcatUpperUpper
2122template <class D, class V>
2123HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
2124 // Move upper half into lower
2125 const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
2126 return ConcatUpperLower(d, hi, lo_down);
2127}
2128
2129// ------------------------------ ConcatLowerUpper
2130template <class D, class V>
2131HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
2132 // Move half of both inputs to the other half
2133 const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
2134 const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
2135 return ConcatUpperLower(d, hi_up, lo_down);
2136}
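// Worked example for Lanes(d) == 8, hi = [h0..h7], lo = [l0..l7]:
// SlideUp(hi, hi, 4) = [.. .. .. .. h0 h1 h2 h3], SlideDown(lo, lo, 4) =
// [l4 l5 l6 l7 .. .. .. ..], and ConcatUpperLower keeps the lower half of
// the latter: [l4 l5 l6 l7 h0 h1 h2 h3].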
2137
2138// ------------------------------ Combine
2139template <class D2, class V>
2140HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
2141 return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
2142 Lanes(d2) / 2);
2143}
2144
2145// ------------------------------ ZeroExtendVector
2146
2147template <class D2, class V>
2148HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) {
2149 return Combine(d2, Xor(lo, lo), lo);
2150}
2151
2152// ------------------------------ Lower/UpperHalf
2153
2154namespace detail {
2155
2156// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
2157// that SEW = sizeof(T)*8 and LMUL = 1 << Pow2().
2158template <class D>
2159constexpr bool IsSupportedLMUL(D d) {
2160 return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD<D>);
2161}
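// Worked example: for T = uint64_t (sizeof 8), (1 << (Pow2 + 3)) >= 8
// requires Pow2 >= 0, i.e. LMUL >= 1; vuint64mf2_t indeed does not exist.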
2162
2163} // namespace detail
2164
2165// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
2166template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
2167HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
2168 return detail::Trunc(v);
2169}
2170
2171// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
2172// the hardware may set "vill" if we attempt such an LMUL. However, the V
2173// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
2174// still makes sense to have half of an SEW=64 vector. We instead just return
2175// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
2176template <class DH, class V,
2177 hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
2178HWY_API V LowerHalf(const DH /* tag */, const V v) {
2179 return v;
2180}
2181
2182// Same, but without D arg
2183template <class V>
2184HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
2185 return LowerHalf(Half<DFromV<V>>(), v);
2186}
2187
2188template <class DH>
2189HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
2190 return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2)));
2191}
2192
2193// ================================================== SWIZZLE
2194
2195namespace detail {
2196// Dedicated 1-lane slide instructions are presumably faster than a general slide.
2197#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2198 MLEN, NAME, OP) \
2199 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2200 return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
2201 }
2202
2203HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
2204HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
2205HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
2206HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
2207#undef HWY_RVV_SLIDE1
2208} // namespace detail
2209
2210// ------------------------------ GetLane
2211
2212#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2213 SHIFT, MLEN, NAME, OP) \
2214 HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2215 return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
2216 }
2217
2218HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
2219HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
2220#undef HWY_RVV_GET_LANE
2221
2222// ------------------------------ ExtractLane
2223template <class V>
2224HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
2225 return GetLane(detail::SlideDown(v, v, i));
2226}
2227
2228// ------------------------------ InsertLane
2229
2230template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 1)>
2231HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
2232 const DFromV<V> d;
2233 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2234 using TU = TFromD<decltype(du)>;
2235 const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
2236 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
2237}
2238
2239namespace detail {
2240HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
2241} // namespace detail
2242
2243// For 8-bit lanes, Iota0 might overflow.
2244template <class V, HWY_IF_LANE_SIZE_V(V, 1)>
2245HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
2246 const DFromV<V> d;
2247 const auto zero = Zero(d);
2248 const auto one = Set(d, 1);
2249 const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
2250 const auto is_i = detail::SetOnlyFirst(ge_i);
2251 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
2252}
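// Worked example for i == 2: SlideUp(zero, one, 2) = [0 0 1 1 ..], so ge_i is
// true from lane 2 onward, and SetOnlyFirst reduces that to exactly lane 2.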
2253
2254// ------------------------------ OddEven
2255template <class V>
2256HWY_API V OddEven(const V a, const V b) {
2257 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
2258 const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
2259 return IfThenElse(is_even, b, a);
2260}
2261
2262// ------------------------------ DupEven (OddEven)
2263template <class V>
2264HWY_API V DupEven(const V v) {
2265 const V up = detail::Slide1Up(v);
2266 return OddEven(up, v);
2267}
2268
2269// ------------------------------ DupOdd (OddEven)
2270template <class V>
2271HWY_API V DupOdd(const V v) {
2272 const V down = detail::Slide1Down(v);
2273 return OddEven(v, down);
2274}
2275
2276// ------------------------------ OddEvenBlocks
2277template <class V>
2278HWY_API V OddEvenBlocks(const V a, const V b) {
2279 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
2280 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
2281 const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
2282 const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
2283 return IfThenElse(is_even, b, a);
2284}
2285
2286// ------------------------------ SwapAdjacentBlocks
2287
2288template <class V>
2289HWY_API V SwapAdjacentBlocks(const V v) {
2290 const DFromV<V> d;
2291 const size_t lpb = detail::LanesPerBlock(d);
2292 const V down = detail::SlideDown(v, v, lpb);
2293 const V up = detail::SlideUp(v, v, lpb);
2294 return OddEvenBlocks(up, down);
2295}
2296
2297// ------------------------------ TableLookupLanes
2298
2299template <class D, class VI>
2300HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
2301 static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
2302 const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d.
2303 const auto indices = BitCast(du, vec);
2304#if HWY_IS_DEBUG_BUILD
2305 HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d))));
2306#endif
2307 return indices;
2308}
2309
2310template <class D, typename TI>
2311HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2312 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
2313 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2314}
2315
2316// Lanes <32 bit are not part of the Highway API for this op, but are used
2317// by Broadcast. This limits VLMAX to 2048! We could instead use vrgatherei16.
2318#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2319 MLEN, NAME, OP) \
2320 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2321 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
2322 return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT)); \
2323 }
2324
2325HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
2326#undef HWY_RVV_TABLE
2327
2328// ------------------------------ ConcatOdd (TableLookupLanes)
2329template <class D, class V>
2330HWY_API V ConcatOdd(D d, const V hi, const V lo) {
2331 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2332 const auto iota = detail::Iota0(du);
2333 const auto idx = detail::AddS(Add(iota, iota), 1);
2334 const auto lo_odd = TableLookupLanes(lo, idx);
2335 const auto hi_odd = TableLookupLanes(hi, idx);
2336 return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2);
2337}
2338
2339// ------------------------------ ConcatEven (TableLookupLanes)
2340template <class D, class V>
2341HWY_API V ConcatEven(D d, const V hi, const V lo) {
2342 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2343 const auto iota = detail::Iota0(du);
2344 const auto idx = Add(iota, iota);
2345 const auto lo_even = TableLookupLanes(lo, idx);
2346 const auto hi_even = TableLookupLanes(hi, idx);
2347 return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2);
2348}
2349
2350// ------------------------------ Reverse (TableLookupLanes)
2351template <class D>
2352HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
2353 const RebindToUnsigned<D> du;
2354 using TU = TFromD<decltype(du)>;
2355 const size_t N = Lanes(du);
2356 const auto idx =
2357 detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
2358 return TableLookupLanes(v, idx);
2359}
2360
2361// ------------------------------ Reverse2 (RotateRight, OddEven)
2362
2363// Shifting and adding requires fewer instructions than blending, but casting to
2364// u32 only works for LMUL in [1/2, 8].
2365template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -1, 3)>
2366HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2367 const Repartition<uint32_t, D> du32;
2368 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2369}
2370// For LMUL < 1/2, we can extend and then truncate.
2371template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -3, -2)>
2372HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2373 const Twice<decltype(d)> d2;
2374 const Twice<decltype(d2)> d4;
2375 const Repartition<uint32_t, decltype(d4)> du32;
2376 const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2377 const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx)));
2378 return detail::Trunc(detail::Trunc(rx));
2379}
2380
2381// Shifting and adding requires fewer instructions than blending, but casting to
2382// u64 does not work for LMUL < 1.
2383template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, 0, 3)>
2384HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2385 const Repartition<uint64_t, decltype(d)> du64;
2386 return BitCast(d, RotateRight<32>(BitCast(du64, v)));
2387}
2388
2389// For fractions, we can extend and then truncate.
2390template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, -2, -1)>
2391HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2392 const Twice<decltype(d)> d2;
2393 const Twice<decltype(d2)> d4;
2394 const Repartition<uint64_t, decltype(d4)> du64;
2395 const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2396 const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx)));
2397 return detail::Trunc(detail::Trunc(rx));
2398}
2399
2400template <class D, class V = VFromD<D>, HWY_IF_LANE_SIZE_D(D, 8)>
2401HWY_API V Reverse2(D /* tag */, const V v) {
2402 const V up = detail::Slide1Up(v);
2403 const V down = detail::Slide1Down(v);
2404 return OddEven(up, down);
2405}
2406
2407// ------------------------------ Reverse4 (TableLookupLanes)
2408
2409template <class D>
2410HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2411 const RebindToUnsigned<D> du;
2412 const auto idx = detail::XorS(detail::Iota0(du), 3);
2413 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2414}
2415
2416// ------------------------------ Reverse8 (TableLookupLanes)
2417
2418template <class D>
2419HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2420 const RebindToUnsigned<D> du;
2421 const auto idx = detail::XorS(detail::Iota0(du), 7);
2422 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2423}
2424
2425// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2426template <class D, class V = VFromD<D>>
2427HWY_API V ReverseBlocks(D d, V v) {
2428 const Repartition<uint64_t, D> du64;
2429 const size_t N = Lanes(du64);
2430 const auto rev =
2431 detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
2432 // Swap lo/hi u64 within each block
2433 const auto idx = detail::XorS(rev, 1);
2434 return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
2435}
2436
2437// ------------------------------ Compress
2438
2439// RVV supports all lane types natively.
2440#ifdef HWY_NATIVE_COMPRESS8
2441#undef HWY_NATIVE_COMPRESS8
2442#else
2443#define HWY_NATIVE_COMPRESS8
2444#endif
2445
2446template <typename T>
2447struct CompressIsPartition {
2448 enum { value = 0 };
2449};
2450
2451#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2452 SHIFT, MLEN, NAME, OP) \
2453 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2454 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
2455 return v##OP##_vm_##CHAR##SEW##LMUL(v, v, mask, HWY_RVV_AVL(SEW, SHIFT)); \
2456 }
2457
2458HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2459#undef HWY_RVV_COMPRESS
2460
2461// ------------------------------ CompressNot
2462template <class V, class M>
2463HWY_API V CompressNot(V v, const M mask) {
2464 return Compress(v, Not(mask));
2465}
2466
2467// ------------------------------ CompressBlocksNot
2468template <class V, class M>
2469HWY_API V CompressBlocksNot(V v, const M mask) {
2470 return CompressNot(v, mask);
2471}
2472
2473// ------------------------------ CompressStore
2474template <class V, class M, class D>
2475HWY_API size_t CompressStore(const V v, const M mask, const D d,
2476 TFromD<D>* HWY_RESTRICT unaligned) {
2477 StoreU(Compress(v, mask), d, unaligned);
2478 return CountTrue(d, mask);
2479}
2480
2481// ------------------------------ CompressBlendedStore
2482template <class V, class M, class D>
2483HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
2484 TFromD<D>* HWY_RESTRICT unaligned) {
2485 const size_t count = CountTrue(d, mask);
2486 detail::StoreN(count, Compress(v, mask), d, unaligned);
2487 return count;
2488}
2489
2490// ================================================== BLOCKWISE
2491
2492// ------------------------------ CombineShiftRightBytes
2493template <size_t kBytes, class D, class V = VFromD<D>>
2494HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
2495 const Repartition<uint8_t, decltype(d)> d8;
2496 const auto hi8 = BitCast(d8, hi);
2497 const auto lo8 = BitCast(d8, lo);
2498 const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
2499 const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
2500 const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2501 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2502}
2503
2504// ------------------------------ CombineShiftRightLanes
2505template <size_t kLanes, class D, class V = VFromD<D>>
2506HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
2507 constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
2508 const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
2509 const auto lo_down = detail::SlideDown(lo, lo, kLanes);
2510 const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
2511 return IfThenElse(is_lo, lo_down, hi_up);
2512}
2513
2514// ------------------------------ Shuffle2301 (ShiftLeft)
2515template <class V>
2516HWY_API V Shuffle2301(const V v) {
2517 const DFromV<V> d;
2518 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2519 const Repartition<uint64_t, decltype(d)> du64;
2520 const auto v64 = BitCast(du64, v);
2521 return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
2522}
2523
2524// ------------------------------ Shuffle2103
2525template <class V>
2526HWY_API V Shuffle2103(const V v) {
2527 const DFromV<V> d;
2528 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2529 return CombineShiftRightLanes<3>(d, v, v);
2530}
2531
2532// ------------------------------ Shuffle0321
2533template <class V>
2534HWY_API V Shuffle0321(const V v) {
2535 const DFromV<V> d;
2536 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2537 return CombineShiftRightLanes<1>(d, v, v);
2538}
2539
2540// ------------------------------ Shuffle1032
2541template <class V>
2542HWY_API V Shuffle1032(const V v) {
2543 const DFromV<V> d;
2544 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2545 return CombineShiftRightLanes<2>(d, v, v);
2546}
2547
2548// ------------------------------ Shuffle01
2549template <class V>
2550HWY_API V Shuffle01(const V v) {
2551 const DFromV<V> d;
2552 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2553 return CombineShiftRightLanes<1>(d, v, v);
2554}
2555
2556// ------------------------------ Shuffle0123
2557template <class V>
2558HWY_API V Shuffle0123(const V v) {
2559 return Shuffle2301(Shuffle1032(v));
2560}
2561
2562// ------------------------------ TableLookupBytes
2563
2564// Extends or truncates a vector to match the given d.
2565namespace detail {
2566
2567template <typename T, size_t N, int kPow2>
2568HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 3>> v)
2569 -> VFromD<decltype(d)> {
2570 const Simd<T, N, kPow2 - 1> dh;
2571 const Simd<T, N, kPow2 - 2> dhh;
2572 return Ext(d, Ext(dh, Ext(dhh, v)));
2573}
2574template <typename T, size_t N, int kPow2>
2575HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 2>> v)
2576 -> VFromD<decltype(d)> {
2577 const Simd<T, N, kPow2 - 1> dh;
2578 return Ext(d, Ext(dh, v));
2579}
2580template <typename T, size_t N, int kPow2>
2581HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 1>> v)
2582 -> VFromD<decltype(d)> {
2583 return Ext(d, v);
2584}
2585
2586template <typename T, size_t N, int kPow2>
2587HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2>> v)
2588 -> VFromD<decltype(d)> {
2589 return v;
2590}
2591
2592template <typename T, size_t N, int kPow2>
2593HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 1>> v)
2594 -> VFromD<decltype(d)> {
2595 return Trunc(v);
2596}
2597template <typename T, size_t N, int kPow2>
2598HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 2>> v)
2599 -> VFromD<decltype(d)> {
2600 return Trunc(Trunc(v));
2601}
2602template <typename T, size_t N, int kPow2>
2603HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 3>> v)
2604 -> VFromD<decltype(d)> {
2605 return Trunc(Trunc(Trunc(v)));
2606}
2607
2608} // namespace detail
2609
2610template <class VT, class VI>
2611HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
2612 const DFromV<VT> dt; // T=table, I=index.
2613 const DFromV<VI> di;
2614 const Repartition<uint8_t, decltype(dt)> dt8;
2615 const Repartition<uint8_t, decltype(di)> di8;
2616 // Required for producing half-vectors with table lookups from a full vector.
2617 // If we instead run at the LMUL of the index vector, lookups into the table
2618 // would be truncated. Thus we run at the larger of the two LMULs and truncate
2619 // the result vector to the original index LMUL.
2620 constexpr int kPow2T = Pow2(dt8);
2621 constexpr int kPow2I = Pow2(di8);
2622 const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max
2623 const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
2624 const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
2625 auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
2626 // If the table is shorter, wrap around offsets so they do not reference
2627 // undefined lanes in the newly extended vmt.
2628 if (kPow2T < kPow2I) {
2629 offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
2630 }
2631 const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
2632 return BitCast(di, detail::ChangeLMUL(di8, out));
2633}
2634
2635template <class VT, class VI>
2636HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
2637 const DFromV<VI> di;
2638 const Repartition<int8_t, decltype(di)> di8;
2639 const auto idx8 = BitCast(di8, idx);
2640 const auto lookup = TableLookupBytes(vt, idx8);
2641 return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
2642}
2643
2644// ------------------------------ Broadcast
2645template <int kLane, class V>
2646HWY_API V Broadcast(const V v) {
2647 const DFromV<V> d;
2648 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
2649 auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
2650 if (kLane != 0) {
2651 idx = detail::AddS(idx, kLane);
2652 }
2653 return TableLookupLanes(v, idx);
2654}
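// Worked example: for 32-bit lanes (4 per block) and kLane == 1, idx =
// [1 1 1 1 5 5 5 5 ..], so every 128-bit block is filled with its own lane 1.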
2655
2656// ------------------------------ ShiftLeftLanes
2657
2658template <size_t kLanes, class D, class V = VFromD<D>>
2659HWY_API V ShiftLeftLanes(const D d, const V v) {
2660 const RebindToSigned<decltype(d)> di;
2661 using TI = TFromD<decltype(di)>;
2662 const auto shifted = detail::SlideUp(v, v, kLanes);
2663 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2664 const auto idx_mod =
2665 detail::AndS(BitCast(di, detail::Iota0(di)),
2666 static_cast<TI>(detail::LanesPerBlock(di) - 1));
2667 const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
2668 return IfThenZeroElse(clear, shifted);
2669}
2670
2671template <size_t kLanes, class V>
2672HWY_API V ShiftLeftLanes(const V v) {
2673 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2674}
2675
2676// ------------------------------ ShiftLeftBytes
2677
2678template <int kBytes, class D>
2679HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
2680 const Repartition<uint8_t, decltype(d)> d8;
2681 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2682}
2683
2684template <int kBytes, class V>
2685HWY_API V ShiftLeftBytes(const V v) {
2686 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2687}
2688
2689// ------------------------------ ShiftRightLanes
2690template <size_t kLanes, typename T, size_t N, int kPow2,
2691 class V = VFromD<Simd<T, N, kPow2>>>
2692HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
2693 const RebindToSigned<decltype(d)> di;
2694 using TI = TFromD<decltype(di)>;
2695 // For partial vectors, clear upper lanes so we shift in zeros.
2696 if (N <= 16 / sizeof(T)) {
2697 v = IfThenElseZero(FirstN(d, N), v);
2698 }
2699
2700 const auto shifted = detail::SlideDown(v, v, kLanes);
2701 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2702 const size_t lpb = detail::LanesPerBlock(di);
2703 const auto idx_mod =
2704 detail::AndS(BitCast(di, detail::Iota0(di)), static_cast<TI>(lpb - 1));
2705 const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
2706 return IfThenElseZero(keep, shifted);
2707}
2708
2709// ------------------------------ ShiftRightBytes
2710template <int kBytes, class D, class V = VFromD<D>>
2711HWY_API V ShiftRightBytes(const D d, const V v) {
2712 const Repartition<uint8_t, decltype(d)> d8;
2713 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2714}
2715
2716// ------------------------------ InterleaveLower
2717
2718template <class D, class V>
2719HWY_API V InterleaveLower(D d, const V a, const V b) {
2720 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2721 const RebindToUnsigned<decltype(d)> du;
2722 using TU = TFromD<decltype(du)>;
2723 const auto i = detail::Iota0(du);
2724 const auto idx_mod = ShiftRight<1>(
2725 detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
2726 const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2727 const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2728 return IfThenElse(is_even, TableLookupLanes(a, idx),
2729 TableLookupLanes(b, idx));
2730}
2731
2732template <class V>
2733HWY_API V InterleaveLower(const V a, const V b) {
2734 return InterleaveLower(DFromV<V>(), a, b);
2735}
2736
2737// ------------------------------ InterleaveUpper
2738
2739template <class D, class V>
2740HWY_API V InterleaveUpper(const D d, const V a, const V b) {
2741 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2742 const RebindToUnsigned<decltype(d)> du;
2743 using TU = TFromD<decltype(du)>;
2744 const size_t lpb = detail::LanesPerBlock(du);
2745 const auto i = detail::Iota0(du);
2746 const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
2747 const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2748 const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
2749 const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2750 return IfThenElse(is_even, TableLookupLanes(a, idx),
2751 TableLookupLanes(b, idx));
2752}
2753
2754// ------------------------------ ZipLower
2755
2756template <class V, class DW = RepartitionToWide<DFromV<V>>>
2757HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2758 const RepartitionToNarrow<DW> dn;
2759 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2760 return BitCast(dw, InterleaveLower(dn, a, b));
2761}
2762
2763template <class V, class DW = RepartitionToWide<DFromV<V>>>
2764HWY_API VFromD<DW> ZipLower(V a, V b) {
2765 return BitCast(DW(), InterleaveLower(a, b));
2766}
2767
2768// ------------------------------ ZipUpper
2769template <class DW, class V>
2770HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2771 const RepartitionToNarrow<DW> dn;
2772 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2773 return BitCast(dw, InterleaveUpper(dn, a, b));
2774}
2775
2776// ================================================== REDUCE
2777
2778// vector = f(vector, zero_m1)
2779#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2780 MLEN, NAME, OP) \
2781 template <class D> \
2782 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2783 NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
2784 return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
2785 v0, v, v0, Lanes(d)))); \
2786 }
2787
2788// ------------------------------ SumOfLanes
2789
2790namespace detail {
2791HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
2792HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
2793} // namespace detail
2794
2795template <class D>
2796HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
2797 const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
2798 return detail::RedSum(d, v, v0);
2799}
2800
2801// ------------------------------ MinOfLanes
2802namespace detail {
2803HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
2804HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
2805HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
2806} // namespace detail
2807
2808template <class D>
2809HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
2810 using T = TFromD<D>;
2811 const ScalableTag<T> d1; // always m1
2812 const auto neutral = Set(d1, HighestValue<T>());
2813 return detail::RedMin(d, v, neutral);
2814}
2815
2816// ------------------------------ MaxOfLanes
2817namespace detail {
2818HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
2819HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
2820HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
2821} // namespace detail
2822
2823template <class D>
2824HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
2825 using T = TFromD<D>;
2826 const ScalableTag<T> d1; // always m1
2827 const auto neutral = Set(d1, LowestValue<T>());
2828 return detail::RedMax(d, v, neutral);
2829}
2830
2831#undef HWY_RVV_REDUCE
2832
2833// ================================================== Ops with dependencies
2834
2835// ------------------------------ PopulationCount (ShiftRight)
2836
2837// Handles LMUL <= 1 or capped vectors (per the enable_if below), which
2837// generic_ops-inl cannot.
2838template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
2839 hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
2840HWY_API V PopulationCount(V v) {
2841 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2842 v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
2843 v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
2844 return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
2845}
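// Scalar sketch of the same SWAR steps (PopCount8 is a hypothetical helper,
// not part of Highway):
//   uint8_t PopCount8(uint8_t v) {
//     v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));          // 2-bit sums
//     v = static_cast<uint8_t>(((v >> 2) & 0x33) + (v & 0x33)); // 4-bit sums
//     return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);       // byte total
//   }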
2846
2847// ------------------------------ LoadDup128
2848
2849template <class D>
2850HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
2851 const VFromD<D> loaded = Load(d, p);
2852 // idx must be unsigned for TableLookupLanes.
2853 using TU = MakeUnsigned<TFromD<D>>;
2854 const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
2855 // Broadcast the first block.
2856 const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(d), mask);
2857 return TableLookupLanes(loaded, idx);
2858}
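// Worked example: for 32-bit lanes and Lanes(d) == 8, mask == 3 and idx =
// [0 1 2 3 0 1 2 3], so the first 128-bit block repeats across the vector.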
2859
2860// ------------------------------ LoadMaskBits
2861
2862// Support all combinations of T and SHIFT(LMUL) without explicit overloads for
2863// each. First overload for MLEN=1..64.
2864namespace detail {
2865
2866// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
2867// increases with lane size and decreases for increasing LMUL. Cap at 64, the
2868// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
2869// e.g. vuint16mf8_t: (8*2 << 3) == 128.
2870template <class D>
2871using MaskTag = hwy::SizeTag<HWY_MIN(
2872 64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -Pow2(D())))>;
2873
2874#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2875 HWY_INLINE HWY_RVV_M(MLEN) \
2876 NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
2877 return OP##_v_b##MLEN(bits, N); \
2878 }
2879HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vlm)
2880#undef HWY_RVV_LOAD_MASK_BITS
2881} // namespace detail
2882
2883template <class D, class MT = detail::MaskTag<D>>
2884HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
2885 -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
2886 return detail::LoadMaskBits(MT(), bits, Lanes(d));
2887}
2888
2889// ------------------------------ StoreMaskBits
2890#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2891 template <class D> \
2892 HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
2893 const size_t N = Lanes(d); \
2894 OP##_v_b##MLEN(bits, m, N); \
2895 /* Non-full byte, need to clear the undefined upper bits. */ \
2896 /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
2897 constexpr bool kLessThan8 = \
2898 detail::ScaleByPower(16 / sizeof(TFromD<D>), Pow2(d)) < 8; \
2899 if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
2900 const int mask = (1 << N) - 1; \
2901 bits[0] = static_cast<uint8_t>(bits[0] & mask); \
2902 } \
2903 return (N + 7) / 8; \
2904 }
2905HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vsm)
2906#undef HWY_RVV_STORE_MASK_BITS
2907
2908// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
2909
2910template <class V>
2911HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2912 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2913}
2914
2915template <class D>
2916HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2917 D d, TFromD<D>* HWY_RESTRICT unaligned) {
2918 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2919}
2920
2921// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
2922
2923// Disallow for 8-bit because Iota is likely to overflow.
2924template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2925HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2926 const RebindToSigned<D> di;
2927 using TI = TFromD<decltype(di)>;
2928 return RebindMask(
2929 d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast<TI>(n)));
2930}
2931
2932template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2933HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2934 const auto zero = Zero(d);
2935 const auto one = Set(d, 1);
2936 return Eq(detail::SlideUp(one, zero, n), one);
2937}
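// Worked example for n == 3: SlideUp(one, zero, 3) = [1 1 1 0 0 ..], and
// comparing with one yields a mask of exactly the first three lanes.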
2938
2939// ------------------------------ Neg (Sub)
2940
2941template <class V, HWY_IF_SIGNED_V(V)>
2942HWY_API V Neg(const V v) {
2943 return detail::ReverseSubS(v, 0);
2944}
2945
2946// vector = f(vector), but argument is repeated
2947#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2948 SHIFT, MLEN, NAME, OP) \
2949 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2950 return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2951 }
2952
2953HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
2954
2955// ------------------------------ Abs (Max, Neg)
2956
2957template <class V, HWY_IF_SIGNED_V(V)>
2958HWY_API V Abs(const V v) {
2959 return Max(v, Neg(v));
2960}
2961
2962HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
2963
2964#undef HWY_RVV_RETV_ARGV2
2965
2966// ------------------------------ AbsDiff (Abs, Sub)
2967template <class V>
2968HWY_API V AbsDiff(const V a, const V b) {
2969 return Abs(Sub(a, b));
2970}
2971
2972// ------------------------------ Round (NearestInt, ConvertTo, CopySign)
2973
2974// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
2975// a dedicated instruction for that. Rounding to integer and converting back to
2976// float is correct except when the input magnitude is large, in which case
2977// the input was already an integer (all values >= 2^MantissaBits are integers).
2978
2979namespace detail {
2980enum RoundingModes { kNear, kTrunc, kDown, kUp };
2981
2982template <class V>
2983HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
2984 return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
2985}
2986
2987} // namespace detail
2988
2989template <class V>
2990HWY_API V Round(const V v) {
2991 const DFromV<V> df;
2992
2993 const auto integer = NearestInt(v); // round using current mode
2994 const auto int_f = ConvertTo(df, integer);
2995
2996 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2997}
2998
2999// ------------------------------ Trunc (ConvertTo)
3000template <class V>
3001HWY_API V Trunc(const V v) {
3002 const DFromV<V> df;
3003 const RebindToSigned<decltype(df)> di;
3004
3005 const auto integer = ConvertTo(di, v); // round toward 0
3006 const auto int_f = ConvertTo(df, integer);
3007
3008 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
3009}
3010
3011// ------------------------------ Ceil
3012template <class V>
3013HWY_API V Ceil(const V v) {
3014 asm volatile("fsrm %0" ::"r"(detail::kUp));
3015 const auto ret = Round(v);
3016 asm volatile("fsrm %0" ::"r"(detail::kNear));
3017 return ret;
3018}
3019
3020// ------------------------------ Floor
3021template <class V>
3022HWY_API V Floor(const V v) {
3023 asm volatile("fsrm %0" ::"r"(detail::kDown));
3024 const auto ret = Round(v);
3025 asm volatile("fsrm %0" ::"r"(detail::kNear));
3026 return ret;
3027}
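// fsrm writes the dynamic rounding-mode CSR, so the Round above executes with
// round-down/round-up; the trailing fsrm restores kNear. This assumes
// round-to-nearest-even was the prevailing mode, as Round and NearestInt do.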
3028
3029// ------------------------------ Floating-point classification (Ne)
3030
3031// vfclass does not help because it would require 3 instructions (to AND and
3032// then compare the bits), whereas these are just 1-3 integer instructions.
3033
3034template <class V>
3035HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
3036 return Ne(v, v);
3037}
3038
3039template <class V, class D = DFromV<V>>
3040HWY_API MFromD<D> IsInf(const V v) {
3041 const D d;
3042 const RebindToSigned<decltype(d)> di;
3043 using T = TFromD<D>;
3044 const VFromD<decltype(di)> vi = BitCast(di, v);
3045 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
3046 return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
3047}
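// Worked example (f32): +inf/-inf are 0x7F800000/0xFF800000; Add(vi, vi)
// shifts out the sign so both become 0xFF000000 == MaxExponentTimes2. NaNs
// have nonzero mantissa bits and therefore produce a different value.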
3048
3049// Returns whether normal/subnormal/zero.
3050template <class V, class D = DFromV<V>>
3051HWY_API MFromD<D> IsFinite(const V v) {
3052 const D d;
3053 const RebindToUnsigned<decltype(d)> du;
3054 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
3055 using T = TFromD<D>;
3056 const VFromD<decltype(du)> vu = BitCast(du, v);
3057 // 'Shift left' to clear the sign bit, then right so we can compare with the
3058 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
3059 // negative and non-negative floats would be greater).
3060 const VFromD<decltype(di)> exp =
3061 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
3062 return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
3063}
3064
3065// ------------------------------ Iota (ConvertTo)
3066
3067template <class D, HWY_IF_UNSIGNED_D(D)>
3068HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
3069 return detail::AddS(detail::Iota0(d), first);
3070}
3071
3072template <class D, HWY_IF_SIGNED_D(D)>
3073HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
3074 const RebindToUnsigned<D> du;
3075 return detail::AddS(BitCast(d, detail::Iota0(du)), first);
3076}
3077
3078template <class D, HWY_IF_FLOAT_D(D)>
3079HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
3080 const RebindToUnsigned<D> du;
3081 const RebindToSigned<D> di;
3082 return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
3083}
3084
3085// ------------------------------ MulEven/Odd (Mul, OddEven)
3086
3087template <class V, HWY_IF_LANE_SIZE_V(V, 4), class D = DFromV<V>,
3088 class DW = RepartitionToWide<D>>
3089HWY_API VFromD<DW> MulEven(const V a, const V b) {
3090 const auto lo = Mul(a, b);
3091 const auto hi = detail::MulHigh(a, b);
3092 return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
3093}
3094
3095// There is no 64x64 vwmul.
3096template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
3097HWY_INLINE V MulEven(const V a, const V b) {
3098 const auto lo = Mul(a, b);
3099 const auto hi = detail::MulHigh(a, b);
3100 return OddEven(detail::Slide1Up(hi), lo);
3101}
3102
3103template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
3104HWY_INLINE V MulOdd(const V a, const V b) {
3105 const auto lo = Mul(a, b);
3106 const auto hi = detail::MulHigh(a, b);
3107 return OddEven(hi, detail::Slide1Down(lo));
3108}
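// Worked example: for u64 lanes a = [a0 a1] and b = [b0 b1], MulEven returns
// [lo64(a0*b0) hi64(a0*b0)] via OddEven(Slide1Up(hi), lo); MulOdd analogously
// pairs lo64(a1*b1) with hi64(a1*b1).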
3109
3110// ------------------------------ ReorderDemote2To (OddEven, Combine)
3111
3112template <size_t N, int kPow2>
3113HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
3114 Simd<bfloat16_t, N, kPow2> dbf16,
3115 VFromD<RepartitionToWide<decltype(dbf16)>> a,
3116 VFromD<RepartitionToWide<decltype(dbf16)>> b) {
3117 const RebindToUnsigned<decltype(dbf16)> du16;
3118 const RebindToUnsigned<DFromV<decltype(a)>> du32;
3119 const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
3120 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3121}
3122
3123// If LMUL is not the max, Combine first to avoid another DemoteTo.
3124template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
3125 class D32 = RepartitionToWide<Simd<int16_t, N, kPow2>>>
3126HWY_API VFromD<Simd<int16_t, N, kPow2>> ReorderDemote2To(
3127 Simd<int16_t, N, kPow2> d16, VFromD<D32> a, VFromD<D32> b) {
3128 const Twice<D32> d32t;
3129 const VFromD<decltype(d32t)> ab = Combine(d32t, a, b);
3130 return DemoteTo(d16, ab);
3131}
3132
3133// Max LMUL: must DemoteTo first, then Combine.
3134template <size_t N, class V32 = VFromD<RepartitionToWide<Simd<int16_t, N, 3>>>>
3135HWY_API VFromD<Simd<int16_t, N, 3>> ReorderDemote2To(Simd<int16_t, N, 3> d16,
3136 V32 a, V32 b) {
3137 const Half<decltype(d16)> d16h;
3138 const VFromD<decltype(d16h)> a16 = DemoteTo(d16h, a);
3139 const VFromD<decltype(d16h)> b16 = DemoteTo(d16h, b);
3140 return Combine(d16, a16, b16);
3141}
3142
3143// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3144
3145namespace detail {
3146
3147// Non-overloaded wrapper function so we can define DF32 in template args.
3148template <
3149 size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
3150 class VF32 = VFromD<DF32>,
3151 class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>>
3152HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
3153 VFromD<DU16> a, VFromD<DU16> b,
3154 const VF32 sum0, VF32& sum1) {
3155 const RebindToUnsigned<DF32> du32;
3156 using VU32 = VFromD<decltype(du32)>;
3157 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
3158 // Using shift/and instead of Zip leads to the odd/even order that
3159 // RearrangeToOddPlusEven prefers.
3160 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
3161 const VU32 ao = And(BitCast(du32, a), odd);
3162 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
3163 const VU32 bo = And(BitCast(du32, b), odd);
3164 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
3165 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
3166}
3167
3168#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3169 SHIFT, MLEN, NAME, OP) \
3170 template <size_t N> \
3171 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
3172 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
3173 HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
3174 return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
3175 }
3176
3177HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT)
3178#undef HWY_RVV_WIDEN_MACC
3179
3180// If LMUL is not the max, we can WidenMul first (3 instructions).
3181template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
3182 class D32 = Simd<int32_t, N, kPow2>, class V32 = VFromD<D32>,
3183 class D16 = RepartitionToNarrow<D32>>
3184HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, kPow2> d32,
3185 VFromD<D16> a, VFromD<D16> b,
3186 const V32 sum0, V32& sum1) {
3187 const Twice<decltype(d32)> d32t;
3188 using V32T = VFromD<decltype(d32t)>;
3189 V32T sum = Combine(d32t, sum1, sum0);
3190 sum = detail::WidenMulAcc(d32t, sum, a, b);
3191 sum1 = UpperHalf(d32, sum);
3192 return LowerHalf(d32, sum);
3193}
3194
3195// Max LMUL: must LowerHalf first (4 instructions).
3196template <size_t N, class D32 = Simd<int32_t, N, 3>, class V32 = VFromD<D32>,
3197 class D16 = RepartitionToNarrow<D32>>
3198 HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, 3> d32,
3199                                                  VFromD<D16> a, VFromD<D16> b,
3200 const V32 sum0, V32& sum1) {
3201 const Half<D16> d16h;
3202 using V16H = VFromD<decltype(d16h)>;
3203 const V16H a0 = LowerHalf(d16h, a);
3204 const V16H a1 = UpperHalf(d16h, a);
3205 const V16H b0 = LowerHalf(d16h, b);
3206 const V16H b1 = UpperHalf(d16h, b);
3207 sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
3208 return detail::WidenMulAcc(d32, sum0, a0, b0);
3209}
3210
3211} // namespace detail
3212
3213template <size_t N, int kPow2, class VN, class VW>
3214 HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
3215 const VW sum0, VW& sum1) {
3216 return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
3217}
3218
3219template <size_t N, int kPow2, class VN, class VW>
3220 HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
3221 const VW sum0, VW& sum1) {
3222 return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
3223}
3224
3225// ------------------------------ RearrangeToOddPlusEven
3226
3227template <class VW, HWY_IF_SIGNED_V(VW)> // vint32_t*
3228HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
3229 // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
3230 // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
3231 // preferable to do the extra work here rather than do manual odd/even
3232 // extraction there.
3233 const DFromV<VW> di32;
3234 const RebindToUnsigned<decltype(di32)> du32;
3235 const Twice<decltype(di32)> di32x2;
3236 const RepartitionToWide<decltype(di32x2)> di64x2;
3237 const RebindToUnsigned<decltype(di64x2)> du64x2;
3238 const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0));
3239 // Isolate odd/even int32 in int64 lanes.
3240 const auto even = ShiftRight<32>(ShiftLeft<32>(combined)); // sign extend
3241 const auto odd = ShiftRight<32>(combined);
3242 return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd))));
3243}
3244
3245// For max LMUL, we cannot Combine again and instead manually unroll.
3246HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) {
3247 const DFromV<vint32m8_t> d;
3248 const Half<decltype(d)> dh;
3249 const vint32m4_t lo =
3250 RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
3251 const vint32m4_t hi =
3252 RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
3253 return Combine(d, hi, lo);
3254}
3255
3256template <class VW, HWY_IF_FLOAT_V(VW)> // vfloat*
3257HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
3258 return Add(sum0, sum1); // invariant already holds
3259}
3260
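// Illustrative sketch (hypothetical helper, not part of the API): the
// intended usage pattern of the two ops above. Accumulate widened bf16
// products into two f32 accumulators, then merge them; SumOfLanes would
// complete a dot product.
template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
          class VBF = VFromD<Repartition<bfloat16_t, DF32>>>
HWY_API VFromD<DF32> WidenMulDemo(DF32 df32, VBF a, VBF b) {
  VFromD<DF32> sum1 = Zero(df32);
  const VFromD<DF32> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return RearrangeToOddPlusEven(sum0, sum1);
}
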
3261// ------------------------------ Lt128
3262template <class D>
3263 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
3264 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3265 "D must be u64");
3266 // Truth table of Eq and Compare for Hi and Lo u64.
3267 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
3268 // =H =L cH cL | out = cH | (=H & cL)
3269 // 0 0 0 0 | 0
3270 // 0 0 0 1 | 0
3271 // 0 0 1 0 | 1
3272 // 0 0 1 1 | 1
3273 // 0 1 0 0 | 0
3274 // 0 1 0 1 | 0
3275 // 0 1 1 0 | 1
3276 // 1 0 0 0 | 0
3277 // 1 0 0 1 | 1
3278 // 1 1 0 0 | 0
3279 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
3280 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
3281 // Shift leftward so L can influence H.
3282 const VFromD<D> ltLx = detail::Slide1Up(ltHL);
3283 const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
3284 // Replicate H to its neighbor.
3285 return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
3286}
3287
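// Illustrative sketch (hypothetical helper, not part of the API): selecting
// the smaller of two 128-bit keys via Lt128. Equivalent to Min128 below,
// which avoids materializing the mask.
template <class D>
HWY_INLINE VFromD<D> Min128ViaLt128Demo(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}
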
3288// ------------------------------ Lt128Upper
3289template <class D>
3290 HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
3291 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3292 "D must be u64");
3293 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
3294 // Replicate H to its neighbor.
3295 return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
3296}
3297
3298// ------------------------------ Eq128
3299template <class D>
3300 HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
3301 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3302 "D must be u64");
3303 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
3304 const VFromD<D> eqLH = Reverse2(d, eqHL);
3305 return MaskFromVec(And(eqHL, eqLH));
3306}
3307
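// Illustrative sketch (hypothetical helper, not part of the API): true if
// any 128-bit key in v matches `key` (both u64 halves equal).
template <class D>
HWY_INLINE bool AnyEq128Demo(D d, VFromD<D> v, VFromD<D> key) {
  return !AllFalse(d, Eq128(d, v, key));
}
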
3308// ------------------------------ Eq128Upper
3309template <class D>
3310 HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
3311 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3312 "D must be u64");
3313 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
3314 // Replicate H to its neighbor.
3315 return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
3316}
3317
3318// ------------------------------ Ne128
3319template <class D>
3320 HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
3321 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3322 "D must be u64");
3323 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
3324 const VFromD<D> neLH = Reverse2(d, neHL);
3325 return MaskFromVec(Or(neHL, neLH));
3326}
3327
3328// ------------------------------ Ne128Upper
3329template <class D>
3330 HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
3331 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3332 "D must be u64");
3333 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
3334 // Replicate H to its neighbor.
3335 return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
3336}
3337
3338// ------------------------------ Min128, Max128 (Lt128)
3339
3340template <class D>
3341HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
3342 const VFromD<D> aXH = detail::Slide1Down(a);
3343 const VFromD<D> bXH = detail::Slide1Down(b);
3344 const VFromD<D> minHL = Min(a, b);
3345 const MFromD<D> ltXH = Lt(aXH, bXH);
3346 const MFromD<D> eqXH = Eq(aXH, bXH);
3347 // If the upper lane is the decider, take lo from the same reg.
3348 const VFromD<D> lo = IfThenElse(ltXH, a, b);
3349 // The upper lane is just minHL; if they are equal, we also need to use the
3350 // actual min of the lower lanes.
3351 return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
3352}
3353
3354template <class D>
3355HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
3356 const VFromD<D> aXH = detail::Slide1Down(a);
3357 const VFromD<D> bXH = detail::Slide1Down(b);
3358 const VFromD<D> maxHL = Max(a, b);
3359 const MFromD<D> ltXH = Lt(aXH, bXH);
3360 const MFromD<D> eqXH = Eq(aXH, bXH);
3361 // If the upper lane is the decider, take lo from the same reg.
3362 const VFromD<D> lo = IfThenElse(ltXH, b, a);
3363 // The upper lane is just maxHL; if they are equal, we also need to use the
3364 // actual max of the lower lanes.
3365 return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
3366}
3367
3368template <class D>
3369HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
3370 return IfThenElse(Lt128Upper(d, a, b), a, b);
3371}
3372
3373template <class D>
3374HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
3375 return IfThenElse(Lt128Upper(d, b, a), a, b);
3376}
3377
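// Illustrative sketch (hypothetical helper, not part of the API): running
// 128-bit minimum over an array of keys, two u64 lanes per key. Assumes len
// is a nonzero multiple of Lanes(d).
template <class D>
HWY_INLINE VFromD<D> MinKeysDemo(D d, const TFromD<D>* HWY_RESTRICT keys,
                                 size_t len) {
  VFromD<D> min = LoadU(d, keys);
  for (size_t i = Lanes(d); i < len; i += Lanes(d)) {
    min = Min128(d, min, LoadU(d, keys + i));
  }
  return min;
}
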
3378// ================================================== END MACROS
3379namespace detail { // for code folding
3380#undef HWY_RVV_AVL
3381#undef HWY_RVV_D
3382#undef HWY_RVV_FOREACH
3383#undef HWY_RVV_FOREACH_08_ALL
3384#undef HWY_RVV_FOREACH_08_ALL_VIRT
3385#undef HWY_RVV_FOREACH_08_DEMOTE
3386#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
3387#undef HWY_RVV_FOREACH_08_EXT
3388#undef HWY_RVV_FOREACH_08_EXT_VIRT
3389#undef HWY_RVV_FOREACH_08_TRUNC
3390#undef HWY_RVV_FOREACH_08_VIRT
3391#undef HWY_RVV_FOREACH_16_ALL
3392#undef HWY_RVV_FOREACH_16_ALL_VIRT
3393#undef HWY_RVV_FOREACH_16_DEMOTE
3394#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
3395#undef HWY_RVV_FOREACH_16_EXT
3396#undef HWY_RVV_FOREACH_16_EXT_VIRT
3397#undef HWY_RVV_FOREACH_16_TRUNC
3398#undef HWY_RVV_FOREACH_16_VIRT
3399#undef HWY_RVV_FOREACH_32_ALL
3400#undef HWY_RVV_FOREACH_32_ALL_VIRT
3401#undef HWY_RVV_FOREACH_32_DEMOTE
3402#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
3403#undef HWY_RVV_FOREACH_32_EXT
3404#undef HWY_RVV_FOREACH_32_EXT_VIRT
3405#undef HWY_RVV_FOREACH_32_TRUNC
3406#undef HWY_RVV_FOREACH_32_VIRT
3407#undef HWY_RVV_FOREACH_64_ALL
3408#undef HWY_RVV_FOREACH_64_ALL_VIRT
3409#undef HWY_RVV_FOREACH_64_DEMOTE
3410#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
3411#undef HWY_RVV_FOREACH_64_EXT
3412#undef HWY_RVV_FOREACH_64_EXT_VIRT
3413#undef HWY_RVV_FOREACH_64_TRUNC
3414#undef HWY_RVV_FOREACH_64_VIRT
3415#undef HWY_RVV_FOREACH_B
3416#undef HWY_RVV_FOREACH_F
3417#undef HWY_RVV_FOREACH_F16
3418#undef HWY_RVV_FOREACH_F32
3419#undef HWY_RVV_FOREACH_F3264
3420#undef HWY_RVV_FOREACH_F64
3421#undef HWY_RVV_FOREACH_I
3422#undef HWY_RVV_FOREACH_I08
3423#undef HWY_RVV_FOREACH_I16
3424#undef HWY_RVV_FOREACH_I163264
3425#undef HWY_RVV_FOREACH_I32
3426#undef HWY_RVV_FOREACH_I64
3427#undef HWY_RVV_FOREACH_U
3428#undef HWY_RVV_FOREACH_U08
3429#undef HWY_RVV_FOREACH_U16
3430#undef HWY_RVV_FOREACH_U163264
3431#undef HWY_RVV_FOREACH_U32
3432#undef HWY_RVV_FOREACH_U64
3433#undef HWY_RVV_FOREACH_UI
3434#undef HWY_RVV_FOREACH_UI08
3435#undef HWY_RVV_FOREACH_UI16
3436#undef HWY_RVV_FOREACH_UI163264
3437#undef HWY_RVV_FOREACH_UI32
3438#undef HWY_RVV_FOREACH_UI3264
3439#undef HWY_RVV_FOREACH_UI64
3440#undef HWY_RVV_M
3441#undef HWY_RVV_RETM_ARGM
3442#undef HWY_RVV_RETV_ARGV
3443#undef HWY_RVV_RETV_ARGVS
3444#undef HWY_RVV_RETV_ARGVV
3445#undef HWY_RVV_T
3446#undef HWY_RVV_V
3447} // namespace detail
3448// NOLINTNEXTLINE(google-readability-namespace-comments)
3449} // namespace HWY_NAMESPACE
3450} // namespace hwy