Grok 10.0.5
detect_targets.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
17#define HIGHWAY_HWY_DETECT_TARGETS_H_
18
19// Defines targets and chooses which to enable.
20
22
23//------------------------------------------------------------------------------
24// Optional configuration
25
26// See g3doc/quick_reference.md for documentation of these macros.
27
28// Uncomment to override the default baseline determined from predefined macros:
29// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
30
31// Uncomment to override the default blocklist:
32// #define HWY_BROKEN_TARGETS HWY_AVX3
33
34// Uncomment to definitely avoid generating those target(s):
35// #define HWY_DISABLED_TARGETS HWY_SSE4
36
37// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
38// AVX2 target for VMs which support AVX2 but not the other instruction sets)
39// #define HWY_DISABLE_BMI2_FMA
40
41// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
42// #define HWY_WANT_SSSE3
43// #define HWY_WANT_SSE4
44
45//------------------------------------------------------------------------------
46// Targets
47
48// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
49// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
50//
51// All values are unconditionally defined so we can test HWY_TARGETS without
52// first checking the HWY_ARCH_*.
53//
54// The C99 preprocessor evaluates #if expressions using intmax_t types. This
55// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
56// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
57// avoid overflow when computing HWY_TARGETS (subtracting one instead of
58// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
59
60// --------------------------- x86: 15 targets (+ one fallback), bits 0..14
61// Bits 0..6 reserved (7 targets)
62// HWY_AVX3_DL is currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ,
63// VPOPCNTDQ, VBMI, VBMI2, VAES, BITALG). Later to be added: BF16 (Cooper
64// Lake). VP2INTERSECT is only in Tiger Lake? We do not yet have uses for GFNI.
65#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
66#define HWY_AVX3 (1LL << 8) // baseline detection requires AVX-512 F/BW/DQ/VL
67#define HWY_AVX2 (1LL << 9) // baseline detection also checks BMI2/FMA/F16C
68// Bit 10: reserved for AVX
69#define HWY_SSE4 (1LL << 11) // baseline detection also checks PCLMUL/AES
70#define HWY_SSSE3 (1LL << 12)
71// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
72// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used
73// for dynamic dispatch. All x86 target bits must be less than or equal to
74// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
75// HWY_MAX_DYNAMIC_TARGETS in total.
76#define HWY_HIGHEST_TARGET_BIT_X86 14
77
78// --------------------------- Arm: 15 targets (+ one fallback), bits 15..29
79// Bits 15..23 reserved (9 targets)
80#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2); SVE2 with vectors known to be 128-bit
81#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1); SVE with vectors known to be 256-bit
82#define HWY_SVE2 (1LL << 26) // SVE2, vector length not known at compile time
83#define HWY_SVE (1LL << 27) // SVE, vector length not known at compile time
84#define HWY_NEON (1LL << 28) // On A64, includes/requires AES
85// Bit 29 reserved (Helium?)
86#define HWY_HIGHEST_TARGET_BIT_ARM 29 // highest bit an Arm target may occupy
87
88// --------------------------- RISC-V: 9 targets (+ one fallback), bits 30..38
89// Bits 30..36 reserved (7 targets)
90#define HWY_RVV (1LL << 37) // chosen as baseline when __riscv_vector is defined
91// Bit 38 reserved
92#define HWY_HIGHEST_TARGET_BIT_RVV 38 // highest bit a RISC-V target may occupy
93
94// --------------------------- Future expansion: 4 targets
95// Bits 39..42 reserved
96
97
98// --------------------------- IBM Power: 9 targets (+ one fallback), bits 43..51
99// Bits 43..48 reserved (6 targets)
100#define HWY_PPC8 (1LL << 49) // v2.07 or 3; baseline detection for it is currently compiled out
101// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
102#define HWY_HIGHEST_TARGET_BIT_PPC 51 // highest bit a Power target may occupy
103
104// --------------------------- WebAssembly: 9 targets (+ one fallback), bits 52..60
105// Bits 52..57 reserved (6 targets)
106#define HWY_WASM_EMU256 (1LL << 58) // Experimental; baseline only if HWY_WANT_WASM2 is defined
107#define HWY_WASM (1LL << 59) // baseline when __wasm_simd128__ is defined
108// Bit 60 reserved
109#define HWY_HIGHEST_TARGET_BIT_WASM 60 // highest bit a WebAssembly target may occupy
110
111// --------------------------- Emulation: 2 targets (the fallbacks), bits 61..62
112// Exactly one of these is the fallback: see HWY_BASELINE_SCALAR.
113#define HWY_EMU128 (1LL << 61) // default fallback unless HWY_BROKEN_EMU128
114// We do not add/left-shift, so this will not overflow to a negative number.
115#define HWY_SCALAR (1LL << 62) // fallback for HWY_COMPILE_ONLY_SCALAR or broken EMU128
116#define HWY_HIGHEST_TARGET_BIT_SCALAR 62 // highest bit used by any target
117
118// Do not use bit 63 - would be confusing to have negative numbers.
119
120//------------------------------------------------------------------------------
121// Set default blocklists
122
123// Disabled means excluded from enabled at the user's request. This is a
124// separate config macro so that targets can be disabled without replacing (and thereby deactivating) the HWY_BROKEN_TARGETS blocklist below.
125#ifndef HWY_DISABLED_TARGETS
126#define HWY_DISABLED_TARGETS 0 // default: the user disabled nothing
127#endif
128
129// Broken means excluded from enabled due to known compiler issues. Allow the
130// user to override this blocklist without any guarantee of success. The cases below form one #if/#elif chain, so only the FIRST matching case applies.
131#ifndef HWY_BROKEN_TARGETS
132
133// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
134// SSE4 codegen (possibly only for msan), so disable all those targets.
135#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
136#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
137// This entails a major speed reduction, so warn unless the user explicitly
138// opts in to scalar-only by defining HWY_COMPILE_ONLY_SCALAR.
139#if !defined(HWY_COMPILE_ONLY_SCALAR)
140#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
141#endif
142
143// 32-bit x86 may fail to compile AVX2/3.
144#elif HWY_ARCH_X86_32
145#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
146
147// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
148#elif HWY_COMPILER_MSVC != 0
149#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
150
151// armv7be (big-endian Armv7) has not been tested and is not yet supported.
152#elif HWY_ARCH_ARM_V7 && \
153 (defined(__ARM_BIG_ENDIAN) || \
154 (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
155#define HWY_BROKEN_TARGETS (HWY_NEON)
156
157// SVE[2] require recent clang or gcc versions. Note this case is not gated on the architecture.
158#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
159 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
160#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
161
162#else // no known compiler issue
163#define HWY_BROKEN_TARGETS 0
164#endif
165
166#endif // HWY_BROKEN_TARGETS
167// HWY_ENABLED(targets): returns `targets` with every bit cleared that is set in HWY_DISABLED_TARGETS or HWY_BROKEN_TARGETS.
168// Enabled means neither disabled nor blocklisted.
169#define HWY_ENABLED(targets) \
170 ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
171
172// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
173// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
174// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
175// always be enabled. If 1, we instead choose HWY_SCALAR even without
176// HWY_COMPILE_ONLY_SCALAR being set. Also set to 1 when HWY_NO_LIBCXX is defined.
177#if !defined(HWY_BROKEN_EMU128) // allow overriding
178#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
179 defined(HWY_NO_LIBCXX)
180#define HWY_BROKEN_EMU128 1 // fall back to HWY_SCALAR
181#else
182#define HWY_BROKEN_EMU128 0 // HWY_EMU128 is usable as the fallback
183#endif
184#endif // HWY_BROKEN_EMU128
185
186//------------------------------------------------------------------------------
187// Detect baseline targets using predefined macros
188
189// Baseline means the targets for which the compiler is allowed to generate
190// instructions, implying the target CPU would have to support them. This does
191// not take the blocklist into account (HWY_ENABLED is applied later).
192// HWY_BASELINE_SCALAR is the fallback target: HWY_SCALAR if requested or if EMU128 is broken, otherwise HWY_EMU128.
193#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
194#define HWY_BASELINE_SCALAR HWY_SCALAR
195#else
196#define HWY_BASELINE_SCALAR HWY_EMU128
197#endif
198
199// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
200// HWY_TARGET == HWY_BASELINE_SCALAR.
201// HWY_BASELINE_WASM: nonzero only on WASM with SIMD128 enabled by the compiler.
202#if HWY_ARCH_WASM && defined(__wasm_simd128__)
203#if defined(HWY_WANT_WASM2) // user opts in to the experimental EMU256 target
204#define HWY_BASELINE_WASM HWY_WASM_EMU256
205#else
206#define HWY_BASELINE_WASM HWY_WASM
207#endif // HWY_WANT_WASM2
208#else
209#define HWY_BASELINE_WASM 0
210#endif
211
212// Avoid choosing the PPC target until we have an implementation: the trailing `&& 0` makes this condition always false, so HWY_BASELINE_PPC8 is always 0.
213#if HWY_ARCH_PPC && defined(__VSX__) && 0
214#define HWY_BASELINE_PPC8 HWY_PPC8
215#else
216#define HWY_BASELINE_PPC8 0
217#endif
218
219#define HWY_BASELINE_SVE2 0 // default; re-defined below on Arm if SVE2 is enabled
220#define HWY_BASELINE_SVE 0 // default; re-defined below on Arm if SVE is enabled
221#define HWY_BASELINE_NEON 0 // default; re-defined below on Arm if NEON is enabled
222
223#if HWY_ARCH_ARM
224
225#if defined(__ARM_FEATURE_SVE2)
226#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
227// If the user specified -msve-vector-bits=128, they assert the vector length
228// is 128 bits and we should use HWY_SVE2_128 (more efficient for some ops).
229#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
230#define HWY_BASELINE_SVE2 HWY_SVE2_128
231// Otherwise we're not sure what the vector length will be. The baseline must be
232// unconditionally valid, so we can only assume HWY_SVE2. However, when running
233// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
234// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
235#else
236#define HWY_BASELINE_SVE2 HWY_SVE2
237#endif // __ARM_FEATURE_SVE_BITS
238#endif // __ARM_FEATURE_SVE2
239
240#if defined(__ARM_FEATURE_SVE)
241#undef HWY_BASELINE_SVE // was 0, will be re-defined
242// Same scheme as SVE2: if the user-specified vector length (256 bits) matches our specialized target, use it.
243#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
244#define HWY_BASELINE_SVE HWY_SVE_256
245#else
246#define HWY_BASELINE_SVE HWY_SVE
247#endif // __ARM_FEATURE_SVE_BITS
248#endif // __ARM_FEATURE_SVE
249
250// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both, so accept either.
251#if defined(__ARM_NEON__) || defined(__ARM_NEON)
252#undef HWY_BASELINE_NEON // was 0, will be re-defined
253#define HWY_BASELINE_NEON HWY_NEON
254#endif
255
256#endif // HWY_ARCH_ARM
257
258// HWY_CHECK_*: 1 if the compiler's predefined macros indicate the feature is enabled, else 0. Special handling for MSVC because it has fewer predefined macros:
259#if HWY_COMPILER_MSVC
260
261// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
262// https://stackoverflow.com/questions/18563978/.
263#if defined(__AVX__)
264#define HWY_CHECK_SSSE3 1
265#define HWY_CHECK_SSE4 1
266#else
267#define HWY_CHECK_SSSE3 0
268#define HWY_CHECK_SSE4 0
269#endif
270
271// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
272// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
273#define HWY_CHECK_PCLMUL_AES 1
274#define HWY_CHECK_BMI2_FMA 1
275#define HWY_CHECK_F16C 1
276
277#else // non-MSVC
278
279#if defined(__SSSE3__)
280#define HWY_CHECK_SSSE3 1
281#else
282#define HWY_CHECK_SSSE3 0
283#endif
284// SSE4 here means both SSE4.1 and SSE4.2.
285#if defined(__SSE4_1__) && defined(__SSE4_2__)
286#define HWY_CHECK_SSE4 1
287#else
288#define HWY_CHECK_SSE4 0
289#endif
290
291// If these are disabled via HWY_DISABLE_*, the check reports 1 so that they do not gate the availability of SSE4/AVX2.
292#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
293#define HWY_CHECK_PCLMUL_AES 1
294#else
295#define HWY_CHECK_PCLMUL_AES 0
296#endif
297
298#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
299#define HWY_CHECK_BMI2_FMA 1
300#else
301#define HWY_CHECK_BMI2_FMA 0
302#endif
303
304#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
305#define HWY_CHECK_F16C 1
306#else
307#define HWY_CHECK_F16C 0
308#endif
309
310#endif // non-MSVC
311// HWY_WANT_SSSE3/HWY_WANT_SSE4 are opt-ins that are defined without a value (`#define HWY_WANT_SSSE3`), so test them with defined(): using them by value would expand to nothing and break the #if expression.
312#if HWY_ARCH_X86 && (defined(HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
313#define HWY_BASELINE_SSSE3 HWY_SSSE3
314#else
315#define HWY_BASELINE_SSSE3 0
316#endif
317// SSE4 is in the baseline if the user opts in, or if SSE4.1/4.2 and PCLMUL/AES are enabled (or PCLMUL/AES was explicitly disabled).
318#if HWY_ARCH_X86 && (defined(HWY_WANT_SSE4) || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
319#define HWY_BASELINE_SSE4 HWY_SSE4
320#else
321#define HWY_BASELINE_SSE4 0
322#endif
323// AVX2 is in the baseline only if SSE4 is, the BMI2/FMA and F16C checks pass (or those were disabled), and the compiler defines __AVX2__.
324#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
325 defined(__AVX2__)
326#define HWY_BASELINE_AVX2 HWY_AVX2
327#else
328#define HWY_BASELINE_AVX2 0
329#endif
330
331// Require everything in AVX2 plus the AVX-512 F, BW, DQ and VL flags (also set by MSVC)
332#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
333 defined(__AVX512DQ__) && defined(__AVX512VL__)
334#define HWY_BASELINE_AVX3 HWY_AVX3
335#else
336#define HWY_BASELINE_AVX3 0
337#endif
338
339// Require everything in AVX3 plus the extensions listed for HWY_AVX3_DL. TODO(janwas): not yet known whether these will be set by MSVC. NOTE(review): this tests __AVXVNNI__; the AVX-512 form of VNNI is presumably signalled by __AVX512VNNI__ instead - confirm against compiler docs before changing.
340#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
341 defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
342 defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
343 defined(__AVX512BITALG__)
344#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
345#else
346#define HWY_BASELINE_AVX3_DL 0
347#endif
348// RVV is in the baseline only on RISC-V when the compiler defines __riscv_vector.
349#if HWY_ARCH_RVV && defined(__riscv_vector)
350#define HWY_BASELINE_RVV HWY_RVV
351#else
352#define HWY_BASELINE_RVV 0
353#endif
354
355// Union of all per-target baselines detected above (each is either 0 or its target bit). Allow the user to override this without any guarantee of success.
356#ifndef HWY_BASELINE_TARGETS
357#define HWY_BASELINE_TARGETS \
358 (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
359 HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
360 HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
361 HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
362#endif // HWY_BASELINE_TARGETS
363
364//------------------------------------------------------------------------------
365// Choose target for static dispatch
366// HWY_ENABLED_BASELINE: the baseline targets minus disabled/broken ones. It must be nonzero, otherwise no target could be chosen.
367#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
368#if HWY_ENABLED_BASELINE == 0
369#error "At least one baseline target must be defined and enabled"
370#endif
371
372// Best baseline, used for static dispatch. This is the least-significant 1-bit
373// within HWY_ENABLED_BASELINE (x & -x isolates it); lower bit values imply "better".
374#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
375
376// Start by assuming static dispatch. If we later use dynamic dispatch, this
377// will be defined to other targets during the multiple-inclusion, and finally
378// return to the initial value. Defining this outside begin/end_target ensures
379// inl headers successfully compile by themselves (required by Bazel).
380#define HWY_TARGET HWY_STATIC_TARGET
381
382//------------------------------------------------------------------------------
383// Choose targets for dynamic dispatch according to one of four policies
384// The HWY_COMPILE_ONLY_* policies are mutually exclusive: each defined() yields 0 or 1, so a sum above 1 means several were requested.
385#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
386 defined(HWY_COMPILE_ONLY_STATIC))
387#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
388#endif
389// Defining one of HWY_COMPILE_ONLY_* takes precedence over HWY_COMPILE_ALL_ATTAINABLE (it is tested earlier in the policy chain below).
390
391// HWY_HAVE_RUNTIME_DISPATCH: 1 if targets beyond the baseline may be chosen at runtime. Clang, GCC and MSVC allow runtime dispatch on x86.
392#if HWY_ARCH_X86
393#define HWY_HAVE_RUNTIME_DISPATCH 1
394// On Arm, currently only GCC does, and we require Linux to detect CPU
395// capabilities. Defining TOOLCHAIN_MISS_SYS_AUXV_H opts out (presumably for toolchains lacking <sys/auxv.h> - confirm).
396#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
397#define HWY_HAVE_RUNTIME_DISPATCH 1
398#else
399#define HWY_HAVE_RUNTIME_DISPATCH 0
400#endif
401
402// AVX3_DL is not widely available yet. To reduce code size and compile time,
403// only include it in the set of attainable targets (for dynamic dispatch) if
404// the user opts in, OR it is in the baseline (HWY_ATTAINABLE_TARGETS applies HWY_ENABLED afterwards). This must test HWY_BASELINE_TARGETS: no macro named HWY_BASELINE exists, so testing that would always yield 0 and drop a baseline AVX3_DL from the attainable set.
405#if defined(HWY_WANT_AVX3_DL) || ((HWY_BASELINE_TARGETS) & HWY_AVX3_DL)
406#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
407#else
408#define HWY_ATTAINABLE_AVX3_DL 0
409#endif
410// SVE (incl. the 256-bit specialization) is attainable on A64 if we can dispatch at runtime or it is already in the enabled baseline; broken/disabled variants are removed via HWY_ENABLED.
411#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
412 (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
413#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
414#else
415#define HWY_ATTAINABLE_SVE 0
416#endif
417// Same rule for SVE2 and its 128-bit specialization.
418#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
419 (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
420#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
421#else
422#define HWY_ATTAINABLE_SVE2 0
423#endif
424
425// Attainable means enabled and the compiler allows intrinsics (even when not
426// allowed to autovectorize). Used in policies 3 and 4 below.
427#if HWY_ARCH_X86 // x86: fallback plus all x86 targets (AVX3_DL only if opted in or in the baseline)
428#define HWY_ATTAINABLE_TARGETS \
429 HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
430 HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
431#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH // Arm with runtime dispatch: fallback, NEON and attainable SVE/SVE2
432#define HWY_ATTAINABLE_TARGETS \
433 HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
434 HWY_ATTAINABLE_SVE2)
435#else // otherwise: only the enabled baseline, plus SVE/SVE2 variants if attainable
436#define HWY_ATTAINABLE_TARGETS \
437 (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
438#endif
439
440// HWY_TARGETS is the set of targets to compile; the first matching policy wins. 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
441#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
442#undef HWY_STATIC_TARGET
443#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
444#define HWY_TARGETS HWY_EMU128
445
446// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
447// we currently still support it for backwards compatibility. Also chosen when EMU128 was requested but is broken.
448#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
449 (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
450#undef HWY_STATIC_TARGET
451#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
452#define HWY_TARGETS HWY_SCALAR
453
454// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
455#elif defined(HWY_COMPILE_ONLY_STATIC)
456#define HWY_TARGETS HWY_STATIC_TARGET
457
458// 3) For tests: include all attainable targets (in particular: scalar)
459#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
460#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
461
462// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
463// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
464// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
465// sets all lower bits (better targets), then we also include the static target.
466#else
467#define HWY_TARGETS \
468 (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
469
470#endif // target policy
471
472// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
473// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
474// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0. Fail the build early if that invariant is violated.
475#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
476#error "Logic error: best baseline should be included in dynamic targets"
477#endif
478
479#endif // HIGHWAY_HWY_DETECT_TARGETS_H_