Grok 10.0.5
contrib/sort/shared-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Definitions shared between vqsort-inl and sorting_networks-inl.
17
18// Normal include guard for target-independent parts
19#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
20#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
21
22#include "hwy/base.h"
23
24namespace hwy {
25
26// Internal constants - these are to avoid magic numbers/literals and cannot be
27// changed without also changing the associated code.
29// SortingNetwork reshapes its input into a matrix. This is the maximum number
30// of *keys* per vector.
31#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
32 static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
33#else
34 static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
35#endif
36
37 // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
38 // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
39 // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
40 // extra logN factor for larger networks (for which only loose upper bounds
41 // on size are known).
42 static constexpr size_t kMaxRowsLog2 = 4;
43 static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
44
45 static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
46 return kMaxRows * HWY_MIN(N, kMaxCols);
47 }
48
49 // Unrolling is important (pipelining and amortizing branch mispredictions);
50 // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
51 // somewhat slower for sorting than 4x.
52 //
53 // To change, must also update left + 3 * N etc. in the loop.
54 static constexpr size_t kPartitionUnroll = 4;
55
56 static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
57 // The main loop reads kPartitionUnroll vectors, and first loads from
58 // both left and right beforehand, so it requires min = 2 *
59 // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
60 // >= BaseCaseNum), we partition the right side into a buffer. We need
61 // another vector at the end so CompressStore does not overwrite anything.
62 return (2 * kPartitionUnroll + 1) * N;
63 }
64
65 // Chunk := group of keys loaded for sampling a pivot. Matches the typical
66 // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
67 // ensures vectors are no larger than that, so this can be independent of the
68 // vector size and thus constexpr.
69 static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
70 return 64 / sizeof_t;
71 }
72
73 static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
74 // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
75 return (3 + 1) * LanesPerChunk(sizeof_t) + 2 * N;
76 }
77
78 template <typename T>
79 static constexpr HWY_INLINE size_t BufNum(size_t N) {
80 // One extra for padding plus another for full-vector loads.
81 return HWY_MAX(BaseCaseNum(N) + 2 * N,
82 HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
83 }
84
85 template <typename T>
86 static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
87 return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
88 }
89};
90
91} // namespace hwy
92
93#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
94
95// Per-target
96#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
97 defined(HWY_TARGET_TOGGLE)
98#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
99#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
100#else
101#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
102#endif
103
104#include "hwy/highway.h"
105
106// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
107// Arm v7 debug.
108#undef VQSORT_ENABLED
109#if (HWY_TARGET == HWY_SCALAR) || \
110 (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
111 (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
112#define VQSORT_ENABLED 0
113#else
114#define VQSORT_ENABLED 1
115#endif
116
117namespace hwy {
118namespace HWY_NAMESPACE {
119
120// Default tag / vector width selector.
121#if HWY_TARGET == HWY_RVV
122// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
123template <typename T>
124using SortTag = ScalableTag<T, -1>;
125#else
126template <typename T>
127using SortTag = ScalableTag<T>;
128#endif
129
130// NOLINTNEXTLINE(google-readability-namespace-comments)
131} // namespace HWY_NAMESPACE
132} // namespace hwy
133
134#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#define HWY_MAX(a, b)
Definition: base.h:135
#define HWY_MIN(a, b)
Definition: base.h:134
#define HWY_INLINE
Definition: base.h:70
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:173
N
Definition: rvv-inl.h:1998
ScalableTag< T, -1 > SortTag
Definition: contrib/sort/shared-inl.h:124
Definition: aligned_allocator.h:27
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: contrib/sort/shared-inl.h:28
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size)
Definition: contrib/sort/shared-inl.h:86
static constexpr size_t kMaxCols
Definition: contrib/sort/shared-inl.h:34
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N)
Definition: contrib/sort/shared-inl.h:56
static constexpr HWY_INLINE size_t BufNum(size_t N)
Definition: contrib/sort/shared-inl.h:79
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t)
Definition: contrib/sort/shared-inl.h:69
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N)
Definition: contrib/sort/shared-inl.h:73
static constexpr size_t kMaxRows
Definition: contrib/sort/shared-inl.h:43
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N)
Definition: contrib/sort/shared-inl.h:45
static constexpr size_t kMaxRowsLog2
Definition: contrib/sort/shared-inl.h:42
static constexpr size_t kPartitionUnroll
Definition: contrib/sort/shared-inl.h:54