bit_pack_test.cc (9078B)
1 // Copyright 2022 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <stdio.h> 17 18 #include <vector> 19 20 #include "hwy/aligned_allocator.h" 21 #include "hwy/base.h" 22 #include "hwy/nanobenchmark.h" 23 24 // clang-format off 25 #undef HWY_TARGET_INCLUDE 26 #define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT 27 #include "hwy/foreach_target.h" // IWYU pragma: keep 28 #include "hwy/highway.h" 29 #include "hwy/timer.h" 30 #include "hwy/contrib/bit_pack/bit_pack-inl.h" 31 #include "hwy/tests/test_util-inl.h" 32 // clang-format on 33 34 #ifndef HWY_BIT_PACK_BENCHMARK 35 #define HWY_BIT_PACK_BENCHMARK 0 36 #endif 37 38 HWY_BEFORE_NAMESPACE(); 39 namespace hwy { 40 // Used to prevent running benchmark (slow) for partial vectors and targets 41 // except the best available. Global, not per-target, hence must be outside 42 // HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code 43 // has been re-included. 44 extern size_t last_bits; 45 extern uint64_t best_target; 46 #if HWY_ONCE 47 size_t last_bits = 0; 48 uint64_t best_target = ~0ull; 49 #endif 50 namespace HWY_NAMESPACE { 51 namespace { 52 53 template <size_t kBits, typename T> 54 T Random(RandomState& rng) { 55 return ConvertScalarTo<T>(Random32(&rng) & kBits); 56 } 57 58 template <typename T> 59 class Checker { 60 public: 61 explicit Checker(size_t num) { raw_.reserve(num); } 62 void NotifyRaw(T raw) { raw_.push_back(raw); } 63 64 void NotifyRawOutput(size_t bits, T raw) { 65 if (raw_[num_verified_] != raw) { 66 HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits, 67 num_verified_, raw_.size(), 68 ConvertScalarTo<double>(raw_[num_verified_]), 69 ConvertScalarTo<double>(raw)); 70 } 71 ++num_verified_; 72 } 73 74 private: 75 std::vector<T> raw_; 76 size_t num_verified_ = 0; 77 }; 78 79 template <template <size_t> class PackT, size_t kVectors, size_t kBits> 80 struct TestPack { 81 template <typename T, class D> 82 void operator()(T /* t */, D d) { 83 constexpr size_t kLoops = 16; // working set slightly larger than L1 84 const size_t N = Lanes(d); 85 RandomState rng(N * 129); 86 static_assert(kBits <= kVectors, ""); 87 const size_t num_per_loop = N * kVectors; 88 const size_t num = num_per_loop * kLoops; 89 const size_t num_packed_per_loop = N * kBits; 90 const size_t num_packed = num_packed_per_loop * kLoops; 91 Checker<T> checker(num); 92 AlignedFreeUniquePtr<T[]> raw = hwy::AllocateAligned<T>(num); 93 AlignedFreeUniquePtr<T[]> raw2 = hwy::AllocateAligned<T>(num); 94 AlignedFreeUniquePtr<T[]> packed = hwy::AllocateAligned<T>(num_packed); 95 HWY_ASSERT(raw && raw2 && packed); 96 97 for (size_t i = 0; i < num; ++i) { 98 raw[i] = Random<kBits, T>(rng); 99 checker.NotifyRaw(raw[i]); 100 } 101 102 best_target = HWY_MIN(best_target, HWY_TARGET); 103 const bool run_bench = HWY_BIT_PACK_BENCHMARK && (kBits != last_bits) && 104 (HWY_TARGET == best_target); 105 last_bits = kBits; 106 107 const PackT<kBits> func; 108 109 if (run_bench) { 110 const size_t kNumInputs = 1; 111 const size_t num_items = num * size_t(Unpredictable1()); 112 const FuncInput inputs[kNumInputs] = {num_items}; 113 Result results[kNumInputs]; 114 115 Params p; 116 p.verbose = false; 117 p.max_evals = 7; 118 p.target_rel_mad = 0.002; 119 const size_t num_results = MeasureClosure( 120 [&](FuncInput) HWY_ATTR { 121 for (size_t i = 0, pi = 0; i < num; 122 i += num_per_loop, pi += num_packed_per_loop) { 123 func.Pack(d, raw.get() + i, packed.get() + pi); 124 } 125 T& val = packed.get()[Random32(&rng) % num_packed]; 126 T zero = static_cast<T>(Unpredictable1() - 1); 127 val = static_cast<T>(val + zero); 128 for (size_t i = 0, pi = 0; i < num; 129 i += num_per_loop, pi += num_packed_per_loop) { 130 func.Unpack(d, packed.get() + pi, raw2.get() + i); 131 } 132 return raw2[Random32(&rng) % num]; 133 }, 134 inputs, kNumInputs, results, p); 135 if (num_results != kNumInputs) { 136 HWY_WARN("MeasureClosure failed.\n"); 137 return; 138 } 139 // Print throughput for pack+unpack round trip 140 for (size_t i = 0; i < num_results; ++i) { 141 const size_t bytes_per_element = (kBits + 7) / 8; 142 const double bytes = 143 static_cast<double>(results[i].input * bytes_per_element); 144 const double seconds = 145 results[i].ticks / platform::InvariantTicksPerSecond(); 146 printf("Bits:%2d elements:%3d GB/s:%4.1f (+/-%3.1f%%)\n", 147 static_cast<int>(kBits), static_cast<int>(results[i].input), 148 1E-9 * bytes / seconds, results[i].variability * 100.0); 149 } 150 } else { 151 for (size_t i = 0, pi = 0; i < num; 152 i += num_per_loop, pi += num_packed_per_loop) { 153 func.Pack(d, raw.get() + i, packed.get() + pi); 154 } 155 T& val = packed.get()[Random32(&rng) % num_packed]; 156 T zero = static_cast<T>(Unpredictable1() - 1); 157 val = static_cast<T>(val + zero); 158 for (size_t i = 0, pi = 0; i < num; 159 i += num_per_loop, pi += num_packed_per_loop) { 160 func.Unpack(d, packed.get() + pi, raw2.get() + i); 161 } 162 } 163 164 for (size_t i = 0; i < num; ++i) { 165 checker.NotifyRawOutput(kBits, raw2[i]); 166 } 167 } 168 }; 169 170 void TestAllPack8() { 171 ForShrinkableVectors<TestPack<Pack8, 8, 1>>()(uint8_t()); 172 ForShrinkableVectors<TestPack<Pack8, 8, 2>>()(uint8_t()); 173 ForShrinkableVectors<TestPack<Pack8, 8, 3>>()(uint8_t()); 174 ForShrinkableVectors<TestPack<Pack8, 8, 4>>()(uint8_t()); 175 ForShrinkableVectors<TestPack<Pack8, 8, 5>>()(uint8_t()); 176 ForShrinkableVectors<TestPack<Pack8, 8, 6>>()(uint8_t()); 177 ForShrinkableVectors<TestPack<Pack8, 8, 7>>()(uint8_t()); 178 ForShrinkableVectors<TestPack<Pack8, 8, 8>>()(uint8_t()); 179 } 180 181 void TestAllPack16() { 182 ForShrinkableVectors<TestPack<Pack16, 16, 1>>()(uint16_t()); 183 ForShrinkableVectors<TestPack<Pack16, 16, 2>>()(uint16_t()); 184 ForShrinkableVectors<TestPack<Pack16, 16, 3>>()(uint16_t()); 185 ForShrinkableVectors<TestPack<Pack16, 16, 4>>()(uint16_t()); 186 ForShrinkableVectors<TestPack<Pack16, 16, 5>>()(uint16_t()); 187 ForShrinkableVectors<TestPack<Pack16, 16, 6>>()(uint16_t()); 188 ForShrinkableVectors<TestPack<Pack16, 16, 7>>()(uint16_t()); 189 ForShrinkableVectors<TestPack<Pack16, 16, 8>>()(uint16_t()); 190 ForShrinkableVectors<TestPack<Pack16, 16, 9>>()(uint16_t()); 191 ForShrinkableVectors<TestPack<Pack16, 16, 10>>()(uint16_t()); 192 ForShrinkableVectors<TestPack<Pack16, 16, 11>>()(uint16_t()); 193 ForShrinkableVectors<TestPack<Pack16, 16, 12>>()(uint16_t()); 194 ForShrinkableVectors<TestPack<Pack16, 16, 13>>()(uint16_t()); 195 ForShrinkableVectors<TestPack<Pack16, 16, 14>>()(uint16_t()); 196 ForShrinkableVectors<TestPack<Pack16, 16, 15>>()(uint16_t()); 197 ForShrinkableVectors<TestPack<Pack16, 16, 16>>()(uint16_t()); 198 } 199 200 void TestAllPack32() { 201 ForShrinkableVectors<TestPack<Pack32, 32, 1>>()(uint32_t()); 202 ForShrinkableVectors<TestPack<Pack32, 32, 2>>()(uint32_t()); 203 ForShrinkableVectors<TestPack<Pack32, 32, 6>>()(uint32_t()); 204 ForShrinkableVectors<TestPack<Pack32, 32, 11>>()(uint32_t()); 205 ForShrinkableVectors<TestPack<Pack32, 32, 16>>()(uint32_t()); 206 ForShrinkableVectors<TestPack<Pack32, 32, 31>>()(uint32_t()); 207 ForShrinkableVectors<TestPack<Pack32, 32, 32>>()(uint32_t()); 208 } 209 210 void TestAllPack64() { 211 // Fails, but only on GCC 13. 212 #if !(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400 && \ 213 HWY_TARGET == HWY_RVV) 214 ForShrinkableVectors<TestPack<Pack64, 64, 1>>()(uint64_t()); 215 ForShrinkableVectors<TestPack<Pack64, 64, 5>>()(uint64_t()); 216 ForShrinkableVectors<TestPack<Pack64, 64, 12>>()(uint64_t()); 217 ForShrinkableVectors<TestPack<Pack64, 64, 16>>()(uint64_t()); 218 ForShrinkableVectors<TestPack<Pack64, 64, 27>>()(uint64_t()); 219 ForShrinkableVectors<TestPack<Pack64, 64, 31>>()(uint64_t()); 220 ForShrinkableVectors<TestPack<Pack64, 64, 33>>()(uint64_t()); 221 ForShrinkableVectors<TestPack<Pack64, 64, 41>>()(uint64_t()); 222 ForShrinkableVectors<TestPack<Pack64, 64, 61>>()(uint64_t()); 223 #endif 224 } 225 226 } // namespace 227 // NOLINTNEXTLINE(google-readability-namespace-comments) 228 } // namespace HWY_NAMESPACE 229 } // namespace hwy 230 HWY_AFTER_NAMESPACE(); 231 232 #if HWY_ONCE 233 namespace hwy { 234 namespace { 235 HWY_BEFORE_TEST(BitPackTest); 236 HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8); 237 HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16); 238 HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack32); 239 HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack64); 240 HWY_AFTER_TEST(); 241 } // namespace 242 } // namespace hwy 243 HWY_TEST_MAIN(); 244 #endif // HWY_ONCE