aligned_allocator.h (15965B)
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_

// Memory allocator with support for alignment and offsets.

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <memory>
#include <type_traits>
#include <utility>
#include <vector>

#include "hwy/base.h"
#include "hwy/per_target.h"

#include <mozilla/Attributes.h>

namespace hwy {

// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. To prevent false sharing, this should be at least the
// L1 cache line size, usually 64 bytes. However, Intel's L2 prefetchers may
// access pairs of lines, and M1 L2 and POWER8 lines are also 128 bytes.
#define HWY_ALIGNMENT 128

// Returns whether `ptr`'s address is an exact multiple of `align` bytes.
// `align` is in bytes and defaults to HWY_ALIGNMENT; it need not be a power
// of two because the check uses modulo rather than masking.
template <typename T>
HWY_API constexpr bool IsAligned(T* ptr, size_t align = HWY_ALIGNMENT) {
  return reinterpret_cast<uintptr_t>(ptr) % align == 0;
}

// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them.
53 using AllocPtr = void* (*)(void* opaque, size_t bytes); 54 using FreePtr = void (*)(void* opaque, void* memory); 55 56 // Returns null or a pointer to at least `payload_size` (which can be zero) 57 // bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and 58 // the vector size. Calls `alloc` with the passed `opaque` pointer to obtain 59 // memory or malloc() if it is null. 60 HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size, 61 AllocPtr alloc_ptr = nullptr, 62 void* opaque_ptr = nullptr); 63 64 // Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it 65 // must have been returned from a previous call to `AllocateAlignedBytes`. 66 // Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if 67 // `free_ptr` function is null, uses the default free(). 68 HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, 69 FreePtr free_ptr, void* opaque_ptr); 70 71 // Class that deletes the aligned pointer passed to operator() calling the 72 // destructor before freeing the pointer. This is equivalent to the 73 // std::default_delete but for aligned objects. For a similar deleter equivalent 74 // to free() for aligned memory see AlignedFreer(). 75 class AlignedDeleter { 76 public: 77 AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {} 78 AlignedDeleter(FreePtr free_ptr, void* opaque_ptr) 79 : free_(free_ptr), opaque_ptr_(opaque_ptr) {} 80 81 template <typename T> 82 void operator()(T* aligned_pointer) const { 83 return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_, 84 TypedArrayDeleter<T>); 85 } 86 87 private: 88 template <typename T> 89 static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) { 90 size_t elems = size_in_bytes / sizeof(T); 91 for (size_t i = 0; i < elems; i++) { 92 // Explicitly call the destructor on each element. 93 (static_cast<T*>(ptr) + i)->~T(); 94 } 95 } 96 97 // Function prototype that calls the destructor for each element in a typed 98 // array. 
TypeArrayDeleter<T> would match this prototype. 99 using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); 100 101 HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, 102 FreePtr free_ptr, 103 void* opaque_ptr, 104 ArrayDeleter deleter); 105 106 FreePtr free_; 107 void* opaque_ptr_; 108 }; 109 110 // Unique pointer to T with custom aligned deleter. This can be a single 111 // element U or an array of element if T is a U[]. The custom aligned deleter 112 // will call the destructor on U or each element of a U[] in the array case. 113 template <typename T> 114 using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>; 115 116 // Aligned memory equivalent of make_unique<T> using the custom allocators 117 // alloc/free with the passed `opaque` pointer. This function calls the 118 // constructor with the passed Args... and calls the destructor of the object 119 // when the AlignedUniquePtr is destroyed. 120 template <typename T, typename... Args> 121 AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free, 122 void* opaque, Args&&... args) { 123 T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque)); 124 return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...), 125 AlignedDeleter(free, opaque)); 126 } 127 128 // Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free 129 // functions. 130 template <typename T, typename... Args> 131 AlignedUniquePtr<T> MakeUniqueAligned(Args&&... 
args) { 132 T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T))); 133 return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...), 134 AlignedDeleter()); 135 } 136 137 template <class T> 138 struct AlignedAllocator { 139 using value_type = T; 140 141 AlignedAllocator() = default; 142 143 template <class V> 144 explicit AlignedAllocator(const AlignedAllocator<V>&) noexcept {} 145 146 template <class V> 147 value_type* allocate(V n) { 148 static_assert(std::is_integral<V>::value, 149 "AlignedAllocator only supports integer types"); 150 static_assert(sizeof(V) <= sizeof(std::size_t), 151 "V n must be smaller or equal size_t to avoid overflow"); 152 return static_cast<value_type*>( 153 AllocateAlignedBytes(static_cast<std::size_t>(n) * sizeof(value_type))); 154 } 155 156 template <class V> 157 void deallocate(value_type* p, HWY_MAYBE_UNUSED V n) { 158 return FreeAlignedBytes(p, nullptr, nullptr); 159 } 160 }; 161 162 template <class T, class V> 163 constexpr bool operator==(const AlignedAllocator<T>&, 164 const AlignedAllocator<V>&) noexcept { 165 return true; 166 } 167 168 template <class T, class V> 169 constexpr bool operator!=(const AlignedAllocator<T>&, 170 const AlignedAllocator<V>&) noexcept { 171 return false; 172 } 173 174 template <class T> 175 using AlignedVector = std::vector<T, AlignedAllocator<T>>; 176 177 // Helpers for array allocators (avoids overflow) 178 namespace detail { 179 180 // Returns x such that 1u << x == n (if n is a power of two). 181 static inline constexpr size_t ShiftCount(size_t n) { 182 return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); 183 } 184 185 template <typename T> 186 T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { 187 constexpr size_t kSize = sizeof(T); 188 189 constexpr bool kIsPow2 = (kSize & (kSize - 1)) == 0; 190 constexpr size_t kBits = ShiftCount(kSize); 191 static_assert(!kIsPow2 || (1ull << kBits) == kSize, "ShiftCount has a bug"); 192 193 const size_t bytes = kIsPow2 ? 
items << kBits : items * kSize; 194 const size_t check = kIsPow2 ? bytes >> kBits : bytes / kSize; 195 if (check != items) { 196 return nullptr; // overflowed 197 } 198 return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); 199 } 200 201 } // namespace detail 202 203 // Aligned memory equivalent of make_unique<T[]> for array types using the 204 // custom allocators alloc/free. This function calls the constructor with the 205 // passed Args... on every created item. The destructor of each element will be 206 // called when the AlignedUniquePtr is destroyed. 207 template <typename T, typename... Args> 208 AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc( 209 size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { 210 T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque); 211 if (ptr != nullptr) { 212 for (size_t i = 0; i < items; i++) { 213 new (ptr + i) T(args...); 214 } 215 } 216 return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque)); 217 } 218 219 template <typename T, typename... Args> 220 AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) { 221 return MakeUniqueAlignedArrayWithAlloc<T, Args...>( 222 items, nullptr, nullptr, nullptr, std::forward<Args>(args)...); 223 } 224 225 // Custom deleter for std::unique_ptr equivalent to using free() as a deleter 226 // but for aligned memory. 227 class AlignedFreer { 228 public: 229 // Pass address of this to ctor to skip deleting externally-owned memory. 
230 static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {} 231 232 AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {} 233 AlignedFreer(FreePtr free_ptr, void* opaque_ptr) 234 : free_(free_ptr), opaque_ptr_(opaque_ptr) {} 235 236 template <typename T> 237 void operator()(T* aligned_pointer) const { 238 FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); 239 } 240 241 private: 242 FreePtr free_; 243 void* opaque_ptr_; 244 }; 245 246 // Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD 247 // data use AlignedUniquePtr. 248 template <typename T> 249 using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>; 250 251 // Allocate an aligned and uninitialized array of POD values as a unique_ptr. 252 // Upon destruction of the unique_ptr the aligned array will be freed. 253 template <typename T> 254 AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc, 255 FreePtr free, void* opaque) { 256 static_assert(std::is_trivially_copyable<T>::value, 257 "AllocateAligned: requires trivially copyable T"); 258 static_assert(std::is_trivially_destructible<T>::value, 259 "AllocateAligned: requires trivially destructible T"); 260 return AlignedFreeUniquePtr<T[]>( 261 detail::AllocateAlignedItems<T>(items, alloc, opaque), 262 AlignedFreer(free, opaque)); 263 } 264 265 // Same as previous AllocateAligned(), using default allocate/free functions. 266 template <typename T> 267 AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) { 268 return AllocateAligned<T>(items, nullptr, nullptr, nullptr); 269 } 270 271 // A simple span containing data and size of data. 
272 template <typename T> 273 class Span { 274 public: 275 Span() = default; 276 Span(T* data, size_t size) : size_(size), data_(data) {} 277 template <typename U> 278 MOZ_IMPLICIT Span(U u) : Span(u.data(), u.size()) {} 279 MOZ_IMPLICIT Span(std::initializer_list<const T> v) : Span(v.begin(), v.size()) {} 280 281 // Copies the contents of the initializer list to the span. 282 Span<T>& operator=(std::initializer_list<const T> v) { 283 HWY_DASSERT(size_ == v.size()); 284 CopyBytes(v.begin(), data_, sizeof(T) * std::min(size_, v.size())); 285 return *this; 286 } 287 288 // Returns the size of the contained data. 289 size_t size() const { return size_; } 290 291 // Returns a pointer to the contained data. 292 T* data() { return data_; } 293 T* data() const { return data_; } 294 295 // Returns the element at index. 296 T& operator[](size_t index) const { return data_[index]; } 297 298 // Returns an iterator pointing to the first element of this span. 299 T* begin() { return data_; } 300 301 // Returns a const iterator pointing to the first element of this span. 302 constexpr const T* cbegin() const { return data_; } 303 304 // Returns an iterator pointing just beyond the last element at the 305 // end of this span. 306 T* end() { return data_ + size_; } 307 308 // Returns a const iterator pointing just beyond the last element at the 309 // end of this span. 310 constexpr const T* cend() const { return data_ + size_; } 311 312 private: 313 size_t size_ = 0; 314 T* data_ = nullptr; 315 }; 316 317 // A multi dimensional array containing an aligned buffer. 318 // 319 // To maintain alignment, the innermost dimension will be padded to ensure all 320 // innermost arrays are aligned. 
// Multi-dimensional array of trivial T backed by a single aligned buffer.
// The innermost axis is padded (see constructor) so that every innermost
// 1-D slice starts at an aligned address.
template <typename T, size_t axes>
class AlignedNDArray {
  static_assert(std::is_trivial<T>::value,
                "AlignedNDArray can only contain trivial types");

 public:
  // Movable but not copyable (unique buffer ownership).
  AlignedNDArray(AlignedNDArray&& other) = default;
  AlignedNDArray& operator=(AlignedNDArray&& other) = default;

  // Constructs an array of the provided shape and fills it with zeros.
  explicit AlignedNDArray(std::array<size_t, axes> shape) : shape_(shape) {
    sizes_ = ComputeSizes(shape_);
    memory_shape_ = shape_;
    // Round the innermost dimension up to the number of bytes available for
    // SIMD operations on this architecture to make sure that each innermost
    // array is aligned from the first element.
    // NOTE(review): VectorBytes() is in bytes while memory_shape_ counts
    // elements of T; for sizeof(T) > 1 this over-pads rather than under-pads,
    // so alignment still holds.
    memory_shape_[axes - 1] = RoundUpTo(memory_shape_[axes - 1], VectorBytes());
    memory_sizes_ = ComputeSizes(memory_shape_);
    // NOTE(review): no null check before ZeroBytes; assumes allocation
    // succeeded (consistent with other callers in this header) — confirm.
    buffer_ = hwy::AllocateAligned<T>(memory_size());
    hwy::ZeroBytes(buffer_.get(), memory_size() * sizeof(T));
  }

  // Returns a span containing the innermost array at the provided indices.
  // `indices` addresses all axes except the last; the span's length is the
  // logical (unpadded) size of the innermost axis.
  Span<T> operator[](std::array<const size_t, axes - 1> indices) {
    return Span<T>(buffer_.get() + Offset(indices), sizes_[indices.size()]);
  }

  // Returns a const span containing the innermost array at the provided
  // indices.
  Span<const T> operator[](std::array<const size_t, axes - 1> indices) const {
    return Span<const T>(buffer_.get() + Offset(indices),
                         sizes_[indices.size()]);
  }

  // Returns the shape of the array, which might be smaller than the allocated
  // buffer after padding the last axis to alignment.
  const std::array<size_t, axes>& shape() const { return shape_; }

  // Returns the shape of the allocated buffer, which might be larger than the
  // used size of the array after padding to alignment.
  const std::array<size_t, axes>& memory_shape() const { return memory_shape_; }

  // Returns the size of the array, which might be smaller than the allocated
  // buffer after padding the last axis to alignment.
  size_t size() const { return sizes_[0]; }

  // Returns the size of the allocated buffer, which might be larger than the
  // used size of the array after padding to alignment.
  size_t memory_size() const { return memory_sizes_[0]; }

  // Returns a pointer to the allocated buffer.
  T* data() { return buffer_.get(); }

  // Returns a const pointer to the buffer.
  const T* data() const { return buffer_.get(); }

  // Truncates the array by updating its shape.
  //
  // The new shape must be equal to or less than the old shape in all axes.
  //
  // Doesn't modify underlying memory.
  void truncate(const std::array<size_t, axes>& new_shape) {
#if HWY_IS_DEBUG_BUILD
    for (size_t axis_index = 0; axis_index < axes; ++axis_index) {
      HWY_ASSERT(new_shape[axis_index] <= shape_[axis_index]);
    }
#endif
    shape_ = new_shape;
    sizes_ = ComputeSizes(shape_);
  }

 private:
  std::array<size_t, axes> shape_;         // logical per-axis extents
  std::array<size_t, axes> memory_shape_;  // shape_ with padded last axis
  // sizes_[i] = number of elements in a slice fixing the first i axes;
  // sizes_[0] is the total logical size, sizes_[axes] is 1.
  std::array<size_t, axes + 1> sizes_;
  std::array<size_t, axes + 1> memory_sizes_;  // same, for the padded shape
  hwy::AlignedFreeUniquePtr<T[]> buffer_;

  // Computes offset in the buffer based on the provided indices.
  // Strides come from memory_sizes_ (padded), so offsets always land on
  // aligned innermost rows.
  size_t Offset(std::array<const size_t, axes - 1> indices) const {
    size_t offset = 0;
    size_t shape_index = 0;
    for (const size_t axis_index : indices) {
      offset += memory_sizes_[shape_index + 1] * axis_index;
      shape_index++;
    }
    return offset;
  }

  // Computes the sizes of all sub arrays based on the sizes of each axis.
  //
  // Does this by multiplying the size of each axis with the previous one in
  // reverse order, starting with the conceptual axis of size 1 containing the
  // actual elements in the array.
  static std::array<size_t, axes + 1> ComputeSizes(
      std::array<size_t, axes> shape) {
    std::array<size_t, axes + 1> sizes;
    size_t axis = shape.size();
    sizes[axis] = 1;
    while (axis > 0) {
      --axis;
      sizes[axis] = sizes[axis + 1] * shape[axis];
    }
    return sizes;
  }
};

}  // namespace hwy
#endif  // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_