copy-inl.h (5166B)
1 // Copyright 2022 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Per-target include guard 17 #if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ 18 defined(HWY_TARGET_TOGGLE) // NOLINT 19 #ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ 20 #undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ 21 #else 22 #define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ 23 #endif 24 25 #include <stddef.h> 26 #include <stdint.h> 27 28 #include "hwy/highway.h" 29 30 HWY_BEFORE_NAMESPACE(); 31 namespace hwy { 32 namespace HWY_NAMESPACE { 33 34 // These functions avoid having to write a loop plus remainder handling in the 35 // (unfortunately still common) case where arrays are not aligned/padded. If the 36 // inputs are known to be aligned/padded, it is more efficient to write a single 37 // loop using Load(). We do not provide a CopyAlignedPadded because it 38 // would be more verbose than such a loop. 39 40 // Fills `to`[0, `count`) with `value`. 41 template <class D, typename T = TFromD<D>> 42 void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { 43 const size_t N = Lanes(d); 44 const Vec<D> v = Set(d, value); 45 46 size_t idx = 0; 47 if (count >= N) { 48 for (; idx <= count - N; idx += N) { 49 StoreU(v, d, to + idx); 50 } 51 } 52 53 // `count` was a multiple of the vector length `N`: already done. 54 if (HWY_UNLIKELY(idx == count)) return; 55 56 const size_t remaining = count - idx; 57 HWY_DASSERT(0 != remaining && remaining < N); 58 SafeFillN(remaining, value, d, to + idx); 59 } 60 61 // Copies `from`[0, `count`) to `to`, which must not overlap `from`. 62 template <class D, typename T = TFromD<D>> 63 void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { 64 const size_t N = Lanes(d); 65 66 size_t idx = 0; 67 if (count >= N) { 68 for (; idx <= count - N; idx += N) { 69 const Vec<D> v = LoadU(d, from + idx); 70 StoreU(v, d, to + idx); 71 } 72 } 73 74 // `count` was a multiple of the vector length `N`: already done. 75 if (HWY_UNLIKELY(idx == count)) return; 76 77 const size_t remaining = count - idx; 78 HWY_DASSERT(0 != remaining && remaining < N); 79 SafeCopyN(remaining, d, from + idx, to + idx); 80 } 81 82 // For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the 83 // corresponding mask element of `func(d, v)` is true. Returns the STL-style end 84 // of the newly written elements in `to`. 85 // 86 // `func` is either a functor with a templated operator()(d, v) returning a 87 // mask, or a generic lambda if using C++14. Due to apparent limitations of 88 // Clang on Windows, it is currently necessary to add HWY_ATTR before the 89 // opening { of the lambda to avoid errors about "function .. requires target". 90 // 91 // NOTE: this is only supported for 16-, 32- or 64-bit types. 92 // NOTE: Func may be called a second time for elements it has already seen, but 93 // these elements will not be written to `to` again. 94 template <class D, class Func, typename T = TFromD<D>> 95 T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, 96 const Func& func) { 97 const size_t N = Lanes(d); 98 99 size_t idx = 0; 100 if (count >= N) { 101 for (; idx <= count - N; idx += N) { 102 const Vec<D> v = LoadU(d, from + idx); 103 to += CompressBlendedStore(v, func(d, v), d, to); 104 } 105 } 106 107 // `count` was a multiple of the vector length `N`: already done. 108 if (HWY_UNLIKELY(idx == count)) return to; 109 110 #if HWY_MEM_OPS_MIGHT_FAULT 111 // Proceed one by one. 112 const CappedTag<T, 1> d1; 113 for (; idx < count; ++idx) { 114 using V1 = Vec<decltype(d1)>; 115 // Workaround for -Waggressive-loop-optimizations on GCC 8 116 // (iteration 2305843009213693951 invokes undefined behavior for T=i64) 117 const uintptr_t addr = reinterpret_cast<uintptr_t>(from); 118 const T* HWY_RESTRICT from_idx = 119 reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T))); 120 const V1 v = LoadU(d1, from_idx); 121 // Avoid storing to `to` unless we know it should be kept - otherwise, we 122 // might overrun the end if it was allocated for the exact count. 123 if (CountTrue(d1, func(d1, v)) == 0) continue; 124 StoreU(v, d1, to); 125 to += 1; 126 } 127 #else 128 // Start index of the last unaligned whole vector, ending at the array end. 129 const size_t last = count - N; 130 // Number of elements before `from` or already written. 131 const size_t invalid = idx - last; 132 HWY_DASSERT(0 != invalid && invalid < N); 133 const Mask<D> mask = Not(FirstN(d, invalid)); 134 const Vec<D> v = MaskedLoad(mask, d, from + last); 135 to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); 136 #endif 137 return to; 138 } 139 140 // NOLINTNEXTLINE(google-readability-namespace-comments) 141 } // namespace HWY_NAMESPACE 142 } // namespace hwy 143 HWY_AFTER_NAMESPACE(); 144 145 #endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_