tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 4f6491fb0a51aae31138e0f5e2fa01c781415d2c
parent 67e7e29a2a9f7c6f6bec40490c2d22334a3ed3af
Author: Mike Hommey <mh+mozilla@glandium.org>
Date:   Tue,  2 Dec 2025 06:31:33 +0000

Bug 2001260 - Update highway to the latest commit from git. r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D274666

Diffstat:
Mmedia/highway/moz.yaml | 6+++---
Mmedia/highway/mozilla.patch | 4++--
Mthird_party/highway/BUILD | 172++++---------------------------------------------------------------------------
Mthird_party/highway/CMakeLists.txt | 48++++--------------------------------------------
Mthird_party/highway/LICENSE | 174++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Dthird_party/highway/LICENSE-BSD3 | 27---------------------------
Mthird_party/highway/README.md | 39+++++++++++++++++++++++++--------------
Mthird_party/highway/hwy/aligned_allocator.h | 2+-
Mthird_party/highway/hwy/auto_tune.h | 34+++++++++++++++++++++++-----------
Mthird_party/highway/hwy/base.h | 66+++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mthird_party/highway/hwy/bit_set.h | 268++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mthird_party/highway/hwy/bit_set_test.cc | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mthird_party/highway/hwy/cache_control.h | 5+++--
Mthird_party/highway/hwy/contrib/algo/find_test.cc | 4++--
Mthird_party/highway/hwy/contrib/algo/transform_test.cc | 12++++++------
Mthird_party/highway/hwy/contrib/math/math-inl.h | 1-
Mthird_party/highway/hwy/contrib/math/math_test.cc | 496+++----------------------------------------------------------------------------
Mthird_party/highway/hwy/contrib/matvec/matvec-inl.h | 8++++----
Mthird_party/highway/hwy/contrib/matvec/matvec_test.cc | 3+--
Mthird_party/highway/hwy/contrib/random/random-inl.h | 6++----
Mthird_party/highway/hwy/contrib/sort/BUILD | 2++
Mthird_party/highway/hwy/contrib/sort/algo-inl.h | 38+++++++++-----------------------------
Mthird_party/highway/hwy/contrib/sort/bench_sort.cc | 3+--
Mthird_party/highway/hwy/contrib/sort/print_network.cc | 2+-
Mthird_party/highway/hwy/contrib/sort/result-inl.h | 20++++++++++----------
Mthird_party/highway/hwy/contrib/sort/shared-inl.h | 14+++++++-------
Mthird_party/highway/hwy/contrib/sort/sort_test.cc | 10+++++++---
Mthird_party/highway/hwy/contrib/sort/sort_unit_test.cc | 12+++++++-----
Mthird_party/highway/hwy/contrib/sort/sorting_networks-inl.h | 10++++++++++
Mthird_party/highway/hwy/contrib/sort/vqsort_f16a.cc | 7++++---
Mthird_party/highway/hwy/contrib/sort/vqsort_f16d.cc | 7++++---
Mthird_party/highway/hwy/contrib/thread_pool/futex.h | 8++++++++
Mthird_party/highway/hwy/contrib/thread_pool/spin.h | 27+++++++++++++++++----------
Mthird_party/highway/hwy/contrib/thread_pool/thread_pool.h | 1689+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mthird_party/highway/hwy/contrib/thread_pool/thread_pool_test.cc | 67+++++++++++++++++++++++++++++++------------------------------------
Mthird_party/highway/hwy/contrib/thread_pool/topology.cc | 37++++++++++++++++++++++++++++++++-----
Mthird_party/highway/hwy/detect_compiler_arch.h | 26++++++++++++++++++++++----
Mthird_party/highway/hwy/detect_targets.h | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mthird_party/highway/hwy/highway.h | 15+++++++++++++++
Mthird_party/highway/hwy/highway_test.cc | 2+-
Mthird_party/highway/hwy/nanobenchmark.h | 6+++---
Mthird_party/highway/hwy/ops/arm_neon-inl.h | 22+++++++++-------------
Mthird_party/highway/hwy/ops/arm_sve-inl.h | 36+++++++++++++++++++++++++++++++-----
Mthird_party/highway/hwy/ops/generic_ops-inl.h | 83+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mthird_party/highway/hwy/ops/loongarch_lasx-inl.h | 21+++++++++++++++++++--
Mthird_party/highway/hwy/ops/loongarch_lsx-inl.h | 23++++++++++++++++++++++-
Mthird_party/highway/hwy/ops/ppc_vsx-inl.h | 13+++++++++++++
Mthird_party/highway/hwy/ops/rvv-inl.h | 66+++++++++++++++++++++++++++++++++++++++++++-----------------------
Mthird_party/highway/hwy/ops/scalar-inl.h | 11+++++++++++
Mthird_party/highway/hwy/ops/set_macros-inl.h | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Mthird_party/highway/hwy/ops/x86_128-inl.h | 53+++++++++++++++++++++++++++++++++++++++++++++++------
Mthird_party/highway/hwy/ops/x86_256-inl.h | 23++++++++++++++++++++---
Mthird_party/highway/hwy/ops/x86_512-inl.h | 35++++++++++++++++++++++++++++++-----
Mthird_party/highway/hwy/perf_counters.cc | 2+-
Mthird_party/highway/hwy/perf_counters.h | 2+-
Mthird_party/highway/hwy/print-inl.h | 2++
Mthird_party/highway/hwy/print.cc | 22++++++++++++++--------
Mthird_party/highway/hwy/profiler.cc | 60++++++++++++++++++++++++++++++------------------------------
Mthird_party/highway/hwy/profiler.h | 617+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mthird_party/highway/hwy/stats.cc | 8++++----
Mthird_party/highway/hwy/targets.cc | 22+++++++++++++++++-----
Mthird_party/highway/hwy/timer.h | 44++++++++++++++++++++++++++++++++++++++++++++
62 files changed, 2794 insertions(+), 2054 deletions(-)

diff --git a/media/highway/moz.yaml b/media/highway/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 1.3.0 (2025-08-14T00:25:53-07:00). + release: 2a67c5110fd5938a05d95d19fda3a806c9e61741 (2025-12-01T18:22:39Z). # Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 1.3.0 + revision: 2a67c5110fd5938a05d95d19fda3a806c9e61741 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ @@ -46,7 +46,7 @@ vendoring: url: https://github.com/google/highway source-hosting: github vendor-directory: third_party/highway - tracking: tag + tracking: commit exclude: - g3doc/ diff --git a/media/highway/mozilla.patch b/media/highway/mozilla.patch @@ -30,9 +30,9 @@ diff --git a/hwy/base.h b/hwy/base.h +#include <mozilla/Attributes.h> + - // API version (https://semver.org/); keep in sync with CMakeLists.txt. + // API version (https://semver.org/); keep in sync with CMakeLists.txt and + // meson.build. 
#define HWY_MAJOR 1 - #define HWY_MINOR 3 @@ -1283,7 +1285,7 @@ struct alignas(2) float16_t { #if HWY_HAVE_SCALAR_F16_TYPE // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit diff --git a/third_party/highway/BUILD b/third_party/highway/BUILD @@ -1,8 +1,7 @@ -# Placeholder#1 for Guitar, do not remove # Placeholder for cc_test, do not remove load("@bazel_skylib//lib:selects.bzl", "selects") +load("//:hwy_tests.bzl", "HWY_TESTS") load("@rules_license//rules:license.bzl", "license") -# Placeholder#2 for Guitar, do not remove package( default_applicable_licenses = ["//:license"], @@ -360,6 +359,9 @@ cc_library( cc_library( name = "thread_pool", + srcs = [ + "hwy/contrib/thread_pool/thread_pool.cc", + ], hdrs = [ "hwy/contrib/thread_pool/futex.h", "hwy/contrib/thread_pool/spin.h", @@ -502,7 +504,10 @@ cc_test( name = "list_targets", size = "small", srcs = ["hwy/tests/list_targets.cc"], - deps = [":hwy"], + deps = [ + ":hwy", + ":timer", + ], ) cc_test( @@ -516,165 +521,6 @@ cc_test( ], ) -# path, name, deps -HWY_CONTRIB_TESTS = ( - ( - "hwy/contrib/algo/", - "copy_test", - (":algo",), - ), - ( - "hwy/contrib/algo/", - "find_test", - (":algo",), - ), - ( - "hwy/contrib/algo/", - "transform_test", - (":algo",), - ), - ( - "hwy/contrib/bit_pack/", - "bit_pack_test", - (":bit_pack",), - ), - ( - "hwy/contrib/dot/", - "dot_test", - (":dot",), - ), - ( - "hwy/contrib/image/", - "image_test", - (":image",), - ), - ( - "hwy/contrib/math/", - "math_test", - (":math",), - ), - ( - "hwy/contrib/random/", - "random_test", - (":random",), - ), - ( - "hwy/contrib/matvec/", - "matvec_test", - (":matvec", ":algo", ":topology", ":thread_pool"), - ), - ( - "hwy/contrib/thread_pool/", - "spin_test", - (":topology", ":thread_pool"), - ), - ( - "hwy/contrib/thread_pool/", - "thread_pool_test", - (":topology", ":thread_pool", ":profiler"), - ), - ( - "hwy/contrib/thread_pool/", - "topology_test", - (":thread_pool", ":topology"), - ), - ( - "hwy/contrib/unroller/", - 
"unroller_test", - (":unroller",), - ), - # contrib/sort has its own BUILD, we also add sort_test to GUITAR_TESTS. - # To run bench_sort, specify --test=hwy/contrib/sort:bench_sort. -) - -# path, name, deps -HWY_TESTS = HWY_CONTRIB_TESTS + ( - ( - "hwy/examples/", - "skeleton_test", - (":skeleton",), - ), - ("hwy/", "abort_test", []), - ("hwy/", "aligned_allocator_test", []), - ( - "hwy/", - "auto_tune_test", - (":auto_tune",), - ), - ("hwy/", "base_test", []), - ( - "hwy/", - "bit_set_test", - (":bit_set",), - ), - ("hwy/", "highway_test", []), - ("hwy/", "nanobenchmark_test", []), - ( - "hwy/", - "perf_counters_test", - (":perf_counters", ":thread_pool"), - ), - ("hwy/", "targets_test", []), - ("hwy/tests/", "arithmetic_test", []), - ("hwy/tests/", "bit_permute_test", []), - ("hwy/tests/", "blockwise_combine_test", []), - ("hwy/tests/", "blockwise_shift_test", []), - ("hwy/tests/", "blockwise_test", []), - ("hwy/tests/", "cast_test", []), - ("hwy/tests/", "combine_test", []), - ("hwy/tests/", "compare_test", []), - ("hwy/tests/", "compress_test", []), - ("hwy/tests/", "complex_arithmetic_test", []), - ("hwy/tests/", "concat_test", []), - ("hwy/tests/", "convert_test", []), - ("hwy/tests/", "count_test", []), - ("hwy/tests/", "crypto_test", []), - ("hwy/tests/", "demote_test", []), - ("hwy/tests/", "div_test", []), - ("hwy/tests/", "dup128_vec_test", []), - ("hwy/tests/", "expand_test", []), - ("hwy/tests/", "float_test", []), - ("hwy/tests/", "fma_test", []), - ("hwy/tests/", "foreach_vec_test", []), - ("hwy/tests/", "if_test", []), - ("hwy/tests/", "in_range_float_to_int_conv_test", []), - ("hwy/tests/", "interleaved_test", []), - ("hwy/tests/", "logical_test", []), - ("hwy/tests/", "mask_combine_test", []), - ("hwy/tests/", "mask_convert_test", []), - ("hwy/tests/", "mask_mem_test", []), - ("hwy/tests/", "mask_set_test", []), - ("hwy/tests/", "mask_slide_test", []), - ("hwy/tests/", "mask_test", []), - ("hwy/tests/", "masked_arithmetic_test", []), - 
("hwy/tests/", "masked_minmax_test", []), - ("hwy/tests/", "memory_test", []), - ("hwy/tests/", "minmax_magnitude_test", []), - ("hwy/tests/", "minmax_number_test", []), - ("hwy/tests/", "minmax_test", []), - ("hwy/tests/", "minmax128_test", []), - ("hwy/tests/", "mul_by_pow2_test", []), - ("hwy/tests/", "mul_pairwise_test", []), - ("hwy/tests/", "mul_test", []), - ("hwy/tests/", "reduction_test", []), - ("hwy/tests/", "resize_test", []), - ("hwy/tests/", "reverse_test", []), - ("hwy/tests/", "rotate_test", []), - ("hwy/tests/", "saturated_test", []), - ("hwy/tests/", "shift_test", []), - ("hwy/tests/", "shuffle4_test", []), - ("hwy/tests/", "sign_test", []), - ("hwy/tests/", "slide_up_down_test", []), - ("hwy/tests/", "sums_abs_diff_test", []), - ("hwy/tests/", "swizzle_block_test", []), - ("hwy/tests/", "swizzle_test", []), - ("hwy/tests/", "table_test", []), - ("hwy/tests/", "test_util_test", []), - ("hwy/tests/", "truncate_test", []), - ("hwy/tests/", "tuple_test", []), - ("hwy/tests/", "widen_mul_test", []), -) - HWY_TEST_COPTS = select({ ":compiler_msvc": [], "//conditions:default": [ @@ -748,5 +594,3 @@ test_suite( name = "hwy_ops_tests", tags = ["hwy_ops_test"], ) - -# Placeholder for integration test, do not remove diff --git a/third_party/highway/CMakeLists.txt b/third_party/highway/CMakeLists.txt @@ -68,14 +68,6 @@ endif() # The following is only required with GCC < 6.1.0 or CLANG < 16.0 set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for Armv7 with NEON (requires vfpv4)?") -# Upstream compilers(GCC 14 or CLANG 18) start enabling LSX by default, but -# enabling LASX still require -mlasx flag to be passed explicitly, in order -# to enable all targets, we can check them directly, adding them if they are -# supported. In this way, Our local compilers(GCC 8.3.0 or CLANG 8.0.1) also -# could enable LSX & LASX targets. Any better ideas or suggestions are welcome. 
-set(HWY_CMAKE_LSX ON CACHE BOOL "Add -mlsx flag?") -set(HWY_CMAKE_LASX ON CACHE BOOL "Add -mlasx flag?") - # This must be set on 32-bit x86 with GCC < 13.1, otherwise math_test will be # skipped. For GCC 13.1+, you can also build with -fexcess-precision=standard. set(HWY_CMAKE_SSE2 OFF CACHE BOOL "Set SSE2 as baseline for 32-bit x86?") @@ -144,34 +136,6 @@ check_cxx_source_compiles( HWY_RISCV ) -if (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch32|loongarch64") - include(CheckCXXCompilerFlag) - if (HWY_CMAKE_LSX) - set (CMAKE_REQUIRED_FLAGS -mlsx) - check_cxx_source_compiles( - "int main() { - #if !defined(__loongarch_sx) - static_assert(false, \"__loongarch_sx is not defined\"); - #endif - return 0; - }" - COMPILER_SUPPORT_LSX_FLAG - ) - endif() - if (HWY_CMAKE_LASX) - set (CMAKE_REQUIRED_FLAGS -mlasx) - check_cxx_source_compiles( - "int main() { - #if !defined(__loongarch_asx) - static_assert(false, \"__loongarch_asx is not defined\"); - #endif - return 0; - }" - COMPILER_SUPPORT_LASX_FLAG - ) - endif() -endif() - if (WIN32) set (ORIG_CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES}) set (CMAKE_REQUIRED_LIBRARIES synchronization) @@ -276,6 +240,7 @@ list(APPEND HWY_CONTRIB_SOURCES hwy/contrib/sort/vqsort.h hwy/contrib/thread_pool/futex.h hwy/contrib/thread_pool/spin.h + hwy/contrib/thread_pool/thread_pool.cc hwy/contrib/thread_pool/thread_pool.h hwy/contrib/thread_pool/topology.cc hwy/contrib/thread_pool/topology.h @@ -419,14 +384,6 @@ else() -Wcast-align # see -Wcast-align=strict on x86 ) - # In case LSX/LASX are available but not enabled by defautl. - if(COMPILER_SUPPORT_LSX_FLAG) - list(APPEND HWY_FLAGS -mlsx) - endif() - if(COMPILER_SUPPORT_LASX_FLAG) - list(APPEND HWY_FLAGS -mlasx) - endif() - if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") list(APPEND HWY_FLAGS -Wfloat-overflow-conversion @@ -931,6 +888,9 @@ list(APPEND HWY_TEST_FILES # Disabled due to SIGILL in clang7 debug build during gtest discovery phase, # not reproducible locally. 
Still tested via bazel build. hwy/contrib/math/math_test.cc + hwy/contrib/math/math_hyper_test.cc + hwy/contrib/math/math_tan_test.cc + hwy/contrib/math/math_trig_test.cc hwy/contrib/random/random_test.cc hwy/contrib/sort/bench_sort.cc hwy/contrib/sort/sort_test.cc diff --git a/third_party/highway/LICENSE b/third_party/highway/LICENSE @@ -1,3 +1,16 @@ +This project is primarily dual-licensed under your choice of either the Apache +License 2.0 or the BSD 3-Clause License. + +The following files are licensed under different terms: +* hwy/contrib/random/random-inl.h: CC0 1.0 Universal + +The full texts of all applicable licenses are included below, separated by +'---'. + +-------------------------------------------------------------------------------- +Apache License 2.0 +-------------------------------------------------------------------------------- + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -198,4 +211,161 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. -\ No newline at end of file + limitations under the License. + +-------------------------------------------------------------------------------- +BSD 3-Clause License +-------------------------------------------------------------------------------- + +Copyright (c) The Highway Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +CC0 1.0 Universal +-------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. 
+ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. 
publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. 
Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. 
Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. +\ No newline at end of file diff --git a/third_party/highway/LICENSE-BSD3 b/third_party/highway/LICENSE-BSD3 @@ -1,26 +0,0 @@ -Copyright (c) The Highway Project Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -\ No newline at end of file diff --git a/third_party/highway/README.md b/third_party/highway/README.md @@ -29,7 +29,7 @@ functions that map well to CPU instructions without extensive compiler transformations. The resulting code is more predictable and robust to code changes/compiler updates than autovectorization. -**Works on widely-used platforms**: Highway supports five architectures; the +**Works on widely-used platforms**: Highway supports seven architectures; the same application code can target various instruction sets, including those with 'scalable' vectors (size unknown at compile time). Highway only requires C++11 and supports four families of compilers. If you would like to use Highway on @@ -41,7 +41,7 @@ runtime. Alternatively, developers may choose to target a single instruction set without any runtime overhead. In both cases, the application code is the same except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one line of code. See also @kfjahnke's -[introduction to dispatching](https://github.com/kfjahnke/zimt/blob/multi_isa/examples/multi_isa_example/multi_simd_isa.md). 
+[introduction to dispatching](https://github.com/kfjahnke/zimt/blob/main/examples/multi_isa_example/multi_simd_isa.md). **Suitable for a variety of domains**: Highway provides an extensive set of operations, used for image processing (floating-point), compression, video @@ -82,30 +82,39 @@ us via the below email. * Audio: [Zimtohrli perceptual metric](https://github.com/google/zimtohrli) * Browsers: Chromium (+Vivaldi), Firefox (+floorp / foxhound / librewolf / Waterfox) -* Computational biology: [RNA analysis](https://github.com/bnprks/BPCells) -* Computer graphics: [Sparse voxel renderer](https://github.com/rools/voxl) +* Computational biology: [RNA analysis](https://github.com/bnprks/BPCells), + [long-sequence preprocessing](https://github.com/OpenGene/fastplong) +* Computer graphics: ghostty-org/ghostty, + [Sparse voxel renderer](https://github.com/rools/voxl), + [tgfx 2D Graphics library](https://github.com/Tencent/tgfx) * Cryptography: google/distributed_point_functions, google/shell-encryption * Data structures: bkille/BitLib * Image codecs: eustas/2im, [Grok JPEG 2000](https://github.com/GrokImageCompression/grok), [JPEG XL](https://github.com/libjxl/libjxl), [JPEGenc](https://github.com/osamu620/JPEGenc), - [Jpegli](https://github.com/google/jpegli), OpenHTJ2K -* Image processing: cloudinary/ssimulacra2, m-ab-s/media-autobuild_suite, - [libvips](https://github.com/libvips/libvips) + [Jpegli](https://github.com/google/jpegli), + [libaom](https://aomedia.googlesource.com/aom/), + [OpenHTJ2K](https://github.com/osamu620/OpenHTJ2K) +* Image processing: awxkee/aire, cloudinary/ssimulacra2, + [libvips](https://github.com/libvips/libvips), m-ab-s/media-autobuild_suite, * Image viewers: AlienCowEatCake/ImageViewer, diffractor/diffractor, - mirillis/jpegxl-wic, - [Lux panorama/image viewer](https://bitbucket.org/kfj/pv/) + [Lux panorama/image viewer](https://bitbucket.org/kfj/pv/), + mirillis/jpegxl-wic * Information retrieval: [iresearch database 
index](https://github.com/iresearch-toolkit/iresearch), michaeljclark/zvec, [nebula interactive analytics / OLAP](https://github.com/varchar-io/nebula), - [ScaNN Scalable Nearest Neighbors](https://github.com/google-research/google-research/tree/7a269cb2ce0ae1db591fe11b62cbc0be7d72532a/scann), - [vectorlite vector search](https://github.com/1yefuwang1/vectorlite/) -* Machine learning: [gemma.cpp](https://github.com/google/gemma.cpp), - Tensorflow, Numpy, zpye/SimpleInfer + [`ScaNN` Scalable Nearest Neighbors](https://github.com/google-research/google-research/tree/7a269cb2ce0ae1db591fe11b62cbc0be7d72532a/scann), +* Machine learning: array2d/deepx, + [gemma.cpp](https://github.com/google/gemma.cpp), Tensorflow, Numpy, + zpye/SimpleInfer +* Programming languages: + [AOT-compiled python](https://github.com/exaloop/codon), oven-sh/bun, V8/V8, + yinqiwen/rapidudf * Robotics: [MIT Model-Based Design and Verification](https://github.com/RobotLocomotion/drake) +* Vector search: 1yefuwang1/vectorlite, vespa-engine/vespa Other @@ -144,12 +153,13 @@ See also the list at https://repology.org/project/highway-simd-library/versions ### Targets -Highway supports 24 targets, listed in alphabetical order of platform: +Highway supports 27 targets, listed in alphabetical order of platform: - Any: `EMU128`, `SCALAR`; - Armv7+: `NEON_WITHOUT_AES`, `NEON`, `NEON_BF16`, `SVE`, `SVE2`, `SVE_256`, `SVE2_128`; - IBM Z: `Z14`, `Z15`; +- LoongArch: `LSX`, `LASX`; - POWER: `PPC8` (v2.07), `PPC9` (v3.0), `PPC10` (v3.1B, not yet supported due to compiler bugs, see #1207; also requires QEMU 7.2); - RISC-V: `RVV` (1.0); @@ -168,6 +178,7 @@ Highway supports 24 targets, listed in alphabetical order of platform: by defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch, but enabled by default for runtime dispatch), - `AVX3_SPR` (~Sapphire Rapids, includes AVX-512FP16) + - `AVX10_2` (~Diamond Rapids) Our policy is that unless otherwise specified, targets will remain supported as long as they 
can be (cross-)compiled with currently supported Clang or GCC, and diff --git a/third_party/highway/hwy/aligned_allocator.h b/third_party/highway/hwy/aligned_allocator.h @@ -210,7 +210,7 @@ AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc( T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque); if (ptr != nullptr) { for (size_t i = 0; i < items; i++) { - new (ptr + i) T(std::forward<Args>(args)...); + new (ptr + i) T(args...); } } return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque)); diff --git a/third_party/highway/hwy/auto_tune.h b/third_party/highway/hwy/auto_tune.h @@ -25,7 +25,18 @@ #include "hwy/aligned_allocator.h" // Span #include "hwy/base.h" // HWY_MIN -#include "hwy/contrib/sort/vqsort.h" + +// configuration to allow auto_tune to use std::sort instead of VQSort +// (also enabled in header only mode). +#if defined(HWY_HEADER_ONLY) +#define HWY_AUTOTUNE_STDSORT +#endif + +#ifdef HWY_AUTOTUNE_STDSORT +#include <algorithm> // std::sort +#else +#include "hwy/contrib/sort/vqsort.h" // VQSort +#endif // Infrastructure for auto-tuning (choosing optimal parameters at runtime). @@ -104,6 +115,10 @@ class CostDistribution { private: static double Median(double* to_sort, size_t n) { HWY_DASSERT(n >= 2); + +#ifdef HWY_AUTOTUNE_STDSORT + std::sort(to_sort, to_sort + n); +#else // F64 is supported everywhere except Armv7. #if !HWY_ARCH_ARM_V7 VQSort(to_sort, n, SortAscending()); @@ -112,6 +127,8 @@ class CostDistribution { // equivalent. VQSort(reinterpret_cast<uint64_t*>(to_sort), n, SortAscending()); #endif +#endif + if (n & 1) return to_sort[n / 2]; // Even length: average of two middle elements. 
return (to_sort[n / 2] + to_sort[n / 2 - 1]) * 0.5; @@ -246,15 +263,11 @@ class CostDistribution { OnlineNotify(copy[i]); } HWY_DASSERT(IsOnline()); - -#if SIZE_MAX == 0xFFFFFFFFu - (void)padding_; -#endif } size_t num_values_ = 0; // size of `values_` <= `kMaxValues` #if SIZE_MAX == 0xFFFFFFFFu - uint32_t padding_ = 0; + HWY_MAYBE_UNUSED uint32_t padding_ = 0; #endif double online_n_ = 0.0; // number of calls to `OnlineNotify`. @@ -405,10 +418,9 @@ class AutoTune { const Config* Best() const { return best_; } // If false, caller must call `SetCandidates` before `NextConfig`. - bool HasCandidates() const { - HWY_DASSERT(!Best()); - return !candidates_.empty(); - } + // NOTE: also called after Best() is non-null. + bool HasCandidates() const { return !candidates_.empty(); } + // WARNING: invalidates `Best()`, do not call if that is non-null. void SetCandidates(std::vector<Config> candidates) { HWY_DASSERT(!Best() && !HasCandidates()); @@ -429,7 +441,7 @@ class AutoTune { // Returns the current `Config` to measure. const Config& NextConfig() const { - HWY_DASSERT(!Best() && HasCandidates()); + HWY_DASSERT(HasCandidates()); return candidates_[config_idx_]; } diff --git a/third_party/highway/hwy/base.h b/third_party/highway/hwy/base.h @@ -22,8 +22,8 @@ #include <stddef.h> #include <stdint.h> #if defined(HWY_HEADER_ONLY) -#include <cstdarg> -#include <cstdio> +#include <stdarg.h> +#include <stdio.h> #endif #if !defined(HWY_NO_LIBCXX) @@ -35,7 +35,8 @@ #include <mozilla/Attributes.h> -// API version (https://semver.org/); keep in sync with CMakeLists.txt. +// API version (https://semver.org/); keep in sync with CMakeLists.txt and +// meson.build. 
#define HWY_MAJOR 1 #define HWY_MINOR 3 #define HWY_PATCH 0 @@ -59,7 +60,7 @@ #endif // !HWY_IDE -#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC +#if !defined(HWY_NO_LIBCXX) || HWY_COMPILER_MSVC #include <atomic> #endif @@ -150,6 +151,21 @@ #endif // !HWY_COMPILER_MSVC +#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \ + (HWY_COMPILER_ICC && !HWY_COMPILER_ICX) +// The use of __attribute__((unused)) in private class member variables triggers +// a compiler warning with GCC 11 and earlier and ICC + +// GCC 11 and earlier and ICC also do not emit -Wunused-private-field warnings +// for unused private class member variables +#define HWY_MEMBER_VAR_MAYBE_UNUSED +#else +// Clang and ICX need __attribute__((unused)) in unused private class member +// variables to suppress -Wunused-private-field warnings unless this warning is +// ignored by using HWY_DIAGNOSTICS_OFF +#define HWY_MEMBER_VAR_MAYBE_UNUSED HWY_MAYBE_UNUSED +#endif + //------------------------------------------------------------------------------ // Builtin/attributes (no more #include after this point due to namespace!) @@ -204,7 +220,9 @@ namespace hwy { //------------------------------------------------------------------------------ // Macros -#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED +// Note: it is safe to remove `static` for users who want to use modules, but +// that might be a breaking change for some users, hence we do not by default. +#define HWY_API static HWY_INLINE HWY_FLATTEN #define HWY_CONCAT_IMPL(a, b) a##b #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b) @@ -401,10 +419,19 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) #endif // For enabling HWY_DASSERT and shortening tests in slower debug builds +// +// Note: `HWY_IS_UBSAN` is specifically excluded from engaging debug +// builds. This is in service of Chromium's `-fsanitize=array-bounds` by +// default, where we don't want Highway to unconditionally build in +// debug mode. 
+// +// See also: +// https://docs.google.com/document/d/1eCtY4AZF-SiFHxhIYWzEytdIx3C24de7ccD6Y5Gn2H8/edit?tab=t.9zkn85hr82ms#heading=h.efcshvfql42c #if !defined(HWY_IS_DEBUG_BUILD) // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent // MSVC defines NDEBUG (if not, could instead check _DEBUG). -#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_SANITIZER || \ +#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || \ + (HWY_IS_ASAN || (HWY_IS_SANITIZER && !HWY_IS_UBSAN)) || \ defined(__clang_analyzer__) #define HWY_IS_DEBUG_BUILD 1 #else @@ -1720,9 +1747,9 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>( // bf16 <-> f32 in convert_test results in 0x2525 for 1.0 instead of 0x3f80. // Reported at https://github.com/llvm/llvm-project/issues/151692. #ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE -#if HWY_ARCH_X86 && defined(__SSE2__) && \ - ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL && \ - !HWY_IS_DEBUG_BUILD) || \ +#if HWY_ARCH_X86 && defined(__SSE2__) && \ + ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL && \ + (!HWY_IS_DEBUG_BUILD || HWY_COMPILER3_CLANG >= 220101)) || \ HWY_COMPILER_GCC_ACTUAL >= 1300) #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1 #else @@ -1740,7 +1767,11 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>( #ifndef HWY_HAVE_SCALAR_BF16_OPERATORS // Recent enough compiler also has operators. aarch64 clang 18 hits internal // compiler errors on bf16 ToString, hence only enable on GCC for now. -#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300) +// GCC >= 13 will insert a function call to the __extendbfsf2 helper function +// for scalar conversions from __bf16 to float. This is prohibitively expensive, +// so refrain from using scalar BF16 operators on these compiler versions. 
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121853 +#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1700) #define HWY_HAVE_SCALAR_BF16_OPERATORS 1 #else #define HWY_HAVE_SCALAR_BF16_OPERATORS 0 @@ -2736,7 +2767,7 @@ HWY_API constexpr TTo ConvertScalarTo(TFrom in) { template <typename T1, typename T2> constexpr inline T1 DivCeil(T1 a, T2 b) { #if HWY_CXX_LANG >= 201703L - HWY_DASSERT(b != 0); + HWY_DASSERT(b != T2{0}); #endif return (a + b - 1) / b; } @@ -2959,9 +2990,10 @@ HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) { // 64 x 64 = 128 bit multiplication HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) { #if defined(__SIZEOF_INT128__) - __uint128_t product = (__uint128_t)a * (__uint128_t)b; - *upper = (uint64_t)(product >> 64); - return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL); + __uint128_t product = + static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + *upper = static_cast<uint64_t>(product >> 64); + return static_cast<uint64_t>(product & 0xFFFFFFFFFFFFFFFFULL); #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 return _umul128(a, b, upper); #else @@ -2978,9 +3010,9 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) { HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) { #if defined(__SIZEOF_INT128__) - __int128_t product = (__int128_t)a * (__int128_t)b; - *upper = (int64_t)(product >> 64); - return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL); + __int128_t product = static_cast<__int128_t>(a) * static_cast<__int128_t>(b); + *upper = static_cast<int64_t>(product >> 64); + return static_cast<int64_t>(product & 0xFFFFFFFFFFFFFFFFULL); #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 return _mul128(a, b, upper); #else diff --git a/third_party/highway/hwy/bit_set.h b/third_party/highway/hwy/bit_set.h @@ -16,17 +16,21 @@ #ifndef HIGHWAY_HWY_BIT_SET_H_ #define HIGHWAY_HWY_BIT_SET_H_ -// BitSet with fast Foreach for up to 64 and 4096 members. 
+// Various BitSet for 64, up to 4096, or any number of bits. #include <stddef.h> +#include <atomic> + #include "hwy/base.h" namespace hwy { -// 64-bit specialization of std::bitset, which lacks Foreach. +// 64-bit specialization of `std::bitset`, which lacks `Foreach`. class BitSet64 { public: + constexpr size_t MaxSize() const { return 64; } + // No harm if `i` is already set. void Set(size_t i) { HWY_DASSERT(i < 64); @@ -48,15 +52,24 @@ class BitSet64 { return (bits_ & (1ULL << i)) != 0; } - // Returns true if any Get(i) would return true for i in [0, 64). + // Returns true if Get(i) would return true for any i in [0, 64). bool Any() const { return bits_ != 0; } - // Returns lowest i such that Get(i). Caller must ensure Any() beforehand! + // Returns true if Get(i) would return true for all i in [0, 64). + bool All() const { return bits_ == ~uint64_t{0}; } + + // Returns lowest i such that `Get(i)`. Caller must first ensure `Any()`! size_t First() const { HWY_DASSERT(Any()); return Num0BitsBelowLS1Bit_Nonzero64(bits_); } + // Returns lowest i such that `!Get(i)`. Caller must first ensure `!All()`! + size_t First0() const { + HWY_DASSERT(!All()); + return Num0BitsBelowLS1Bit_Nonzero64(~bits_); + } + // Returns uint64_t(Get(i)) << i for i in [0, 64). uint64_t Get64() const { return bits_; } @@ -78,10 +91,226 @@ class BitSet64 { uint64_t bits_ = 0; }; -// Two-level bitset for up to kMaxSize <= 4096 values. +// Any number of bits, flat array. +template <size_t kMaxSize> +class BitSet { + static_assert(kMaxSize != 0, "BitSet requires non-zero size"); + + public: + constexpr size_t MaxSize() const { return kMaxSize; } + + // No harm if `i` is already set. 
+ void Set(size_t i) { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + bits_[idx].Set(mod); + } + + void Clear(size_t i) { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + bits_[idx].Clear(mod); + HWY_DASSERT(!Get(i)); + } + + bool Get(size_t i) const { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + return bits_[idx].Get(mod); + } + + // Returns true if Get(i) would return true for any i in [0, kMaxSize). + bool Any() const { + for (const BitSet64& bits : bits_) { + if (bits.Any()) return true; + } + return false; + } + + // Returns true if Get(i) would return true for all i in [0, kMaxSize). + bool All() const { + for (size_t idx = 0; idx < kNum64 - 1; ++idx) { + if (!bits_[idx].All()) return false; + } + + constexpr size_t kRemainder = kMaxSize % 64; + if (kRemainder == 0) { + return bits_[kNum64 - 1].All(); + } + return bits_[kNum64 - 1].Count() == kRemainder; + } + + // Returns lowest i such that `Get(i)`. Caller must first ensure `Any()`! + size_t First() const { + HWY_DASSERT(Any()); + for (size_t idx = 0;; ++idx) { + HWY_DASSERT(idx < kNum64); + if (bits_[idx].Any()) return idx * 64 + bits_[idx].First(); + } + } + + // Returns lowest i such that `!Get(i)`. Caller must first ensure `All()`! + size_t First0() const { + HWY_DASSERT(!All()); + for (size_t idx = 0;; ++idx) { + HWY_DASSERT(idx < kNum64); + if (!bits_[idx].All()) { + const size_t first0 = idx * 64 + bits_[idx].First0(); + HWY_DASSERT(first0 < kMaxSize); + return first0; + } + } + } + + // Calls `func(i)` for each `i` in the set. It is safe for `func` to modify + // the set, but the current Foreach call is only affected if changing one of + // the not yet visited BitSet64. 
+ template <class Func> + void Foreach(const Func& func) const { + for (size_t idx = 0; idx < kNum64; ++idx) { + bits_[idx].Foreach([idx, &func](size_t mod) { func(idx * 64 + mod); }); + } + } + + size_t Count() const { + size_t total = 0; + for (const BitSet64& bits : bits_) { + total += bits.Count(); + } + return total; + } + + private: + static constexpr size_t kNum64 = DivCeil(kMaxSize, size_t{64}); + BitSet64 bits_[kNum64]; +}; + +// Any number of bits, flat array, atomic updates to the u64. +template <size_t kMaxSize> +class AtomicBitSet { + static_assert(kMaxSize != 0, "AtomicBitSet requires non-zero size"); + + // Bits may signal something to other threads, hence relaxed is insufficient. + // Acq/Rel ensures a happens-before relationship. + static constexpr auto kAcq = std::memory_order_acquire; + static constexpr auto kRel = std::memory_order_release; + + public: + constexpr size_t MaxSize() const { return kMaxSize; } + + // No harm if `i` is already set. + void Set(size_t i) { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + bits_[idx].fetch_or(1ULL << mod, kRel); + } + + void Clear(size_t i) { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + bits_[idx].fetch_and(~(1ULL << mod), kRel); + HWY_DASSERT(!Get(i)); + } + + bool Get(size_t i) const { + HWY_DASSERT(i < kMaxSize); + const size_t idx = i / 64; + const size_t mod = i % 64; + return ((bits_[idx].load(kAcq) & (1ULL << mod))) != 0; + } + + // Returns true if Get(i) would return true for any i in [0, kMaxSize). + bool Any() const { + for (const std::atomic<uint64_t>& bits : bits_) { + if (bits.load(kAcq)) return true; + } + return false; + } + + // Returns true if Get(i) would return true for all i in [0, kMaxSize). 
+ bool All() const { + for (size_t idx = 0; idx < kNum64 - 1; ++idx) { + if (bits_[idx].load(kAcq) != ~uint64_t{0}) return false; + } + + constexpr size_t kRemainder = kMaxSize % 64; + const uint64_t last_bits = bits_[kNum64 - 1].load(kAcq); + if (kRemainder == 0) { + return last_bits == ~uint64_t{0}; + } + return PopCount(last_bits) == kRemainder; + } + + // Returns lowest i such that `Get(i)`. Caller must first ensure `Any()`! + size_t First() const { + HWY_DASSERT(Any()); + for (size_t idx = 0;; ++idx) { + HWY_DASSERT(idx < kNum64); + const uint64_t bits = bits_[idx].load(kAcq); + if (bits != 0) { + return idx * 64 + Num0BitsBelowLS1Bit_Nonzero64(bits); + } + } + } + + // Returns lowest i such that `!Get(i)`. Caller must first ensure `!All()`! + size_t First0() const { + HWY_DASSERT(!All()); + for (size_t idx = 0;; ++idx) { + HWY_DASSERT(idx < kNum64); + const uint64_t inv_bits = ~bits_[idx].load(kAcq); + if (inv_bits != 0) { + const size_t first0 = + idx * 64 + Num0BitsBelowLS1Bit_Nonzero64(inv_bits); + HWY_DASSERT(first0 < kMaxSize); + return first0; + } + } + } + + // Calls `func(i)` for each `i` in the set. It is safe for `func` to modify + // the set, but the current Foreach call is only affected if changing one of + // the not yet visited uint64_t. + template <class Func> + void Foreach(const Func& func) const { + for (size_t idx = 0; idx < kNum64; ++idx) { + uint64_t remaining_bits = bits_[idx].load(kAcq); + while (remaining_bits != 0) { + const size_t i = Num0BitsBelowLS1Bit_Nonzero64(remaining_bits); + remaining_bits &= remaining_bits - 1; // clear LSB + func(idx * 64 + i); + } + } + } + + size_t Count() const { + size_t total = 0; + for (const std::atomic<uint64_t>& bits : bits_) { + total += PopCount(bits.load(kAcq)); + } + return total; + } + + private: + static constexpr size_t kNum64 = DivCeil(kMaxSize, size_t{64}); + std::atomic<uint64_t> bits_[kNum64] = {}; +}; + +// Two-level bitset for up to `kMaxSize` <= 4096 values. 
The iterators +// (`Any/First/Foreach/Count`) are more efficient than `BitSet` for sparse sets. +// This comes at the cost of slightly slower mutators (`Set/Clear`). template <size_t kMaxSize = 4096> class BitSet4096 { + static_assert(kMaxSize != 0, "BitSet4096 requires non-zero size"); + public: + constexpr size_t MaxSize() const { return kMaxSize; } + // No harm if `i` is already set. void Set(size_t i) { HWY_DASSERT(i < kMaxSize); @@ -117,16 +346,38 @@ class BitSet4096 { return bits_[idx].Get(mod); } - // Returns true if any Get(i) would return true for i in [0, 64). + // Returns true if `Get(i)` would return true for any i in [0, kMaxSize). bool Any() const { return nonzero_.Any(); } - // Returns lowest i such that Get(i). Caller must ensure Any() beforehand! + // Returns true if `Get(i)` would return true for all i in [0, kMaxSize). + bool All() const { + // Do not check `nonzero_.All()` - that only works if `kMaxSize` is 4096. + if (nonzero_.Count() != kNum64) return false; + return Count() == kMaxSize; + } + + // Returns lowest i such that `Get(i)`. Caller must first ensure `Any()`! size_t First() const { HWY_DASSERT(Any()); const size_t idx = nonzero_.First(); return idx * 64 + bits_[idx].First(); } + // Returns lowest i such that `!Get(i)`. Caller must first ensure `!All()`! + size_t First0() const { + HWY_DASSERT(!All()); + // It is likely not worthwhile to have a separate `BitSet64` for `not_all_`, + // hence iterate over all u64. + for (size_t idx = 0;; ++idx) { + HWY_DASSERT(idx < kNum64); + if (!bits_[idx].All()) { + const size_t first0 = idx * 64 + bits_[idx].First0(); + HWY_DASSERT(first0 < kMaxSize); + return first0; + } + } + } + // Returns uint64_t(Get(i)) << i for i in [0, 64). 
uint64_t Get64() const { return bits_[0].Get64(); } @@ -149,8 +400,9 @@ class BitSet4096 { private: static_assert(kMaxSize <= 64 * 64, "One BitSet64 insufficient"); + static constexpr size_t kNum64 = DivCeil(kMaxSize, size_t{64}); BitSet64 nonzero_; - BitSet64 bits_[kMaxSize / 64]; + BitSet64 bits_[kNum64]; }; } // namespace hwy diff --git a/third_party/highway/hwy/bit_set_test.cc b/third_party/highway/hwy/bit_set_test.cc @@ -32,50 +32,79 @@ namespace hwy { namespace { -// Template arg for kMin avoids compiler behavior mismatch for lambda capture. -template <class Set, size_t kMax, size_t kMin = 0> -void TestSet() { +template <class Set> +void SmokeTest() { + constexpr size_t kMax = Set().MaxSize() - 1; + Set set; // Defaults to empty. HWY_ASSERT(!set.Any()); - HWY_ASSERT(set.Count() == 0); - set.Foreach( - [](size_t i) { HWY_ABORT("Set should be empty but got %zu\n", i); }); + HWY_ASSERT(!set.All()); HWY_ASSERT(!set.Get(0)); HWY_ASSERT(!set.Get(kMax)); + HWY_ASSERT(set.First0() == 0); + set.Foreach( + [](size_t i) { HWY_ABORT("Set should be empty but got %zu\n", i); }); + HWY_ASSERT(set.Count() == 0); // After setting, we can retrieve it. set.Set(kMax); HWY_ASSERT(set.Get(kMax)); HWY_ASSERT(set.Any()); + HWY_ASSERT(!set.All()); HWY_ASSERT(set.First() == kMax); - HWY_ASSERT(set.Count() == 1); + HWY_ASSERT(set.First0() == 0); set.Foreach([](size_t i) { HWY_ASSERT(i == kMax); }); - - // SetNonzeroBitsFrom64 does not clear old bits. - set.SetNonzeroBitsFrom64(1ull << kMin); - HWY_ASSERT(set.Any()); - HWY_ASSERT(set.First() == kMin); - HWY_ASSERT(set.Get(kMin)); - HWY_ASSERT(set.Get(kMax)); - HWY_ASSERT(set.Count() == 2); - set.Foreach([](size_t i) { HWY_ASSERT(i == kMin || i == kMax); }); + HWY_ASSERT(set.Count() == 1); // After clearing, it is empty again. 
- set.Clear(kMin); set.Clear(kMax); + set.Clear(0); // was not set + HWY_ASSERT(!set.Get(0)); + HWY_ASSERT(!set.Get(kMax)); HWY_ASSERT(!set.Any()); - HWY_ASSERT(set.Count() == 0); + HWY_ASSERT(!set.All()); + HWY_ASSERT(set.First0() == 0); set.Foreach( [](size_t i) { HWY_ABORT("Set should be empty but got %zu\n", i); }); - HWY_ASSERT(!set.Get(0)); - HWY_ASSERT(!set.Get(kMax)); + HWY_ASSERT(set.Count() == 0); +} + +TEST(BitSetTest, SmokeTestSet64) { SmokeTest<BitSet64>(); } +TEST(BitSetTest, SmokeTestSet) { SmokeTest<BitSet<320>>(); } +TEST(BitSetTest, SmokeTestAtomicSet) { SmokeTest<AtomicBitSet<400>>(); } +TEST(BitSetTest, SmokeTestSet4096) { SmokeTest<BitSet4096<>>(); } + +template <class Set> +void TestSetNonzeroBitsFrom64() { + constexpr size_t kMin = 0; + Set set; + set.SetNonzeroBitsFrom64(1ull << kMin); + HWY_ASSERT(set.Any()); + HWY_ASSERT(!set.All()); + HWY_ASSERT(set.Get(kMin)); + HWY_ASSERT(set.First() == kMin); + HWY_ASSERT(set.First0() == kMin + 1); + set.Foreach([](size_t i) { HWY_ASSERT(i == kMin); }); + HWY_ASSERT(set.Count() == 1); + + set.SetNonzeroBitsFrom64(0x70ULL); + HWY_ASSERT(set.Get(kMin) && set.Get(4) && set.Get(5) && set.Get(6)); + HWY_ASSERT(set.Any()); + HWY_ASSERT(!set.All()); + HWY_ASSERT(set.First() == kMin); // does not clear existing bits + HWY_ASSERT(set.First0() == kMin + 1); + set.Foreach([](size_t i) { HWY_ASSERT(i == kMin || (4 <= i && i <= 6)); }); + HWY_ASSERT(set.Count() == 4); } -TEST(BitSetTest, TestSet64) { TestSet<BitSet64, 63>(); } -TEST(BitSetTest, TestSet4096) { TestSet<BitSet4096<>, 4095>(); } +TEST(BitSetTest, TestSetNonzeroBits64) { TestSetNonzeroBitsFrom64<BitSet64>(); } +TEST(BitSetTest, TestSetNonzeroBits4096) { + TestSetNonzeroBitsFrom64<BitSet4096<>>(); +} -// Supports membership and random choice, for testing BitSet4096. +// Reference implementation using map (for sparse `BitSet4096`) and vector for +// random choice of elements. class SlowSet { public: // Inserting multiple times is a no-op. 
@@ -136,6 +165,7 @@ class SlowSet { template <class Set> void CheckSame(const Set& set) { HWY_ASSERT(set.Any() == (set.Count() != 0)); + HWY_ASSERT(set.All() == (set.Count() == set.MaxSize())); HWY_ASSERT(Count() == set.Count()); // Everything set has, we also have. set.Foreach([this](size_t i) { HWY_ASSERT(Get(i)); }); @@ -146,6 +176,12 @@ class SlowSet { if (set.Any()) { HWY_ASSERT(set.First() == idx_for_i_.begin()->first); } + if (!set.All()) { + const size_t idx0 = set.First0(); + HWY_ASSERT(idx0 < set.MaxSize()); + HWY_ASSERT(!set.Get(idx0)); + HWY_ASSERT(!Get(idx0)); + } } private: @@ -153,16 +189,17 @@ class SlowSet { std::map<size_t, size_t> idx_for_i_; }; -void TestSetRandom(uint64_t grow_prob) { - const uint32_t mod = 4096; +template <class Set> +void TestSetWithGrowProb(uint64_t grow_prob) { + constexpr uint32_t max_size = static_cast<uint32_t>(Set().MaxSize()); RandomState rng; // Multiple independent random tests: for (size_t rep = 0; rep < AdjustedReps(100); ++rep) { - BitSet4096<> set; + Set set; SlowSet slow_set; // Mutate sets via random walk and ensure they are the same afterwards. - for (size_t iter = 0; iter < 200; ++iter) { + for (size_t iter = 0; iter < AdjustedReps(1000); ++iter) { const uint64_t bits = (Random64(&rng) >> 10) & 0x3FF; if (bits > 980 && slow_set.Count() != 0) { // Small chance of reinsertion: already present, unchanged after. @@ -175,7 +212,7 @@ void TestSetRandom(uint64_t grow_prob) { HWY_ASSERT(count == set.Count()); } else if (bits < grow_prob) { // Set random value; no harm if already set. - const size_t i = static_cast<size_t>(Random32(&rng) % mod); + const size_t i = static_cast<size_t>(Random32(&rng) % max_size); slow_set.Set(i); set.Set(i); HWY_ASSERT(set.Get(i)); @@ -194,9 +231,23 @@ void TestSetRandom(uint64_t grow_prob) { } } -// Lower probability of growth so that the set is often nearly empty. 
-TEST(BitSetTest, TestSetRandomShrink) { TestSetRandom(400); } -TEST(BitSetTest, TestSetRandomGrow) { TestSetRandom(600); } +template <class Set> +void TestSetRandom() { + // Lower probability of growth so that the set is often nearly empty. + TestSetWithGrowProb<Set>(400); + + TestSetWithGrowProb<Set>(600); +} + +TEST(BitSetTest, TestSet64) { TestSetRandom<BitSet64>(); } +TEST(BitSetTest, TestSet41) { TestSetRandom<BitSet<41>>(); } +TEST(BitSetTest, TestSet) { TestSetRandom<BitSet<199>>(); } +// One partial u64 +TEST(BitSetTest, TestAtomicSet32) { TestSetRandom<AtomicBitSet<32>>(); } +// 3 whole u64 +TEST(BitSetTest, TestAtomicSet192) { TestSetRandom<AtomicBitSet<192>>(); } +TEST(BitSetTest, TestSet3000) { TestSetRandom<BitSet4096<3000>>(); } +TEST(BitSetTest, TestSet4096) { TestSetRandom<BitSet4096<>>(); } } // namespace } // namespace hwy diff --git a/third_party/highway/hwy/cache_control.h b/third_party/highway/hwy/cache_control.h @@ -98,9 +98,10 @@ template <typename T> HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { (void)p; #ifndef HWY_DISABLE_CACHE_CONTROL -#if HWY_ARCH_X86 +// Use _mm_prefetch on x86/x64, except when clang-cl is compiled with -mno-mmx. +#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__)) _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0); -#elif HWY_COMPILER_GCC // includes clang +#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not // desirable, so use the default 3 (keep in caches). __builtin_prefetch(p, /*write=*/0, /*hint=*/3); diff --git a/third_party/highway/hwy/contrib/algo/find_test.cc b/third_party/highway/hwy/contrib/algo/find_test.cc @@ -173,8 +173,8 @@ struct TestFindIf { // Includes out-of-range value 9 to test the not-found path. 
for (int val = min_val; val <= 9; ++val) { #if HWY_GENERIC_LAMBDA - const auto greater = [val](const auto d, const auto v) HWY_ATTR { - return Gt(v, Set(d, ConvertScalarTo<T>(val))); + const auto greater = [val](const auto d2, const auto v) HWY_ATTR { + return Gt(v, Set(d2, ConvertScalarTo<T>(val))); }; #else const GreaterThan greater(val); diff --git a/third_party/highway/hwy/contrib/algo/transform_test.cc b/third_party/highway/hwy/contrib/algo/transform_test.cc @@ -173,8 +173,8 @@ struct TestGenerate { // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that // the attribute also applies to lambdas? If so, remove HWY_ATTR. #if HWY_GENERIC_LAMBDA - const auto gen2 = [](const auto d, const auto vidx) - HWY_ATTR { return BitCast(d, Add(vidx, vidx)); }; + const auto gen2 = [](const auto d2, const auto vidx) + HWY_ATTR { return BitCast(d2, Add(vidx, vidx)); }; #else const Gen2 gen2; #endif @@ -245,8 +245,8 @@ struct TestTransform { // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that // the attribute also applies to lambdas? If so, remove HWY_ATTR. 
#if HWY_GENERIC_LAMBDA - const auto scal = [](const auto d, const auto v) HWY_ATTR { - return Mul(Set(d, ConvertScalarTo<T>(kAlpha)), v); + const auto scal = [](const auto d2, const auto v) HWY_ATTR { + return Mul(Set(d2, ConvertScalarTo<T>(kAlpha)), v); }; #else const SCAL scal; @@ -290,8 +290,8 @@ struct TestTransform1 { SimpleAXPY(a, b, expected.get(), count); #if HWY_GENERIC_LAMBDA - const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR { - return MulAdd(Set(d, ConvertScalarTo<T>(kAlpha)), v, v1); + const auto axpy = [](const auto d2, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d2, ConvertScalarTo<T>(kAlpha)), v, v1); }; #else const AXPY axpy; diff --git a/third_party/highway/hwy/contrib/math/math-inl.h b/third_party/highway/hwy/contrib/math/math-inl.h @@ -342,7 +342,6 @@ HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) { * Valid Lane Types: float32, float64 * Max Error: ULP = 1 * Valid Range: [-39000, +39000] - * @return sine and cosine of 'x' */ template <class D, class V> HWY_INLINE void SinCos(D d, V x, V& s, V& c); diff --git a/third_party/highway/hwy/contrib/math/math_test.cc b/third_party/highway/hwy/contrib/math/math_test.cc @@ -20,7 +20,6 @@ #include <cmath> // std::abs #include "hwy/base.h" -#include "hwy/nanobenchmark.h" // clang-format off #undef HWY_TARGET_INCLUDE @@ -89,7 +88,7 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T), // two pieces, [+0, max] and [-0, min], otherwise [min, max]. 
int range_count = 1; UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}}; - if ((min < 0.0) && (max > 0.0)) { + if ((min < T{0}) && (max > T{0})) { ranges[0][0] = BitCastScalar<UintT>(ConvertScalarTo<T>(+0.0)); ranges[0][1] = max_bits; ranges[1][0] = BitCastScalar<UintT>(ConvertScalarTo<T>(-0.0)); @@ -123,9 +122,9 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T), max_ulp = HWY_MAX(max_ulp, ulp); if (ulp > max_error_ulp) { fprintf(stderr, "%s: %s(%f) expected %E actual %E ulp %g max ulp %u\n", - hwy::TypeName(T(), Lanes(d)).c_str(), name, value, expected, - actual, static_cast<double>(ulp), - static_cast<uint32_t>(max_error_ulp)); + hwy::TypeName(T(), Lanes(d)).c_str(), name, value, + static_cast<double>(expected), static_cast<double>(actual), + static_cast<double>(ulp), static_cast<uint32_t>(max_error_ulp)); } } } @@ -140,101 +139,23 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T), } #undef DEFINE_MATH_TEST -#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \ - F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \ - struct Test##NAME { \ - template <class T, class D> \ - HWY_NOINLINE void operator()(T, D d) { \ - if (sizeof(T) == 4) { \ - TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \ - F32_ERROR); \ - } else { \ - TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, \ - static_cast<T>(F64_MIN), static_cast<T>(F64_MAX), \ - F64_ERROR); \ - } \ - } \ - }; \ +#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \ + F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \ + struct Test##NAME { \ + template <class T, class D, HWY_IF_T_SIZE(T, 4)> \ + HWY_NOINLINE void operator()(T, D d) { \ + TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \ + F32_ERROR); \ + } \ + template <class T, class D, HWY_IF_T_SIZE(T, 8)> \ + HWY_NOINLINE void operator()(T, D d) { \ + TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, static_cast<T>(F64_MIN), \ + static_cast<T>(F64_MAX), F64_ERROR); \ + } \ + }; \ 
DEFINE_MATH_TEST_FUNC(NAME) -// Floating point values closest to but less than 1.0. Avoid variables with -// static initializers inside HWY_BEFORE_NAMESPACE/HWY_AFTER_NAMESPACE to -// ensure target-specific code does not leak into startup code. -float kNearOneF() { return BitCastScalar<float>(0x3F7FFFFF); } -double kNearOneD() { return BitCastScalar<double>(0x3FEFFFFFFFFFFFFFULL); } - -// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so -// only increase the error tolerance there. -constexpr uint64_t Cos64ULP() { -#if defined(__MINGW32__) - return 23; -#else - return 3; -#endif -} - -constexpr uint64_t ACosh32ULP() { -#if defined(__MINGW32__) - return 8; -#else - return 3; -#endif -} - -template <class D> -static Vec<D> SinCosSin(const D d, VecArg<Vec<D>> x) { - Vec<D> s, c; - CallSinCos(d, x, s, c); - return s; -} - -template <class D> -static Vec<D> SinCosCos(const D d, VecArg<Vec<D>> x) { - Vec<D> s, c; - CallSinCos(d, x, s, c); - return c; -} - -// on targets without FMA the result is less inaccurate -constexpr uint64_t SinCosSin32ULP() { -#if !(HWY_NATIVE_FMA) - return 256; -#else - return 3; -#endif -} - -constexpr uint64_t SinCosCos32ULP() { -#if !(HWY_NATIVE_FMA) - return 64; -#else - return 3; -#endif -} - // clang-format off -DEFINE_MATH_TEST(Acos, - std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2 - std::acos, CallAcos, -1.0, +1.0, 2) -DEFINE_MATH_TEST(Acosh, - std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(), - std::acosh, CallAcosh, +1.0, +DBL_MAX, 3) -DEFINE_MATH_TEST(Asin, - std::asin, CallAsin, -1.0f, +1.0f, 4, // 4 ulp on Armv7, not 2 - std::asin, CallAsin, -1.0, +1.0, 2) -DEFINE_MATH_TEST(Asinh, - std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3, - std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3) -DEFINE_MATH_TEST(Atan, - std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3, - std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3) -// NEON has ULP 4 instead of 3 -DEFINE_MATH_TEST(Atanh, - std::atanh, CallAtanh, -kNearOneF(), 
+kNearOneF(), 4, - std::atanh, CallAtanh, -kNearOneD(), +kNearOneD(), 3) -DEFINE_MATH_TEST(Cos, - std::cos, CallCos, -39000.0f, +39000.0f, 3, - std::cos, CallCos, -39000.0, +39000.0, Cos64ULP()) DEFINE_MATH_TEST(Exp, std::exp, CallExp, -FLT_MAX, +104.0f, 1, std::exp, CallExp, -DBL_MAX, +104.0, 1) @@ -256,375 +177,8 @@ DEFINE_MATH_TEST(Log1p, DEFINE_MATH_TEST(Log2, std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2, std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2) -DEFINE_MATH_TEST(Sin, - std::sin, CallSin, -39000.0f, +39000.0f, 3, - std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3 -DEFINE_MATH_TEST(Sinh, - std::sinh, CallSinh, -80.0f, +80.0f, 4, - std::sinh, CallSinh, -709.0, +709.0, 4) -DEFINE_MATH_TEST(Tanh, - std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4, - std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4) -DEFINE_MATH_TEST(SinCosSin, - std::sin, SinCosSin, -39000.0f, +39000.0f, SinCosSin32ULP(), - std::sin, SinCosSin, -39000.0, +39000.0, 1) -DEFINE_MATH_TEST(SinCosCos, - std::cos, SinCosCos, -39000.0f, +39000.0f, SinCosCos32ULP(), - std::cos, SinCosCos, -39000.0, +39000.0, 1) // clang-format on -template <typename T, class D> -void Atan2TestCases(T /*unused*/, D d, size_t& padded, - AlignedFreeUniquePtr<T[]>& out_y, - AlignedFreeUniquePtr<T[]>& out_x, - AlignedFreeUniquePtr<T[]>& out_expected) { - struct YX { - T y; - T x; - T expected; - }; - const T pos = ConvertScalarTo<T>(1E5); - const T neg = ConvertScalarTo<T>(-1E7); - const T p0 = ConvertScalarTo<T>(0); - // -0 is not enough to get an actual negative zero. 
- const T n0 = ConvertScalarTo<T>(-0.0); - const T p1 = ConvertScalarTo<T>(1); - const T n1 = ConvertScalarTo<T>(-1); - const T p2 = ConvertScalarTo<T>(2); - const T n2 = ConvertScalarTo<T>(-2); - const T inf = GetLane(Inf(d)); - const T nan = GetLane(NaN(d)); - - const T pi = ConvertScalarTo<T>(3.141592653589793238); - const YX test_cases[] = { // 45 degree steps: - {p0, p1, p0}, // E - {n1, p1, -pi / 4}, // SE - {n1, p0, -pi / 2}, // S - {n1, n1, -3 * pi / 4}, // SW - {p0, n1, pi}, // W - {p1, n1, 3 * pi / 4}, // NW - {p1, p0, pi / 2}, // N - {p1, p1, pi / 4}, // NE - - // y = ±0, x < 0 or -0 - {p0, n1, pi}, - {n0, n2, -pi}, - // y = ±0, x > 0 or +0 - {p0, p2, p0}, - {n0, p2, n0}, - // y = ±∞, x finite - {inf, p2, pi / 2}, - {-inf, p2, -pi / 2}, - // y = ±∞, x = -∞ - {inf, -inf, 3 * pi / 4}, - {-inf, -inf, -3 * pi / 4}, - // y = ±∞, x = +∞ - {inf, inf, pi / 4}, - {-inf, inf, -pi / 4}, - // y < 0, x = ±0 - {n2, p0, -pi / 2}, - {n1, n0, -pi / 2}, - // y > 0, x = ±0 - {pos, p0, pi / 2}, - {p2, n0, pi / 2}, - // finite y > 0, x = -∞ - {pos, -inf, pi}, - // finite y < 0, x = -∞ - {neg, -inf, -pi}, - // finite y > 0, x = +∞ - {pos, inf, p0}, - // finite y < 0, x = +∞ - {neg, inf, n0}, - // y NaN xor x NaN - {nan, p0, nan}, - {pos, nan, nan}}; - const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); - const size_t N = Lanes(d); - padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors - out_y = AllocateAligned<T>(padded); - out_x = AllocateAligned<T>(padded); - out_expected = AllocateAligned<T>(padded); - HWY_ASSERT(out_y && out_x && out_expected); - size_t i = 0; - for (; i < kNumTestCases; ++i) { - out_y[i] = test_cases[i].y; - out_x[i] = test_cases[i].x; - out_expected[i] = test_cases[i].expected; - } - for (; i < padded; ++i) { - out_y[i] = p0; - out_x[i] = p0; - out_expected[i] = p0; - } -} - -struct TestAtan2 { - template <typename T, class D> - HWY_NOINLINE void operator()(T t, D d) { - const size_t N = Lanes(d); - - size_t padded; 
- AlignedFreeUniquePtr<T[]> in_y, in_x, expected; - Atan2TestCases(t, d, padded, in_y, in_x, expected); - - const Vec<D> tolerance = Set(d, ConvertScalarTo<T>(1E-5)); - - for (size_t i = 0; i < padded; ++i) { - const T actual = ConvertScalarTo<T>(atan2(in_y[i], in_x[i])); - // fprintf(stderr, "%zu: table %f atan2 %f\n", i, expected[i], actual); - HWY_ASSERT_EQ(expected[i], actual); - } - for (size_t i = 0; i < padded; i += N) { - const Vec<D> y = Load(d, &in_y[i]); - const Vec<D> x = Load(d, &in_x[i]); -#if HWY_ARCH_ARM_A64 - // TODO(b/287462770): inline to work around incorrect SVE codegen - const Vec<D> actual = Atan2(d, y, x); -#else - const Vec<D> actual = CallAtan2(d, y, x); -#endif - const Vec<D> vexpected = Load(d, &expected[i]); - - const Mask<D> exp_nan = IsNaN(vexpected); - const Mask<D> act_nan = IsNaN(actual); - HWY_ASSERT_MASK_EQ(d, exp_nan, act_nan); - - // If not NaN, then compare with tolerance - const Mask<D> ge = Ge(actual, Sub(vexpected, tolerance)); - const Mask<D> le = Le(actual, Add(vexpected, tolerance)); - const Mask<D> ok = Or(act_nan, And(le, ge)); - if (!AllTrue(d, ok)) { - const size_t mismatch = - static_cast<size_t>(FindKnownFirstTrue(d, Not(ok))); - fprintf(stderr, "Mismatch for i=%d expected %E actual %E\n", - static_cast<int>(i + mismatch), expected[i + mismatch], - ExtractLane(actual, mismatch)); - HWY_ASSERT(0); - } - } - } -}; - -HWY_NOINLINE void TestAllAtan2() { - if (HWY_MATH_TEST_EXCESS_PRECISION) return; - - ForFloat3264Types(ForPartialVectors<TestAtan2>()); -} - -template <typename T, class D> -void HypotTestCases(T /*unused*/, D d, size_t& padded, - AlignedFreeUniquePtr<T[]>& out_a, - AlignedFreeUniquePtr<T[]>& out_b, - AlignedFreeUniquePtr<T[]>& out_expected) { - using TU = MakeUnsigned<T>; - - struct AB { - T a; - T b; - }; - - constexpr int kNumOfMantBits = MantissaBits<T>(); - static_assert(kNumOfMantBits > 0, "kNumOfMantBits > 0 must be true"); - - // Ensures inputs are not constexpr. 
- const TU u1 = static_cast<TU>(hwy::Unpredictable1()); - const double k1 = static_cast<double>(u1); - - const T pos = ConvertScalarTo<T>(1E5 * k1); - const T neg = ConvertScalarTo<T>(-1E7 * k1); - const T p0 = ConvertScalarTo<T>(k1 - 1.0); - // -0 is not enough to get an actual negative zero. - const T n0 = ScalarCopySign<T>(p0, neg); - const T p1 = ConvertScalarTo<T>(k1); - const T n1 = ConvertScalarTo<T>(-k1); - const T p2 = ConvertScalarTo<T>(2 * k1); - const T n2 = ConvertScalarTo<T>(-2 * k1); - const T inf = BitCastScalar<T>(ExponentMask<T>() * u1); - const T neg_inf = ScalarCopySign(inf, n1); - const T nan = BitCastScalar<T>( - static_cast<TU>(ExponentMask<T>() | (u1 << (kNumOfMantBits - 1)))); - - const double max_as_f64 = ConvertScalarTo<double>(HighestValue<T>()) * k1; - const T max = ConvertScalarTo<T>(max_as_f64); - - const T huge = ConvertScalarTo<T>(max_as_f64 * 0.25); - const T neg_huge = ScalarCopySign(huge, n1); - - const T huge2 = ConvertScalarTo<T>(max_as_f64 * 0.039415044328304796); - - const T large = ConvertScalarTo<T>(3.512227595593985E18 * k1); - const T neg_large = ScalarCopySign(large, n1); - const T large2 = ConvertScalarTo<T>(2.1190576943127544E16 * k1); - - const T small = ConvertScalarTo<T>(1.067033284841808E-11 * k1); - const T neg_small = ScalarCopySign(small, n1); - const T small2 = ConvertScalarTo<T>(1.9401409532292856E-12 * k1); - - const T tiny = BitCastScalar<T>(static_cast<TU>(u1 << kNumOfMantBits)); - const T neg_tiny = ScalarCopySign(tiny, n1); - - const T tiny2 = - ConvertScalarTo<T>(78.68466968859765 * ConvertScalarTo<double>(tiny)); - - const AB test_cases[] = {{p0, p0}, {p0, n0}, - {n0, n0}, {p1, p1}, - {p1, n1}, {n1, n1}, - {p2, p2}, {p2, n2}, - {p2, pos}, {p2, neg}, - {n2, pos}, {n2, neg}, - {n2, n2}, {p0, tiny}, - {p0, neg_tiny}, {n0, tiny}, - {n0, neg_tiny}, {p1, tiny}, - {p1, neg_tiny}, {n1, tiny}, - {n1, neg_tiny}, {tiny, p0}, - {tiny2, p0}, {tiny, tiny2}, - {neg_tiny, tiny2}, {huge, huge2}, - {neg_huge, huge2}, 
{huge, p0}, - {huge, tiny}, {huge2, tiny2}, - {large, p0}, {large, large2}, - {neg_large, p0}, {neg_large, large2}, - {small, p0}, {small, small2}, - {neg_small, p0}, {neg_small, small2}, - {max, p0}, {max, huge}, - {max, max}, {p0, inf}, - {n0, inf}, {p1, inf}, - {n1, inf}, {p2, inf}, - {n2, inf}, {p0, neg_inf}, - {n0, neg_inf}, {p1, neg_inf}, - {n1, neg_inf}, {p2, neg_inf}, - {n2, neg_inf}, {p0, nan}, - {n0, nan}, {p1, nan}, - {n1, nan}, {p2, nan}, - {n2, nan}, {huge, inf}, - {inf, nan}, {neg_inf, nan}, - {nan, nan}}; - - const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); - const size_t N = Lanes(d); - padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors - out_a = AllocateAligned<T>(padded); - out_b = AllocateAligned<T>(padded); - out_expected = AllocateAligned<T>(padded); - HWY_ASSERT(out_a && out_b && out_expected); - - size_t i = 0; - for (; i < kNumTestCases; ++i) { - const T a = - test_cases[i].a * hwy::ConvertScalarTo<T>(hwy::Unpredictable1()); - const T b = test_cases[i].b; - -#if HWY_TARGET <= HWY_NEON_WITHOUT_AES && HWY_ARCH_ARM_V7 - // Ignore test cases that have infinite or NaN inputs on Armv7 NEON - if (!ScalarIsFinite(a) || !ScalarIsFinite(b)) { - out_a[i] = p0; - out_b[i] = p0; - out_expected[i] = p0; - continue; - } -#endif - - out_a[i] = a; - out_b[i] = b; - - if (ScalarIsInf(a) || ScalarIsInf(b)) { - out_expected[i] = inf; - } else if (ScalarIsNaN(a) || ScalarIsNaN(b)) { - out_expected[i] = nan; - } else { - out_expected[i] = std::hypot(a, b); - } - } - for (; i < padded; ++i) { - out_a[i] = p0; - out_b[i] = p0; - out_expected[i] = p0; - } -} - -struct TestHypot { - template <typename T, class D> - HWY_NOINLINE void operator()(T t, D d) { - if (HWY_MATH_TEST_EXCESS_PRECISION) { - return; - } - - const size_t N = Lanes(d); - - constexpr uint64_t kMaxErrorUlp = 4; - - size_t padded; - AlignedFreeUniquePtr<T[]> in_a, in_b, expected; - HypotTestCases(t, d, padded, in_a, in_b, expected); - - auto actual1_lanes = 
AllocateAligned<T>(N); - auto actual2_lanes = AllocateAligned<T>(N); - HWY_ASSERT(actual1_lanes && actual2_lanes); - - uint64_t max_ulp = 0; - for (size_t i = 0; i < padded; i += N) { - const auto a = Load(d, in_a.get() + i); - const auto b = Load(d, in_b.get() + i); - -#if HWY_ARCH_ARM_A64 - // TODO(b/287462770): inline to work around incorrect SVE codegen - const auto actual1 = Hypot(d, a, b); - const auto actual2 = Hypot(d, b, a); -#else - const auto actual1 = CallHypot(d, a, b); - const auto actual2 = CallHypot(d, b, a); -#endif - - Store(actual1, d, actual1_lanes.get()); - Store(actual2, d, actual2_lanes.get()); - - for (size_t j = 0; j < N; j++) { - const T val_a = in_a[i + j]; - const T val_b = in_b[i + j]; - const T expected_val = expected[i + j]; - const T actual1_val = actual1_lanes[j]; - const T actual2_val = actual2_lanes[j]; - - const auto ulp1 = - hwy::detail::ComputeUlpDelta(actual1_val, expected_val); - if (ulp1 > kMaxErrorUlp) { - fprintf(stderr, - "%s: Hypot(%e, %e) lane %d expected %E actual %E ulp %g max " - "ulp %u\n", - hwy::TypeName(T(), Lanes(d)).c_str(), val_a, val_b, - static_cast<int>(j), expected_val, actual1_val, - static_cast<double>(ulp1), - static_cast<uint32_t>(kMaxErrorUlp)); - } - - const auto ulp2 = - hwy::detail::ComputeUlpDelta(actual2_val, expected_val); - if (ulp2 > kMaxErrorUlp) { - fprintf(stderr, - "%s: Hypot(%e, %e) expected %E actual %E ulp %g max ulp %u\n", - hwy::TypeName(T(), Lanes(d)).c_str(), val_b, val_a, - expected_val, actual2_val, static_cast<double>(ulp2), - static_cast<uint32_t>(kMaxErrorUlp)); - } - - max_ulp = HWY_MAX(max_ulp, HWY_MAX(ulp1, ulp2)); - } - } - - if (max_ulp != 0) { - fprintf(stderr, "%s: Hypot max_ulp %g\n", - hwy::TypeName(T(), Lanes(d)).c_str(), - static_cast<double>(max_ulp)); - HWY_ASSERT(max_ulp <= kMaxErrorUlp); - } - } -}; - -HWY_NOINLINE void TestAllHypot() { - if (HWY_MATH_TEST_EXCESS_PRECISION) return; - - ForFloat3264Types(ForPartialVectors<TestHypot>()); -} - } // namespace // 
NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -635,13 +189,6 @@ HWY_AFTER_NAMESPACE(); namespace hwy { namespace { HWY_BEFORE_TEST(HwyMathTest); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp2); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1); @@ -649,13 +196,6 @@ HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan2); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinCosSin); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinCosCos); -HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllHypot); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/third_party/highway/hwy/contrib/matvec/matvec-inl.h b/third_party/highway/hwy/contrib/matvec/matvec-inl.h @@ -49,13 +49,13 @@ HWY_NOINLINE void MatVecAddImpl(const T* HWY_RESTRICT mat, // Process multiple rows at a time so that we write multiples of a cache line // to avoid false sharing (>= 64). 128 is better than 256. 512 has too little // parallelization potential. 
- constexpr size_t kChunkSize = 64 / sizeof(T); - const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize); + constexpr size_t kChunkSize2 = 64 / sizeof(T); + const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize2); const ScalableTag<T> d; const size_t N = Lanes(d); // Required for Stream loop, otherwise we might have partial vectors. - HWY_DASSERT(kChunkSize >= N); + HWY_DASSERT(kChunkSize2 >= N); pool.Run(0, num_chunks, [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR { // MSVC workaround: duplicate to ensure constexpr. @@ -126,7 +126,7 @@ HWY_NOINLINE void MatVecAddImpl(const T* HWY_RESTRICT mat, hwy::FlushStream(); // Handle remainder rows which are not a multiple of the chunk size. - for (size_t r = num_chunks * kChunkSize; r < kOuter; ++r) { + for (size_t r = num_chunks * kChunkSize2; r < kOuter; ++r) { auto sum0 = Zero(d); const T* HWY_RESTRICT row = &mat[r * kInner]; diff --git a/third_party/highway/hwy/contrib/matvec/matvec_test.cc b/third_party/highway/hwy/contrib/matvec/matvec_test.cc @@ -17,8 +17,7 @@ // Reduce targets to avoid timeout under emulation. 
#ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS \ - (HWY_SVE2_128 | HWY_SVE2 | HWY_SVE_256 | HWY_NEON_WITHOUT_AES) +#define HWY_DISABLED_TARGETS (HWY_SVE2_128 | HWY_SVE2 | HWY_SVE_256 | HWY_NEON) #endif #include <stddef.h> diff --git a/third_party/highway/hwy/contrib/random/random-inl.h b/third_party/highway/hwy/contrib/random/random-inl.h @@ -35,7 +35,6 @@ namespace hwy { namespace HWY_NAMESPACE { // required: unique per target namespace internal { -namespace { #if HWY_HAVE_FLOAT64 // C++ < 17 does not support hexfloat #if __cpp_hex_float > 201603L @@ -52,7 +51,6 @@ constexpr std::uint64_t kJump[] = {0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, constexpr std::uint64_t kLongJump[] = {0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635}; -} // namespace class SplitMix64 { public: @@ -177,6 +175,7 @@ class VectorXoshiro { #if HWY_HAVE_FLOAT64 using VF64 = Vec<ScalableTag<double>>; #endif + public: explicit VectorXoshiro(const std::uint64_t seed, const std::uint64_t threadNumber = 0) @@ -381,4 +380,4 @@ class CachedXoshiro { HWY_AFTER_NAMESPACE(); -#endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ -\ No newline at end of file +#endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ diff --git a/third_party/highway/hwy/contrib/sort/BUILD b/third_party/highway/hwy/contrib/sort/BUILD @@ -175,6 +175,7 @@ cc_library( ":vxsort", # required if HAVE_VXSORT "//:algo", "//:hwy", + "//:nanobenchmark", ], ) @@ -201,6 +202,7 @@ cc_library( deps = [ "//:algo", "//:hwy", + "//:nanobenchmark", ], ) diff --git a/third_party/highway/hwy/contrib/sort/algo-inl.h b/third_party/highway/hwy/contrib/sort/algo-inl.h @@ -24,6 +24,7 @@ #include <functional> // std::less, std::greater #include <vector> +#include "hwy/base.h" #include "hwy/contrib/sort/vqsort.h" #include "hwy/highway.h" #include "hwy/print.h" @@ -207,36 +208,18 @@ enum class Algo { }; static inline bool IsVQ(Algo algo) { - switch (algo) { - case Algo::kVQSort: - case Algo::kVQPartialSort: - case 
Algo::kVQSelect: - return true; - default: - return false; - } + return algo == Algo::kVQSort || algo == Algo::kVQPartialSort || + algo == Algo::kVQSelect; } static inline bool IsSelect(Algo algo) { - switch (algo) { - case Algo::kStdSelect: - case Algo::kVQSelect: - case Algo::kHeapSelect: - return true; - default: - return false; - } + return algo == Algo::kStdSelect || algo == Algo::kVQSelect || + algo == Algo::kHeapSelect; } static inline bool IsPartialSort(Algo algo) { - switch (algo) { - case Algo::kStdPartialSort: - case Algo::kVQPartialSort: - case Algo::kHeapPartialSort: - return true; - default: - return false; - } + return algo == Algo::kStdPartialSort || algo == Algo::kVQPartialSort || + algo == Algo::kHeapPartialSort; } static inline Algo ReferenceAlgoFor(Algo algo) { @@ -451,8 +434,8 @@ InputStats<T> GenerateInput(const Dist dist, T* v, size_t num_lanes) { } InputStats<T> input_stats; - for (size_t i = 0; i < num_lanes; ++i) { - input_stats.Notify(v[i]); + for (size_t j = 0; j < num_lanes; ++j) { + input_stats.Notify(v[j]); } return input_stats; } @@ -606,9 +589,6 @@ void Run(Algo algo, KeyType* inout, size_t num_keys, SharedState& shared, return CallHeapPartialSort(inout, num_keys, k_keys, Order()); case Algo::kHeapSelect: return CallHeapSelect(inout, num_keys, k_keys, Order()); - - default: - HWY_ABORT("Not implemented"); } } diff --git a/third_party/highway/hwy/contrib/sort/bench_sort.cc b/third_party/highway/hwy/contrib/sort/bench_sort.cc @@ -360,7 +360,6 @@ enum class BenchmarkModes { std::vector<size_t> SizesToBenchmark(BenchmarkModes mode) { std::vector<size_t> sizes; switch (mode) { - default: case BenchmarkModes::kDefault: #if HAVE_PARALLEL_IPS4O || SORT_100M sizes.push_back(100 * 1000 * size_t{1000}); @@ -410,7 +409,7 @@ std::vector<size_t> SizesToBenchmark(BenchmarkModes mode) { HWY_NOINLINE void BenchAllSort() { // Not interested in benchmark results for these targets. 
Note that SSE4 is // numerically less than SSE2, hence it is the lower bound. - if (HWY_SSE4 <= HWY_TARGET && HWY_TARGET <= HWY_SSE2) { + if (HWY_SSE4 <= HWY_TARGET && HWY_TARGET <= HWY_SSE2 && Unpredictable1()) { return; } #if HAVE_INTEL diff --git a/third_party/highway/hwy/contrib/sort/print_network.cc b/third_party/highway/hwy/contrib/sort/print_network.cc @@ -80,7 +80,7 @@ static void PrintMergeNetwork(int rows, int cols) { printf("\n"); } -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { PrintMergeNetwork(8, 2); PrintMergeNetwork(8, 4); PrintMergeNetwork(16, 4); diff --git a/third_party/highway/hwy/contrib/sort/result-inl.h b/third_party/highway/hwy/contrib/sort/result-inl.h @@ -29,7 +29,6 @@ #include "hwy/aligned_allocator.h" #include "hwy/base.h" -#include "hwy/contrib/sort/order.h" #include "hwy/per_target.h" // DispatchedTarget #include "hwy/targets.h" // TargetName @@ -51,16 +50,17 @@ static inline double SummarizeMeasurements(std::vector<double>& seconds) { struct SortResult { SortResult() {} - SortResult(const Algo algo, Dist dist, size_t num_keys, size_t num_threads, - double sec, size_t sizeof_key, const char* key_name) + SortResult(Algo algo_in, Dist dist_in, size_t num_keys_in, + size_t num_threads_in, double sec_in, size_t sizeof_key_in, + const char* key_name_in) : target(DispatchedTarget()), - algo(algo), - dist(dist), - num_keys(num_keys), - num_threads(num_threads), - sec(sec), - sizeof_key(sizeof_key), - key_name(key_name) {} + algo(algo_in), + dist(dist_in), + num_keys(num_keys_in), + num_threads(num_threads_in), + sec(sec_in), + sizeof_key(sizeof_key_in), + key_name(key_name_in) {} void Print() const { const double bytes = static_cast<double>(num_keys) * diff --git a/third_party/highway/hwy/contrib/sort/shared-inl.h b/third_party/highway/hwy/contrib/sort/shared-inl.h @@ -141,21 +141,21 @@ static_assert(SortConstants::MaxBufBytes<2>(64) <= 1664, "Unexpectedly high"); // vqsort isn't available on HWY_SCALAR, and 
builds time out on MSVC opt and // Armv7 debug, and Armv8 GCC 11 asan hits an internal compiler error likely // due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97696. Armv8 Clang -// hwasan/msan/tsan/asan also fail to build SVE (b/335157772). RVV currently -// has a compiler issue. +// hwasan/msan/tsan/asan also fail to build SVE (b/335157772). #undef VQSORT_ENABLED #undef VQSORT_COMPILER_COMPATIBLE -#if (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ - (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) || \ - (HWY_ARCH_ARM_A64 && HWY_COMPILER_GCC_ACTUAL && HWY_IS_ASAN) || \ - (HWY_ARCH_RISCV) +#if (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ + (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) || \ + (HWY_ARCH_ARM_A64 && HWY_IS_ASAN) || \ + (HWY_ARCH_RISCV && HWY_COMPILER_GCC_ACTUAL < 1400) #define VQSORT_COMPILER_COMPATIBLE 0 #else #define VQSORT_COMPILER_COMPATIBLE 1 #endif -#if (HWY_TARGET == HWY_SCALAR) || !VQSORT_COMPILER_COMPATIBLE +#if (HWY_TARGET == HWY_SCALAR) || !VQSORT_COMPILER_COMPATIBLE || \ + defined(HWY_DISABLE_VQSORT) #define VQSORT_ENABLED 0 #else #define VQSORT_ENABLED 1 diff --git a/third_party/highway/hwy/contrib/sort/sort_test.cc b/third_party/highway/hwy/contrib/sort/sort_test.cc @@ -20,6 +20,10 @@ #include <random> #include <vector> +#if !defined(HWY_DISABLED_TARGETS) && HWY_IS_DEBUG_BUILD +#define HWY_DISABLED_TARGETS (HWY_SSE2 | HWY_SSSE3) +#endif + #include "hwy/aligned_allocator.h" // IsAligned #include "hwy/base.h" #include "hwy/contrib/sort/vqsort.h" @@ -228,7 +232,7 @@ void TestAllSort() { Algo::kVQSort, Algo::kHeapSort, }; - for (int num : {129, 504, 3 * 1000, 34567}) { + for (int num : {129, 504, 3 * 1000, 14567}) { const size_t num_lanes = AdjustedReps(static_cast<size_t>(num)); CallAllSortTraits(algos, num_lanes); } @@ -237,7 +241,7 @@ void TestAllSort() { void TestAllPartialSort() { const std::vector<Algo> algos{Algo::kVQPartialSort, Algo::kHeapPartialSort}; - for (int num : {129, 504, 3 * 1000, 34567}) { + for (int num : {129, 504, 3 * 1000, 
14567}) { const size_t num_lanes = AdjustedReps(static_cast<size_t>(num)); CallAllSortTraits(algos, num_lanes); } @@ -246,7 +250,7 @@ void TestAllPartialSort() { void TestAllSelect() { const std::vector<Algo> algos{Algo::kVQSelect, Algo::kHeapSelect}; - for (int num : {129, 504, 3 * 1000, 34567}) { + for (int num : {129, 504, 3 * 1000, 14567}) { const size_t num_lanes = AdjustedReps(static_cast<size_t>(num)); CallAllSortTraits(algos, num_lanes); } diff --git a/third_party/highway/hwy/contrib/sort/sort_unit_test.cc b/third_party/highway/hwy/contrib/sort/sort_unit_test.cc @@ -499,11 +499,13 @@ static HWY_NOINLINE void TestRandomGenerator() { sum_lo += bits & 0xFFFFFFFF; sum_hi += bits >> 32; } - const double expected = 1000 * (1ULL << 31); - HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_lo) && - static_cast<double>(sum_lo) <= 1.1 * expected); - HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_hi) && - static_cast<double>(sum_hi) <= 1.1 * expected); + { + const double expected = 1000 * (1ULL << 31); + HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_lo) && + static_cast<double>(sum_lo) <= 1.1 * expected); + HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_hi) && + static_cast<double>(sum_hi) <= 1.1 * expected); + } const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two diff --git a/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h @@ -891,6 +891,16 @@ HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) { #else template <class Base> struct SharedTraits : public Base {}; + +namespace detail { + +// Empty function to avoid a possible -Wpragma-clang-attribute warning if +// compiling with Clang +static HWY_INLINE HWY_MAYBE_UNUSED void HWY_CONCAT(UnusedSortingNetworksFunc, + __LINE__)() {} + +} // namespace detail + #endif // VQSORT_ENABLED } // namespace detail diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f16a.cc 
b/third_party/highway/hwy/contrib/sort/vqsort_f16a.cc @@ -14,6 +14,7 @@ // limitations under the License. #include "hwy/contrib/sort/vqsort.h" // VQSort +#include "hwy/nanobenchmark.h" // Unpredictable1 #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f16a.cc" @@ -33,7 +34,7 @@ void SortF16Asc(float16_t* HWY_RESTRICT keys, const size_t num) { #else (void)keys; (void)num; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } @@ -45,7 +46,7 @@ void PartialSortF16Asc(float16_t* HWY_RESTRICT keys, const size_t num, (void)keys; (void)num; (void)k; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } @@ -57,7 +58,7 @@ void SelectF16Asc(float16_t* HWY_RESTRICT keys, const size_t num, (void)keys; (void)num; (void)k; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f16d.cc @@ -14,6 +14,7 @@ // limitations under the License. #include "hwy/contrib/sort/vqsort.h" // VQSort +#include "hwy/nanobenchmark.h" // #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f16d.cc" @@ -33,7 +34,7 @@ void SortF16Desc(float16_t* HWY_RESTRICT keys, const size_t num) { #else (void)keys; (void)num; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } @@ -45,7 +46,7 @@ void PartialSortF16Desc(float16_t* HWY_RESTRICT keys, const size_t num, (void)keys; (void)num; (void)k; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } @@ -57,7 +58,7 @@ void SelectF16Desc(float16_t* HWY_RESTRICT keys, const size_t num, (void)keys; (void)num; (void)k; - HWY_ASSERT(0); + if (Unpredictable1()) HWY_ASSERT(0); #endif } diff --git a/third_party/highway/hwy/contrib/thread_pool/futex.h b/third_party/highway/hwy/contrib/thread_pool/futex.h @@ -31,6 +31,14 @@ #include "hwy/base.h" +#if HWY_OS_APPLE +#include <AvailabilityMacros.h> +// __ulock* were added in OS X 10.12 (Sierra, 2016). 
+#if MAC_OS_X_VERSION_MAX_ALLOWED < 101200 && !defined(HWY_DISABLE_FUTEX) +#define HWY_DISABLE_FUTEX +#endif +#endif // HWY_OS_APPLE + #if HWY_OS_WIN // Need to include <windows.h> on Windows, even if HWY_DISABLE_FUTEX is defined, // since hwy::NanoSleep uses Windows API's that are defined in windows.h. diff --git a/third_party/highway/hwy/contrib/thread_pool/spin.h b/third_party/highway/hwy/contrib/thread_pool/spin.h @@ -82,25 +82,32 @@ struct SpinResult { // `HWY_TARGET` and its runtime dispatch mechanism. Returned by `Type()`, also // used by callers to set the `disabled` argument for `DetectSpin`. enum class SpinType : uint8_t { +#if HWY_ENABLE_MONITORX kMonitorX = 1, // AMD - kUMonitor, // Intel - kPause, +#endif +#if HWY_ENABLE_UMONITOR + kUMonitor = 2, // Intel +#endif + kPause = 3, kSentinel // for iterating over all enumerators. Must be last. }; // For printing which is in use. static inline const char* ToString(SpinType type) { switch (type) { +#if HWY_ENABLE_MONITORX case SpinType::kMonitorX: return "MonitorX_C1"; +#endif +#if HWY_ENABLE_UMONITOR case SpinType::kUMonitor: return "UMonitor_C0.2"; +#endif case SpinType::kPause: return "Pause"; case SpinType::kSentinel: - return nullptr; default: - HWY_UNREACHABLE; + return nullptr; } } @@ -302,23 +309,23 @@ static inline SpinType DetectSpin(int disabled = 0) { return SpinType::kPause; } -// Calls `func(spin)` for the given `spin_type`. -template <class Func> -HWY_INLINE void CallWithSpin(SpinType spin_type, Func&& func) { +// Calls `func(spin, args)` for the given `spin_type`. +template <class Func, typename... Args> +HWY_INLINE void CallWithSpin(SpinType spin_type, Func&& func, Args&&... 
args) { switch (spin_type) { #if HWY_ENABLE_MONITORX case SpinType::kMonitorX: - func(SpinMonitorX()); + func(SpinMonitorX(), std::forward<Args>(args)...); break; #endif #if HWY_ENABLE_UMONITOR case SpinType::kUMonitor: - func(SpinUMonitor()); + func(SpinUMonitor(), std::forward<Args>(args)...); break; #endif case SpinType::kPause: default: - func(SpinPause()); + func(SpinPause(), std::forward<Args>(args)...); break; } } diff --git a/third_party/highway/hwy/contrib/thread_pool/thread_pool.h b/third_party/highway/hwy/contrib/thread_pool/thread_pool.h @@ -23,6 +23,7 @@ #include <stddef.h> #include <stdint.h> #include <stdio.h> // snprintf +#include <string.h> #include <array> #include <atomic> @@ -30,11 +31,6 @@ #include <thread> // NOLINT #include <vector> -#include "hwy/detect_compiler_arch.h" -#if HWY_OS_FREEBSD -#include <pthread_np.h> -#endif - #include "hwy/aligned_allocator.h" // HWY_ALIGNMENT #include "hwy/auto_tune.h" #include "hwy/base.h" @@ -46,8 +42,15 @@ #include "hwy/stats.h" #include "hwy/timer.h" -// Define to HWY_NOINLINE to see profiles of `WorkerRun*` and waits. -#define HWY_POOL_PROFILE +#if HWY_OS_APPLE +#include <AvailabilityMacros.h> +#endif + +#if PROFILER_ENABLED +#include <algorithm> // std::sort + +#include "hwy/bit_set.h" +#endif namespace hwy { @@ -60,19 +63,67 @@ static inline void SetThreadName(const char* format, int thread) { HWY_ASSERT(0 < chars_written && chars_written <= static_cast<int>(sizeof(buf) - 1)); -#if HWY_OS_LINUX && (!defined(__ANDROID__) || __ANDROID_API__ >= 19) +#if (HWY_OS_LINUX && (!defined(__ANDROID__) || __ANDROID_API__ >= 19)) || \ + HWY_OS_FREEBSD + // Note that FreeBSD pthread_set_name_np does not return a value (#2669). HWY_ASSERT(0 == pthread_setname_np(pthread_self(), buf)); -#elif HWY_OS_FREEBSD - HWY_ASSERT(0 == pthread_set_name_np(pthread_self(), buf)); -#elif HWY_OS_APPLE +#elif HWY_OS_APPLE && (MAC_OS_X_VERSION_MIN_REQUIRED >= 1060) // Different interface: single argument, current thread only. 
HWY_ASSERT(0 == pthread_setname_np(buf)); +#elif defined(__EMSCRIPTEN__) + emscripten_set_thread_name(pthread_self(), buf); +#else + (void)format; + (void)thread; #endif } // Whether workers should block or spin. enum class PoolWaitMode : uint8_t { kBlock = 1, kSpin }; +enum class Exit : uint32_t { kNone, kLoop, kThread }; + +// Upper bound on non-empty `ThreadPool` (single-worker pools do not count). +// Turin has 16 clusters. Add one for the across-cluster pool. +HWY_INLINE_VAR constexpr size_t kMaxClusters = 32 + 1; + +// Use the last slot so that `PoolWorkerMapping` does not have to know the +// total number of clusters. +HWY_INLINE_VAR constexpr size_t kAllClusters = kMaxClusters - 1; + +// Argument to `ThreadPool`: how to map local worker_idx to global. +class PoolWorkerMapping { + public: + // Backward-compatible mode: returns local worker index. + PoolWorkerMapping() : cluster_idx_(0), max_cluster_workers_(0) {} + PoolWorkerMapping(size_t cluster_idx, size_t max_cluster_workers) + : cluster_idx_(cluster_idx), max_cluster_workers_(max_cluster_workers) { + HWY_DASSERT(cluster_idx <= kAllClusters); + // Only use this ctor for the new global worker index mode. If this were + // zero, we would still return local indices. + HWY_DASSERT(max_cluster_workers != 0); + } + + size_t ClusterIdx() const { return cluster_idx_; } + size_t MaxClusterWorkers() const { return max_cluster_workers_; } + + // Returns global_idx, or unchanged local worker_idx if default-constructed. + size_t operator()(size_t worker_idx) const { + if (cluster_idx_ == kAllClusters) { + const size_t cluster_idx = worker_idx; + HWY_DASSERT(cluster_idx < kAllClusters); + // First index within the N-th cluster. The main thread is the first. 
+ return cluster_idx * max_cluster_workers_; + } + HWY_DASSERT(max_cluster_workers_ == 0 || worker_idx < max_cluster_workers_); + return cluster_idx_ * max_cluster_workers_ + worker_idx; + } + + private: + size_t cluster_idx_; + size_t max_cluster_workers_; +}; + namespace pool { #ifndef HWY_POOL_VERBOSITY @@ -85,7 +136,7 @@ static constexpr int kVerbosity = HWY_POOL_VERBOSITY; // large pool, we assume applications create multiple pools, ideally per // cluster (cores sharing a cache), because this improves locality and barrier // latency. In that case, this is a generous upper bound. -static constexpr size_t kMaxThreads = 63; +static constexpr size_t kMaxThreads = 127; // Generates a random permutation of [0, size). O(1) storage. class ShuffledIota { @@ -151,7 +202,7 @@ class ShuffledIota { }; // 'Policies' suitable for various worker counts and locality. To define a -// new class, add an enum and update `ToString` plus `FunctorAddWait`. The +// new class, add an enum and update `ToString` plus `CallWithConfig`. The // enumerators must be contiguous so we can iterate over them. enum class WaitType : uint8_t { kBlock, @@ -159,15 +210,6 @@ enum class WaitType : uint8_t { kSpinSeparate, kSentinel // Must be last. }; -enum class BarrierType : uint8_t { - kOrdered, - kCounter1, - kCounter2, - kCounter4, - kGroup2, - kGroup4, - kSentinel // Must be last. -}; // For printing which is in use. 
static inline const char* ToString(WaitType type) { @@ -180,107 +222,558 @@ static inline const char* ToString(WaitType type) { return "Separate"; case WaitType::kSentinel: return nullptr; - default: - HWY_UNREACHABLE; - } -} - -static inline const char* ToString(BarrierType type) { - switch (type) { - case BarrierType::kOrdered: - return "Ordered"; - case BarrierType::kCounter1: - return "Counter1"; - case BarrierType::kCounter2: - return "Counter2"; - case BarrierType::kCounter4: - return "Counter4"; - case BarrierType::kGroup2: - return "Group2"; - case BarrierType::kGroup4: - return "Group4"; - case BarrierType::kSentinel: - return nullptr; - default: - HWY_UNREACHABLE; } } -// We want predictable struct/class sizes so we can reason about cache lines. -#pragma pack(push, 1) - // Parameters governing the main and worker thread behavior. Can be updated at -// runtime via `SetWaitMode`. Both have copies which are carefully synchronized -// (two-phase barrier). 64-bit allows adding fields (e.g. for load-balancing) -// without having to bit-pack members, and is fine because this is only moved -// with relaxed stores, hence we do not have to fit it in the 32 futex bits. -class Config { // 8 bytes - public: - static std::vector<Config> AllCandidates(PoolWaitMode wait_mode, - size_t num_threads) { - std::vector<SpinType> spin_types(size_t{1}, DetectSpin()); - // Monitor-based spin may be slower, so also try Pause. - if (spin_types[0] != SpinType::kPause) { - spin_types.push_back(SpinType::kPause); - } +// runtime via `SetWaitMode`, which calls `SendConfig`. Both have copies which +// are carefully synchronized. 32 bits leave room for two future fields. +// 64 bits would also be fine because this does not go through futex. 
+struct Config { // 4 bytes + static std::vector<Config> AllCandidates(PoolWaitMode wait_mode) { + std::vector<Config> candidates; - std::vector<WaitType> wait_types; if (wait_mode == PoolWaitMode::kSpin) { + std::vector<SpinType> spin_types; + spin_types.reserve(2); + spin_types.push_back(DetectSpin()); + // Monitor-based spin may be slower, so also try Pause. + if (spin_types[0] != SpinType::kPause) { + spin_types.push_back(SpinType::kPause); + } + // All except `kBlock`. + std::vector<WaitType> wait_types; for (size_t wait = 0;; ++wait) { const WaitType wait_type = static_cast<WaitType>(wait); if (wait_type == WaitType::kSentinel) break; if (wait_type != WaitType::kBlock) wait_types.push_back(wait_type); } - } else { - wait_types.push_back(WaitType::kBlock); - } - std::vector<BarrierType> barrier_types; - // Note that casting an integer is UB if there is no matching enumerator, - // but we define a sentinel to prevent this. - for (size_t barrier = 0;; ++barrier) { - const BarrierType barrier_type = static_cast<BarrierType>(barrier); - if (barrier_type == BarrierType::kSentinel) break; - // If <= 2 workers, group size of 4 is the same as 2. - if (num_threads <= 1 && barrier_type == BarrierType::kCounter4) continue; - if (num_threads <= 1 && barrier_type == BarrierType::kGroup4) continue; - barrier_types.push_back(barrier_type); - } - - std::vector<Config> candidates; - candidates.reserve(50); - for (const SpinType spin_type : spin_types) { - for (const WaitType wait_type : wait_types) { - for (const BarrierType barrier_type : barrier_types) { - candidates.emplace_back(spin_type, wait_type, barrier_type); + candidates.reserve(spin_types.size() * wait_types.size()); + for (const SpinType spin_type : spin_types) { + for (const WaitType wait_type : wait_types) { + candidates.emplace_back(spin_type, wait_type); } } + } else { + // kBlock does not use spin, so there is only one candidate. 
+ candidates.emplace_back(SpinType::kPause, WaitType::kBlock); } + return candidates; } std::string ToString() const { char buf[128]; - snprintf(buf, sizeof(buf), "%14s %9s %9s", hwy::ToString(spin_type), - pool::ToString(wait_type), pool::ToString(barrier_type)); + snprintf(buf, sizeof(buf), "%-14s %-9s", hwy::ToString(spin_type), + pool::ToString(wait_type)); return buf; } - Config() {} - Config(SpinType spin_type, WaitType wait_type, BarrierType barrier_type) - : spin_type(spin_type), - wait_type(wait_type), - barrier_type(barrier_type), - exit(false) {} + Config(SpinType spin_type_in, WaitType wait_type_in) + : spin_type(spin_type_in), wait_type(wait_type_in) {} + // Workers initially spin until ThreadPool sends them their actual config. + Config() : Config(SpinType::kPause, WaitType::kSpinSeparate) {} SpinType spin_type; WaitType wait_type; - BarrierType barrier_type; - bool exit; - uint32_t reserved = 0; + HWY_MEMBER_VAR_MAYBE_UNUSED uint8_t reserved[2]; +}; +static_assert(sizeof(Config) == 4, ""); + +#if PROFILER_ENABLED + +// Accumulates timings and stats from main thread and workers. +class Stats { + // Up to `HWY_ALIGNMENT / 8` slots/offsets, passed to `PerThread`. + static constexpr size_t kDWait = 0; + static constexpr size_t kWaitReps = 1; + static constexpr size_t kTBeforeRun = 2; + static constexpr size_t kDRun = 3; + static constexpr size_t kTasksStatic = 4; + static constexpr size_t kTasksDynamic = 5; + static constexpr size_t kTasksStolen = 6; + static constexpr size_t kDFuncStatic = 7; + static constexpr size_t kDFuncDynamic = 8; + static constexpr size_t kSentinel = 9; + + public: + Stats() { + for (size_t thread_idx = 0; thread_idx < kMaxThreads; ++thread_idx) { + for (size_t offset = 0; offset < kSentinel; ++offset) { + PerThread(thread_idx, offset) = 0; + } + } + Reset(); + } + + // Called by the N lowest-indexed workers that got one of the N tasks, which + // includes the main thread because its index is 0. 
+ // `d_*` denotes "difference" (of timestamps) and thus also duration. + void NotifyRunStatic(size_t worker_idx, timer::Ticks d_func) { + if (worker_idx == 0) { // main thread + num_run_static_++; + sum_tasks_static_++; + sum_d_func_static_ += d_func; + } else { + const size_t thread_idx = worker_idx - 1; + // Defer the sums until `NotifyMainRun` to avoid atomic RMW. + PerThread(thread_idx, kTasksStatic)++; + PerThread(thread_idx, kDFuncStatic) += d_func; + } + } + + // Called by all workers, including the main thread, regardless of whether + // they actually stole or even ran a task. + void NotifyRunDynamic(size_t worker_idx, size_t tasks, size_t stolen, + timer::Ticks d_func) { + if (worker_idx == 0) { // main thread + num_run_dynamic_++; + sum_tasks_dynamic_ += tasks; + sum_tasks_stolen_ += stolen; + sum_d_func_dynamic_ += d_func; + } else { + const size_t thread_idx = worker_idx - 1; + // Defer the sums until `NotifyMainRun` to avoid atomic RMW. + PerThread(thread_idx, kTasksDynamic) += tasks; + PerThread(thread_idx, kTasksStolen) += stolen; + PerThread(thread_idx, kDFuncDynamic) += d_func; + } + } + + // Called concurrently by non-main worker threads after their `WorkerRun` and + // before the barrier. + void NotifyThreadRun(size_t worker_idx, timer::Ticks d_wait, size_t wait_reps, + timer::Ticks t_before_run, timer::Ticks d_run) { + HWY_DASSERT(worker_idx != 0); // Not called by main thread. + const size_t thread_idx = worker_idx - 1; + HWY_DASSERT(PerThread(thread_idx, kDWait) == 0); + HWY_DASSERT(PerThread(thread_idx, kWaitReps) == 0); + HWY_DASSERT(PerThread(thread_idx, kTBeforeRun) == 0); + HWY_DASSERT(PerThread(thread_idx, kDRun) == 0); + PerThread(thread_idx, kDWait) = d_wait; + PerThread(thread_idx, kWaitReps) = wait_reps; + PerThread(thread_idx, kTBeforeRun) = t_before_run; // For wake latency. 
+ PerThread(thread_idx, kDRun) = d_run; + } + + // Called by the main thread after the barrier, whose store-release and + // load-acquire publishes all prior writes. Note: only the main thread can + // store `after_barrier`. If workers did, which by definition happens after + // the barrier, then they would race with this function's reads. + void NotifyMainRun(size_t num_threads, timer::Ticks t_before_wake, + timer::Ticks d_wake, timer::Ticks d_main_run, + timer::Ticks d_barrier) { + HWY_DASSERT(num_threads <= kMaxThreads); + + timer::Ticks min_d_run = ~timer::Ticks{0}; + timer::Ticks max_d_run = 0; + timer::Ticks sum_d_run = 0; + for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + sum_tasks_static_ += PerThread(thread_idx, kTasksStatic); + sum_tasks_dynamic_ += PerThread(thread_idx, kTasksDynamic); + sum_tasks_stolen_ += PerThread(thread_idx, kTasksStolen); + sum_d_func_static_ += PerThread(thread_idx, kDFuncStatic); + sum_d_func_dynamic_ += PerThread(thread_idx, kDFuncDynamic); + sum_d_wait_ += PerThread(thread_idx, kDWait); + sum_wait_reps_ += PerThread(thread_idx, kWaitReps); + const timer::Ticks d_thread_run = PerThread(thread_idx, kDRun); + min_d_run = HWY_MIN(min_d_run, d_thread_run); + max_d_run = HWY_MAX(max_d_run, d_thread_run); + sum_d_run += d_thread_run; + const timer::Ticks t_before_run = PerThread(thread_idx, kTBeforeRun); + + for (size_t offset = 0; offset < kSentinel; ++offset) { + PerThread(thread_idx, offset) = 0; + } + + HWY_DASSERT(t_before_run != 0); + const timer::Ticks d_latency = t_before_run - t_before_wake; + sum_wake_latency_ += d_latency; + max_wake_latency_ = HWY_MAX(max_wake_latency_, d_latency); + } + const double inv_avg_d_run = + static_cast<double>(num_threads) / static_cast<double>(sum_d_run); + // Ratios of min and max run times to the average, for this pool.Run. 
+ const double r_min = static_cast<double>(min_d_run) * inv_avg_d_run; + const double r_max = static_cast<double>(max_d_run) * inv_avg_d_run; + + num_run_++; // `num_run_*` are incremented by `NotifyRun*`. + sum_d_run_ += sum_d_run; + sum_r_min_ += r_min; // For average across all pool.Run. + sum_r_max_ += r_max; + + sum_d_wake_ += d_wake; // `*wake_latency_` are updated above. + sum_d_barrier_ += d_barrier; + + sum_d_run_ += d_main_run; + sum_d_run_main_ += d_main_run; + } + + void PrintAndReset(size_t num_threads, timer::Ticks d_thread_lifetime_ticks) { + // This is unconditionally called via `ProfilerFunc`. If the pool was unused + // in this invocation, skip it. + if (num_run_ == 0) return; + HWY_ASSERT(num_run_ == num_run_static_ + num_run_dynamic_); + + const double d_func_static = Seconds(sum_d_func_static_); + const double d_func_dynamic = Seconds(sum_d_func_dynamic_); + const double sum_d_run = Seconds(sum_d_run_); + const double func_div_run = (d_func_static + d_func_dynamic) / sum_d_run; + if (!(0.95 <= func_div_run && func_div_run <= 1.0)) { + HWY_WARN("Func time %f should be similar to total run %f.", + d_func_static + d_func_dynamic, sum_d_run); + } + const double sum_d_run_main = Seconds(sum_d_run_main_); + const double max_wake_latency = Seconds(max_wake_latency_); + const double sum_d_wait = Seconds(sum_d_wait_); + const double d_thread_lifetime = Seconds(d_thread_lifetime_ticks); + + const double inv_run = 1.0 / static_cast<double>(num_run_); + const auto per_run = [inv_run](double sum) { return sum * inv_run; }; + const double avg_d_wake = per_run(Seconds(sum_d_wake_)); + const double avg_wake_latency = per_run(Seconds(sum_wake_latency_)); + const double avg_d_wait = per_run(sum_d_wait); + const double avg_wait_reps = per_run(static_cast<double>(sum_wait_reps_)); + const double avg_d_barrier = per_run(Seconds(sum_d_barrier_)); + const double avg_r_min = per_run(sum_r_min_); + const double avg_r_max = per_run(sum_r_max_); + + const size_t 
num_workers = 1 + num_threads; + const double avg_tasks_static = + Avg(sum_tasks_static_, num_run_static_ * num_workers); + const double avg_tasks_dynamic = + Avg(sum_tasks_dynamic_, num_run_dynamic_ * num_workers); + const double avg_steals = + Avg(sum_tasks_stolen_, num_run_dynamic_ * num_workers); + const double avg_d_run = sum_d_run / num_workers; + + const double pc_wait = sum_d_wait / d_thread_lifetime * 100.0; + const double pc_run = sum_d_run / d_thread_lifetime * 100.0; + const double pc_main = sum_d_run_main / avg_d_run * 100.0; + + const auto us = [](double sec) { return sec * 1E6; }; + const auto ns = [](double sec) { return sec * 1E9; }; + printf( + "%3zu: %5d x %.2f/%5d x %4.1f tasks, %.2f steals; " + "wake %7.3f ns, latency %6.3f < %7.3f us, barrier %7.3f us; " + "wait %.1f us (%6.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, " + "func: %6.3f + %7.3f, " + "%.1f%% of thread time %7.3f s; main:worker %5.1f%%\n", + num_threads, num_run_static_, avg_tasks_static, num_run_dynamic_, + avg_tasks_dynamic, avg_steals, ns(avg_d_wake), us(avg_wake_latency), + us(max_wake_latency), us(avg_d_barrier), us(avg_d_wait), avg_wait_reps, + pc_wait, avg_r_min * 100.0, avg_r_max * 100.0, d_func_static, + d_func_dynamic, pc_run, d_thread_lifetime, pc_main); + + Reset(num_threads); + } + + void Reset(size_t num_threads = kMaxThreads) { + num_run_ = 0; + num_run_static_ = 0; + num_run_dynamic_ = 0; + + sum_tasks_stolen_ = 0; + sum_tasks_static_ = 0; + sum_tasks_dynamic_ = 0; + + sum_d_wake_ = 0; + sum_wake_latency_ = 0; + max_wake_latency_ = 0; + sum_d_wait_ = 0; + sum_wait_reps_ = 0; + sum_d_barrier_ = 0; + + sum_d_func_static_ = 0; + sum_d_func_dynamic_ = 0; + sum_r_min_ = 0.0; + sum_r_max_ = 0.0; + sum_d_run_ = 0; + sum_d_run_main_ = 0; + // ctor and `NotifyMainRun` already reset `PerThread`. + } + + private: + template <typename T> + static double Avg(T sum, size_t div) { + return div == 0 ? 
0.0 : static_cast<double>(sum) / static_cast<double>(div); + } + + static constexpr size_t kU64PerLine = HWY_ALIGNMENT / sizeof(uint64_t); + + uint64_t& PerThread(size_t thread_idx, size_t offset) { + HWY_DASSERT(thread_idx < kMaxThreads); + HWY_DASSERT(offset < kSentinel); + return per_thread_[thread_idx * kU64PerLine + offset]; + } + + int32_t num_run_; + int32_t num_run_static_; + int32_t num_run_dynamic_; + + int32_t sum_tasks_stolen_; + int64_t sum_tasks_static_; + int64_t sum_tasks_dynamic_; + + timer::Ticks sum_d_wake_; + timer::Ticks sum_wake_latency_; + timer::Ticks max_wake_latency_; + timer::Ticks sum_d_wait_; + uint64_t sum_wait_reps_; + timer::Ticks sum_d_barrier_; + + timer::Ticks sum_d_func_static_; + timer::Ticks sum_d_func_dynamic_; + double sum_r_min_; + double sum_r_max_; + timer::Ticks sum_d_run_; + timer::Ticks sum_d_run_main_; + + // One cache line per pool thread to avoid false sharing. + uint64_t per_thread_[kMaxThreads * kU64PerLine]; +}; +// Enables shift rather than multiplication. +static_assert(sizeof(Stats) == (kMaxThreads + 1) * HWY_ALIGNMENT, "Wrong size"); + +// Non-power of two to avoid 2K aliasing. +HWY_INLINE_VAR constexpr size_t kMaxCallers = 60; + +// Per-caller stats, stored in `PerCluster`. 
+class CallerAccumulator { + public: + bool Any() const { return calls_ != 0; } + + void Add(size_t tasks, size_t workers, bool is_root, timer::Ticks wait_before, + timer::Ticks elapsed) { + calls_++; + root_ += is_root; + workers_ += workers; + min_tasks_ = HWY_MIN(min_tasks_, tasks); + max_tasks_ = HWY_MAX(max_tasks_, tasks); + tasks_ += tasks; + wait_before_ += wait_before; + elapsed_ += elapsed; + } + + void AddFrom(const CallerAccumulator& other) { + calls_ += other.calls_; + root_ += other.root_; + workers_ += other.workers_; + min_tasks_ = HWY_MIN(min_tasks_, other.min_tasks_); + max_tasks_ = HWY_MAX(max_tasks_, other.max_tasks_); + tasks_ += other.tasks_; + wait_before_ += other.wait_before_; + elapsed_ += other.elapsed_; + } + + bool operator>(const CallerAccumulator& other) const { + return elapsed_ > other.elapsed_; + } + + void PrintAndReset(const char* caller, size_t active_clusters) { + if (!Any()) return; + HWY_ASSERT(root_ <= calls_); + const double inv_calls = 1.0 / static_cast<double>(calls_); + const double pc_root = static_cast<double>(root_) * inv_calls * 100.0; + const double avg_workers = static_cast<double>(workers_) * inv_calls; + const double avg_tasks = static_cast<double>(tasks_) * inv_calls; + const double avg_tasks_per_worker = avg_tasks / avg_workers; + const double inv_freq = 1.0 / platform::InvariantTicksPerSecond(); + const double sum_wait_before = static_cast<double>(wait_before_) * inv_freq; + const double avg_wait_before = + root_ ? 
sum_wait_before / static_cast<double>(root_) : 0.0; + const double elapsed = static_cast<double>(elapsed_) * inv_freq; + const double avg_elapsed = elapsed * inv_calls; + const double task_len = avg_elapsed / avg_tasks_per_worker; + printf( + "%40s: %7.0f x (%3.0f%%) %2zu clusters, %4.1f workers @ " + "%5.1f tasks (%5u-%5u), " + "%5.0f us wait, %6.1E us run (task len %6.1E us), total %6.2f s\n", + caller, static_cast<double>(calls_), pc_root, active_clusters, + avg_workers, avg_tasks_per_worker, static_cast<uint32_t>(min_tasks_), + static_cast<uint32_t>(max_tasks_), avg_wait_before * 1E6, + avg_elapsed * 1E6, task_len * 1E6, elapsed); + *this = CallerAccumulator(); + } + + // For the grand total, only print calls and elapsed because averaging the + // the other stats is not very useful. No need to reset because this is called + // on a temporary. + void PrintTotal() { + if (!Any()) return; + HWY_ASSERT(root_ <= calls_); + const double elapsed = + static_cast<double>(elapsed_) / platform::InvariantTicksPerSecond(); + printf("TOTAL: %7.0f x run %6.2f s\n", static_cast<double>(calls_), + elapsed); + } + + private: + int64_t calls_ = 0; + int64_t root_ = 0; + uint64_t workers_ = 0; + uint64_t min_tasks_ = ~uint64_t{0}; + uint64_t max_tasks_ = 0; + uint64_t tasks_ = 0; + // both are wall time for root Run, otherwise CPU time. + timer::Ticks wait_before_ = 0; + timer::Ticks elapsed_ = 0; +}; +static_assert(sizeof(CallerAccumulator) == 64, ""); + +class PerCluster { + public: + CallerAccumulator& Get(size_t caller_idx) { + HWY_DASSERT(caller_idx < kMaxCallers); + callers_.Set(caller_idx); + return accumulators_[caller_idx]; + } + + template <class Func> + void ForeachCaller(Func&& func) { + callers_.Foreach([&](size_t caller_idx) { + func(caller_idx, accumulators_[caller_idx]); + }); + } + + // Returns indices (required for `StringTable::Name`) in descending order of + // elapsed time. 
+ std::vector<size_t> Sorted() { + std::vector<size_t> vec; + vec.reserve(kMaxCallers); + ForeachCaller([&](size_t caller_idx, CallerAccumulator&) { + vec.push_back(caller_idx); + }); + std::sort(vec.begin(), vec.end(), [&](size_t a, size_t b) { + return accumulators_[a] > accumulators_[b]; + }); + return vec; + } + + // Caller takes care of resetting `accumulators_`. + void ResetBits() { callers_ = hwy::BitSet<kMaxCallers>(); } + + private: + CallerAccumulator accumulators_[kMaxCallers]; + hwy::BitSet<kMaxCallers> callers_; +}; + +// Type-safe wrapper. +class Caller { + public: + Caller() : idx_(0) {} // `AddCaller` never returns 0. + explicit Caller(size_t idx) : idx_(idx) { HWY_DASSERT(idx < kMaxCallers); } + size_t Idx() const { return idx_; } + + private: + size_t idx_; +}; + +// Singleton, shared by all ThreadPool. +class Shared { + public: + static HWY_DLLEXPORT Shared& Get(); // Thread-safe. + + Stopwatch MakeStopwatch() const { return Stopwatch(timer_); } + Stopwatch& LastRootEnd() { return last_root_end_; } + + // Thread-safe. Calls with the same `name` return the same `Caller`. + Caller AddCaller(const char* name) { return Caller(callers_.Add(name)); } + + PerCluster& Cluster(size_t cluster_idx) { + HWY_DASSERT(cluster_idx < kMaxClusters); + return per_cluster_[cluster_idx]; + } + + // Called from the main thread via `Profiler::PrintResults`. + void PrintAndReset() { + // Start counting pools (= one per cluster) invoked by each caller. + size_t active_clusters[kMaxCallers] = {}; + per_cluster_[0].ForeachCaller( + [&](size_t caller_idx, CallerAccumulator& acc) { + active_clusters[caller_idx] = acc.Any(); + }); + // Reduce per-cluster accumulators into the first cluster. 
+ for (size_t cluster_idx = 1; cluster_idx < kMaxClusters; ++cluster_idx) { + per_cluster_[cluster_idx].ForeachCaller( + [&](size_t caller_idx, CallerAccumulator& acc) { + active_clusters[caller_idx] += acc.Any(); + per_cluster_[0].Get(caller_idx).AddFrom(acc); + acc = CallerAccumulator(); + }); + per_cluster_[cluster_idx].ResetBits(); + } + + CallerAccumulator total; + for (size_t caller_idx : per_cluster_[0].Sorted()) { + CallerAccumulator& acc = per_cluster_[0].Get(caller_idx); + total.AddFrom(acc); // must be before PrintAndReset. + acc.PrintAndReset(callers_.Name(caller_idx), active_clusters[caller_idx]); + } + total.PrintTotal(); + per_cluster_[0].ResetBits(); + } + + private: + Shared() // called via Get(). + : last_root_end_(timer_), + send_config(callers_.Add("SendConfig")), + dtor(callers_.Add("PoolDtor")), + print_stats(callers_.Add("PrintStats")) { + Profiler::Get().AddFunc(this, [this]() { PrintAndReset(); }); + // Can skip `RemoveFunc` because the singleton never dies. + } + + const Timer timer_; + Stopwatch last_root_end_; + + PerCluster per_cluster_[kMaxClusters]; + StringTable<kMaxCallers> callers_; + + public: + // Returned from `callers_.Add`: + Caller send_config; + Caller dtor; + Caller print_stats; +}; + +#else + +struct Stats { + void NotifyRunStatic(size_t, timer::Ticks) {} + void NotifyRunDynamic(size_t, size_t, size_t, timer::Ticks) {} + void NotifyThreadRun(size_t, timer::Ticks, size_t, timer::Ticks, + timer::Ticks) {} + void NotifyMainRun(size_t, timer::Ticks, timer::Ticks, timer::Ticks, + timer::Ticks) {} + void PrintAndReset(size_t, timer::Ticks) {} + void Reset(size_t = kMaxThreads) {} +}; + +struct Caller {}; + +class Shared { + public: + static HWY_DLLEXPORT Shared& Get(); // Thread-safe. 
+ + Stopwatch MakeStopwatch() const { return Stopwatch(timer_); } + + Caller AddCaller(const char*) { return Caller(); } + + private: + Shared() {} + + const Timer timer_; + + public: + Caller send_config; + Caller dtor; + Caller print_stats; }; -static_assert(sizeof(Config) == 8, ""); + +#endif // PROFILER_ENABLED // Per-worker state used by both main and worker threads. `ThreadFunc` // (threads) and `ThreadPool` (main) have a few additional members of their own. @@ -290,12 +783,33 @@ class alignas(HWY_ALIGNMENT) Worker { // HWY_ALIGNMENT bytes static constexpr auto kAcq = std::memory_order_acquire; static constexpr auto kRel = std::memory_order_release; + bool OwnsGlobalIdx() const { +#if PROFILER_ENABLED + if (global_idx_ >= profiler::kMaxWorkers) { + HWY_WARN("Windows-only bug? global_idx %zu >= %zu.", global_idx_, + profiler::kMaxWorkers); + } +#endif // PROFILER_ENABLED + // Across-cluster pool owns all except the main thread, which is reserved by + // profiler.cc. + if (cluster_idx_ == kAllClusters) return global_idx_ != 0; + // Within-cluster pool owns all except *its* main thread, because that is + // owned by the across-cluster pool. + return worker_ != 0; + } + public: Worker(const size_t worker, const size_t num_threads, - const Divisor64& div_workers) - : worker_(worker), num_threads_(num_threads), workers_(this - worker) { - (void)padding_; - + const PoolWorkerMapping mapping, const Divisor64& div_workers, + const Stopwatch& stopwatch) + : workers_(this - worker), + worker_(worker), + num_threads_(num_threads), + stopwatch_(stopwatch), + // If `num_threads == 0`, we might be in an inner pool and must use + // the `global_idx` we are currently running on. + global_idx_(num_threads == 0 ? 
Profiler::GlobalIdx() : mapping(worker)), + cluster_idx_(mapping.ClusterIdx()) { HWY_DASSERT(IsAligned(this, HWY_ALIGNMENT)); HWY_DASSERT(worker <= num_threads); const size_t num_workers = static_cast<size_t>(div_workers.GetDivisor()); @@ -313,6 +827,20 @@ class alignas(HWY_ALIGNMENT) Worker { // HWY_ALIGNMENT bytes victims_[i] = shuffled_iota.Next(victims_[i - 1], div_workers); HWY_DASSERT(victims_[i] != worker); } + + HWY_IF_CONSTEXPR(PROFILER_ENABLED) { + if (HWY_LIKELY(OwnsGlobalIdx())) { + Profiler::Get().ReserveWorker(global_idx_); + } + } + } + + ~Worker() { + HWY_IF_CONSTEXPR(PROFILER_ENABLED) { + if (HWY_LIKELY(OwnsGlobalIdx())) { + Profiler::Get().FreeWorker(global_idx_); + } + } } // Placement-newed by `WorkerLifecycle`, we do not expect any copying. @@ -320,15 +848,29 @@ class alignas(HWY_ALIGNMENT) Worker { // HWY_ALIGNMENT bytes Worker& operator=(const Worker&) = delete; size_t Index() const { return worker_; } + // For work stealing. Worker* AllWorkers() { return workers_; } const Worker* AllWorkers() const { return workers_; } size_t NumThreads() const { return num_threads_; } + size_t GlobalIdx() const { return global_idx_; } + size_t ClusterIdx() const { return cluster_idx_; } + + void SetStartTime() { stopwatch_.Reset(); } + timer::Ticks ElapsedTime() { return stopwatch_.Elapsed(); } + // ------------------------ Per-worker storage for `SendConfig` - Config LatchedConfig() const { return latched_; } - // For workers, but no harm if also called by main thread. - void LatchConfig(Config copy) { latched_ = copy; } + Config NextConfig() const { return next_config_; } + // Called during `SendConfig` by workers and now also the main thread. This + // avoids a separate `ThreadPool` member which risks going out of sync. 
+ void SetNextConfig(Config copy) { next_config_ = copy; } + + Exit GetExit() const { return exit_; } + void SetExit(Exit exit) { exit_ = exit; } + + uint32_t WorkerEpoch() const { return worker_epoch_; } + uint32_t AdvanceWorkerEpoch() { return ++worker_epoch_; } // ------------------------ Task assignment @@ -365,37 +907,43 @@ class alignas(HWY_ALIGNMENT) Worker { // HWY_ALIGNMENT bytes // ------------------------ Barrier: Main thread waits for workers + // For use by `HasReached` and `UntilReached`. const std::atomic<uint32_t>& Barrier() const { return barrier_epoch_; } - std::atomic<uint32_t>& MutableBarrier() { return barrier_epoch_; } + // Setting to `epoch` signals that the worker has reached the barrier. void StoreBarrier(uint32_t epoch) { barrier_epoch_.store(epoch, kRel); } private: - // Atomics first because arm7 clang otherwise makes them unaligned. - // Set by `SetRange`: - alignas(8) std::atomic<uint64_t> my_begin_; - alignas(8) std::atomic<uint64_t> my_end_; + std::atomic<uint64_t> my_begin_; + std::atomic<uint64_t> my_end_; + + Worker* const workers_; + const size_t worker_; + const size_t num_threads_; - // Use u32 to match futex.h. - alignas(4) std::atomic<uint32_t> wait_epoch_{0}; - alignas(4) std::atomic<uint32_t> barrier_epoch_{0}; // is reset + Stopwatch stopwatch_; // Reset by `SetStartTime`. + const size_t global_idx_; + const size_t cluster_idx_; + + // Use u32 to match futex.h. These must start at the initial value of + // `worker_epoch_`. + std::atomic<uint32_t> wait_epoch_{1}; + std::atomic<uint32_t> barrier_epoch_{1}; uint32_t num_victims_; // <= kPoolMaxVictims std::array<uint32_t, kMaxVictims> victims_; // Written and read by the same thread, hence not atomic. - Config latched_; - - const size_t worker_; - const size_t num_threads_; - Worker* const workers_; + Config next_config_; + Exit exit_ = Exit::kNone; + // thread_pool_test requires nonzero epoch. 
+ uint32_t worker_epoch_ = 1; - uint8_t padding_[HWY_ALIGNMENT - 64 - sizeof(victims_)]; + HWY_MEMBER_VAR_MAYBE_UNUSED uint8_t + padding_[HWY_ALIGNMENT - 56 - 6 * sizeof(void*) - sizeof(victims_)]; }; static_assert(sizeof(Worker) == HWY_ALIGNMENT, ""); -#pragma pack(pop) - // Creates/destroys `Worker` using preallocated storage. See comment at // `ThreadPool::worker_bytes_` for why we do not dynamically allocate. class WorkerLifecycle { // 0 bytes @@ -403,10 +951,13 @@ class WorkerLifecycle { // 0 bytes // Placement new for `Worker` into `storage` because its ctor requires // the worker index. Returns array of all workers. static Worker* Init(uint8_t* storage, size_t num_threads, - const Divisor64& div_workers) { - Worker* workers = new (storage) Worker(0, num_threads, div_workers); + PoolWorkerMapping mapping, const Divisor64& div_workers, + Shared& shared) { + Worker* workers = new (storage) + Worker(0, num_threads, mapping, div_workers, shared.MakeStopwatch()); for (size_t worker = 1; worker <= num_threads; ++worker) { - new (Addr(storage, worker)) Worker(worker, num_threads, div_workers); + new (Addr(storage, worker)) Worker(worker, num_threads, mapping, + div_workers, shared.MakeStopwatch()); // Ensure pointer arithmetic is the same (will be used in Destroy). HWY_DASSERT(reinterpret_cast<uintptr_t>(workers + worker) == reinterpret_cast<uintptr_t>(Addr(storage, worker))); @@ -430,10 +981,9 @@ class WorkerLifecycle { // 0 bytes } }; -#pragma pack(push, 1) // Stores arguments to `Run`: the function and range of task indices. Set by // the main thread, read by workers including the main thread. -class alignas(8) Tasks { +class Tasks { static constexpr auto kAcq = std::memory_order_acquire; // Signature of the (internal) function called from workers(s) for each @@ -456,7 +1006,8 @@ class alignas(8) Tasks { } // Assigns workers their share of `[begin, end)`. Called from the main - // thread; workers are initializing or spinning for a command. 
+ // thread; workers are initializing or waiting for a command. + // Negligible CPU time. static void DivideRangeAmongWorkers(const uint64_t begin, const uint64_t end, const Divisor64& div_workers, Worker* workers) { @@ -482,27 +1033,31 @@ class alignas(8) Tasks { } // Runs the worker's assigned range of tasks, plus work stealing if needed. - HWY_POOL_PROFILE void WorkerRun(Worker* worker) const { + void WorkerRun(Worker* worker, const Shared& shared, Stats& stats) const { if (NumTasks() > worker->NumThreads() + 1) { - WorkerRunWithStealing(worker); + WorkerRunDynamic(worker, shared, stats); } else { - WorkerRunSingle(worker->Index()); + WorkerRunStatic(worker, shared, stats); } } private: // Special case for <= 1 task per worker, where stealing is unnecessary. - void WorkerRunSingle(size_t worker) const { + void WorkerRunStatic(Worker* worker, const Shared& shared, + Stats& stats) const { const uint64_t begin = begin_.load(kAcq); const uint64_t end = end_.load(kAcq); HWY_DASSERT(begin <= end); + const size_t index = worker->Index(); - const uint64_t task = begin + worker; + const uint64_t task = begin + index; // We might still have more workers than tasks, so check first. if (HWY_LIKELY(task < end)) { const void* opaque = Opaque(); const RunFunc func = Func(); - func(opaque, task, worker); + Stopwatch stopwatch = shared.MakeStopwatch(); + func(opaque, task, index); + stats.NotifyRunStatic(index, stopwatch.Elapsed()); } } @@ -518,12 +1073,16 @@ class alignas(8) Tasks { // and perform work from others, as if they were that worker. This deals with // imbalances as they arise, but care is required to reduce contention. We // randomize the order in which threads choose victims to steal from. 
- HWY_POOL_PROFILE void WorkerRunWithStealing(Worker* worker) const { + void WorkerRunDynamic(Worker* worker, const Shared& shared, + Stats& stats) const { Worker* workers = worker->AllWorkers(); const size_t index = worker->Index(); const RunFunc func = Func(); const void* opaque = Opaque(); + size_t sum_tasks = 0; + size_t sum_stolen = 0; + timer::Ticks sum_d_func = 0; // For each worker in random order, starting with our own, attempt to do // all their work. for (uint32_t victim : worker->Victims()) { @@ -540,11 +1099,16 @@ class alignas(8) Tasks { hwy::Pause(); // Reduce coherency traffic while stealing. break; } + Stopwatch stopwatch = shared.MakeStopwatch(); // Pass the index we are actually running on; this is important // because it is the TLS index for user code. func(opaque, task, index); + sum_tasks++; + sum_stolen += worker != other_worker; + sum_d_func += stopwatch.Elapsed(); } } + stats.NotifyRunDynamic(index, sum_tasks, sum_stolen, sum_d_func); } size_t NumTasks() const { @@ -566,7 +1130,6 @@ class alignas(8) Tasks { std::atomic<const void*> opaque_; }; static_assert(sizeof(Tasks) == 16 + 2 * sizeof(void*), ""); -#pragma pack(pop) // ------------------------------ Threads wait, main wakes them @@ -592,8 +1155,6 @@ static_assert(sizeof(Tasks) == 16 + 2 * sizeof(void*), ""); // Futex: blocking reduces apparent CPU usage, but has higher wake latency. struct WaitBlock { - WaitType Type() const { return WaitType::kBlock; } - // Wakes all workers by storing the current `epoch`. void WakeWorkers(Worker* workers, const uint32_t epoch) const { HWY_DASSERT(epoch != 0); @@ -603,11 +1164,12 @@ struct WaitBlock { // Waits until `WakeWorkers(_, epoch)` has been called. 
template <class Spin> - void UntilWoken(const Worker* worker, const Spin& /*spin*/, - const uint32_t epoch) const { - HWY_DASSERT(worker->Index() != 0); // main is 0 - const Worker* workers = worker->AllWorkers(); + size_t UntilWoken(const Worker& worker, const Spin& /*spin*/) const { + HWY_DASSERT(worker.Index() != 0); // main is 0 + const uint32_t epoch = worker.WorkerEpoch(); + const Worker* workers = worker.AllWorkers(); BlockUntilDifferent(epoch - 1, workers[1].Waiter()); + return 1; // iterations } }; @@ -615,27 +1177,23 @@ struct WaitBlock { // one cache line and thus have it in a shared state, which means the store // will invalidate each of them, leading to more transactions than SpinSeparate. struct WaitSpin1 { - WaitType Type() const { return WaitType::kSpin1; } - void WakeWorkers(Worker* workers, const uint32_t epoch) const { workers[1].StoreWaiter(epoch); } + // Returns the number of spin-wait iterations. template <class Spin> - void UntilWoken(const Worker* worker, const Spin& spin, - const uint32_t epoch) const { - HWY_DASSERT(worker->Index() != 0); // main is 0 - const Worker* workers = worker->AllWorkers(); - (void)spin.UntilEqual(epoch, workers[1].Waiter()); - // TODO: store reps in stats. + size_t UntilWoken(const Worker& worker, const Spin& spin) const { + HWY_DASSERT(worker.Index() != 0); // main is 0 + const Worker* workers = worker.AllWorkers(); + const uint32_t epoch = worker.WorkerEpoch(); + return spin.UntilEqual(epoch, workers[1].Waiter()); } }; // Separate u32 per thread: more stores for the main thread, but each worker // only polls its own cache line, leading to fewer cache-coherency transactions. 
struct WaitSpinSeparate { - WaitType Type() const { return WaitType::kSpinSeparate; } - void WakeWorkers(Worker* workers, const uint32_t epoch) const { for (size_t thread = 0; thread < workers->NumThreads(); ++thread) { workers[1 + thread].StoreWaiter(epoch); @@ -643,385 +1201,71 @@ struct WaitSpinSeparate { } template <class Spin> - void UntilWoken(const Worker* worker, const Spin& spin, - const uint32_t epoch) const { - HWY_DASSERT(worker->Index() != 0); // main is 0 - (void)spin.UntilEqual(epoch, worker->Waiter()); - // TODO: store reps in stats. + size_t UntilWoken(const Worker& worker, const Spin& spin) const { + HWY_DASSERT(worker.Index() != 0); // main is 0 + const uint32_t epoch = worker.WorkerEpoch(); + return spin.UntilEqual(epoch, worker.Waiter()); } }; -// ------------------------------ Barrier: Main thread waits for workers - -// Single atomic counter. TODO: remove if not competitive? -template <size_t kShards> -class BarrierCounter { - static_assert(kShards == 1 || kShards == 2 || kShards == 4, ""); // pow2 - - public: - BarrierType Type() const { - return kShards == 1 ? BarrierType::kCounter1 - : kShards == 2 ? BarrierType::kCounter2 - : BarrierType::kCounter4; - } - - void Reset(Worker* workers) const { - for (size_t i = 0; i < kShards; ++i) { - // Use last worker(s) to avoid contention with other stores to the Worker. - // Note that there are kMaxThreads + 1 workers, hence i == 0 is the last. 
- workers[kMaxThreads - i].StoreBarrier(0); - } - } - - template <class Spin> - void WorkerReached(Worker* worker, const Spin& /*spin*/, - uint32_t /*epoch*/) const { - Worker* workers = worker->AllWorkers(); - const size_t shard = worker->Index() & (kShards - 1); - const auto kAcqRel = std::memory_order_acq_rel; - workers[kMaxThreads - shard].MutableBarrier().fetch_add(1, kAcqRel); - } - - template <class Spin> - void UntilReached(size_t num_threads, const Worker* workers, const Spin& spin, - uint32_t /*epoch*/) const { - HWY_IF_CONSTEXPR(kShards == 1) { - (void)spin.UntilEqual(static_cast<uint32_t>(num_threads), - workers[kMaxThreads - 0].Barrier()); - } - HWY_IF_CONSTEXPR(kShards == 2) { - const auto kAcq = std::memory_order_acquire; - for (;;) { - hwy::Pause(); - const uint64_t sum = workers[kMaxThreads - 0].Barrier().load(kAcq) + - workers[kMaxThreads - 1].Barrier().load(kAcq); - if (sum == num_threads) break; - } - } - HWY_IF_CONSTEXPR(kShards == 4) { - const auto kAcq = std::memory_order_acquire; - for (;;) { - hwy::Pause(); - const uint64_t sum = workers[kMaxThreads - 0].Barrier().load(kAcq) + - workers[kMaxThreads - 1].Barrier().load(kAcq) + - workers[kMaxThreads - 2].Barrier().load(kAcq) + - workers[kMaxThreads - 3].Barrier().load(kAcq); - if (sum == num_threads) break; - } - } - } -}; - -// As with the wait, a store-release of the same local epoch counter serves as a -// "have arrived" flag that does not require resetting. - -// Main thread loops over each worker. -class BarrierOrdered { - public: - BarrierType Type() const { return BarrierType::kOrdered; } - - void Reset(Worker* /*workers*/) const {} - - template <class Spin> - void WorkerReached(Worker* worker, const Spin&, uint32_t epoch) const { - HWY_DASSERT(worker->Index() != 0); // main is 0 - worker->StoreBarrier(epoch); +// Calls unrolled code selected by all config enums. +template <class Func, typename... Args> +HWY_INLINE void CallWithConfig(const Config& config, Func&& func, + Args&&... 
args) { + switch (config.wait_type) { + case WaitType::kBlock: + return func(SpinPause(), WaitBlock(), std::forward<Args>(args)...); + case WaitType::kSpin1: + return CallWithSpin(config.spin_type, func, WaitSpin1(), + std::forward<Args>(args)...); + case WaitType::kSpinSeparate: + return CallWithSpin(config.spin_type, func, WaitSpinSeparate(), + std::forward<Args>(args)...); + case WaitType::kSentinel: + HWY_UNREACHABLE; } +} - template <class Spin> - void UntilReached(size_t num_threads, const Worker* workers, const Spin& spin, - uint32_t epoch) const { - for (size_t i = 0; i < num_threads; ++i) { - (void)spin.UntilEqual(epoch, workers[1 + i].Barrier()); - } - } -}; +// ------------------------------ Barrier: Main thread waits for workers -// Leader threads wait for others in the group, main thread loops over leaders. -template <size_t kGroupSize> -class BarrierGroup { +// Similar to `WaitSpinSeparate`, a store-release of the same local epoch +// counter serves as a "have arrived" flag that does not require resetting. +class Barrier { public: - BarrierType Type() const { - return kGroupSize == 2 ? BarrierType::kGroup2 : BarrierType::kGroup4; + void WorkerReached(Worker& worker, uint32_t epoch) const { + HWY_DASSERT(worker.Index() != 0); // main is 0 + worker.StoreBarrier(epoch); } - void Reset(Worker* /*workers*/) const {} - - template <class Spin> - void WorkerReached(Worker* worker, const Spin& spin, uint32_t epoch) const { - const size_t w_idx = worker->Index(); - HWY_DASSERT(w_idx != 0); // main is 0 - // NOTE: the first worker is 1, but our leader election scheme requires a - // 0-based index. - const size_t rel_idx = w_idx - 1; - - Worker* workers = worker->AllWorkers(); - const size_t num_workers = 1 + workers->NumThreads(); - - // Leaders (the first worker of each group) wait for all others in their - // group before marking themselves. 
- if (rel_idx % kGroupSize == 0) { - for (size_t i = w_idx + 1; i < HWY_MIN(w_idx + kGroupSize, num_workers); - ++i) { - // No + 1 here: i is derived from w_idx which is the actual index. - (void)spin.UntilEqual(epoch, workers[i].Barrier()); - } - } - worker->StoreBarrier(epoch); + // Returns true if `worker` (can be the main thread) reached the barrier. + bool HasReached(const Worker* worker, uint32_t epoch) const { + const uint32_t barrier = worker->Barrier().load(std::memory_order_acquire); + HWY_DASSERT(barrier <= epoch); + return barrier == epoch; } + // Main thread loops over each worker. A "group of 2 or 4" barrier was not + // competitive on Skylake, Granite Rapids and Zen5. template <class Spin> - void UntilReached(size_t num_threads, const Worker* workers, const Spin& spin, + void UntilReached(size_t num_threads, Worker* workers, const Spin& spin, uint32_t epoch) const { - for (size_t i = 0; i < num_threads; i += kGroupSize) { - (void)spin.UntilEqual(epoch, workers[1 + i].Barrier()); - } - } -}; - -// ------------------------------ Inlining policy classes - -// We want to inline the various spin/wait/barrier policy classes into larger -// code sections because both the main and worker threads use two or three of -// them at a time, and we do not want separate branches around each. -// -// We generate code for three combinations of the enums, hence implement -// composable adapters that 'add' `Wait` and `Barrier` arguments. `spin.h` -// provides a `CallWithSpin`, hence it is the outermost. C++11 lacks generic -// lambdas, so we implement these as classes. -template <class Func> -class FunctorAddWait { - public: - FunctorAddWait(WaitType wait_type, Func&& func) - : func_(std::forward<Func>(func)), wait_type_(wait_type) {} + workers[0].StoreBarrier(epoch); // for main thread HasReached. 
- template <class Spin> - HWY_INLINE void operator()(const Spin& spin) { - switch (wait_type_) { - case WaitType::kBlock: - return func_(spin, WaitBlock()); - case WaitType::kSpin1: - return func_(spin, WaitSpin1()); - case WaitType::kSpinSeparate: - return func_(spin, WaitSpinSeparate()); - default: - HWY_UNREACHABLE; + for (size_t i = 0; i < num_threads; ++i) { + // TODO: log number of spin-wait iterations. + (void)spin.UntilEqual(epoch, workers[1 + i].Barrier()); } } - - private: - Func&& func_; - WaitType wait_type_; }; -template <class Func> -class FunctorAddBarrier { +// In debug builds, detects when functions are re-entered. +class BusyFlag { public: - FunctorAddBarrier(BarrierType barrier_type, Func&& func) - : func_(std::forward<Func>(func)), barrier_type_(barrier_type) {} - - template <class Wait> - HWY_INLINE void operator()(const Wait& wait) { - switch (barrier_type_) { - case BarrierType::kOrdered: - return func_(wait, BarrierOrdered()); - case BarrierType::kCounter1: - return func_(wait, BarrierCounter<1>()); - case BarrierType::kCounter2: - return func_(wait, BarrierCounter<2>()); - case BarrierType::kCounter4: - return func_(wait, BarrierCounter<4>()); - case BarrierType::kGroup2: - return func_(wait, BarrierGroup<2>()); - case BarrierType::kGroup4: - return func_(wait, BarrierGroup<4>()); - default: - HWY_UNREACHABLE; - } - } - template <class Spin, class Wait> - HWY_INLINE void operator()(const Spin& spin, const Wait& wait) { - switch (barrier_type_) { - case BarrierType::kOrdered: - return func_(spin, wait, BarrierOrdered()); - case BarrierType::kCounter1: - return func_(spin, wait, BarrierCounter<1>()); - case BarrierType::kCounter2: - return func_(spin, wait, BarrierCounter<2>()); - case BarrierType::kCounter4: - return func_(spin, wait, BarrierCounter<4>()); - case BarrierType::kGroup2: - return func_(spin, wait, BarrierGroup<2>()); - case BarrierType::kGroup4: - return func_(spin, wait, BarrierGroup<4>()); - default: - HWY_UNREACHABLE; - } - 
} + void Set() { HWY_DASSERT(!busy_.test_and_set()); } + void Clear() { HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) busy_.clear(); } private: - Func&& func_; - BarrierType barrier_type_; -}; - -// Calls unrolled code selected by all 3 enums. -template <class Func> -HWY_INLINE void CallWithConfig(const Config& config, Func&& func) { - CallWithSpin( - config.spin_type, - FunctorAddWait<FunctorAddBarrier<Func>>( - config.wait_type, FunctorAddBarrier<Func>(config.barrier_type, - std::forward<Func>(func)))); -} - -// For `WorkerAdapter`, `Spin` and `Wait`. -template <class Func> -HWY_INLINE void CallWithSpinWait(const Config& config, Func&& func) { - CallWithSpin( - config.spin_type, - FunctorAddWait<Func>(config.wait_type, std::forward<Func>(func))); -} - -// For `WorkerAdapter`, only `Spin` and `Barrier`. -template <class Func> -HWY_INLINE void CallWithSpinBarrier(const Config& config, Func&& func) { - CallWithSpin( - config.spin_type, - FunctorAddBarrier<Func>(config.barrier_type, std::forward<Func>(func))); -} - -// ------------------------------ Adapters - -// Logic of the main and worker threads, again packaged as classes because -// C++11 lacks generic lambdas, called by `CallWith*`. - -class MainAdapter { - public: - MainAdapter(Worker* main, const Tasks* tasks) : main_(main), tasks_(tasks) { - HWY_DASSERT(main_ == main->AllWorkers()); // main is first. - } - - void SetEpoch(uint32_t epoch) { epoch_ = epoch; } - - template <class Spin, class Wait, class Barrier> - HWY_POOL_PROFILE void operator()(const Spin& spin, const Wait& wait, - const Barrier& barrier) const { - Worker* workers = main_->AllWorkers(); - const size_t num_threads = main_->NumThreads(); - barrier.Reset(workers); - - wait.WakeWorkers(workers, epoch_); - // Threads might still be starting up and wake up late, but we wait for - // them at the barrier below. - - // Also perform work on the main thread before the barrier. 
- tasks_->WorkerRun(main_); - - // Waits until all *threads* (not the main thread, because it already knows - // it is here) called `WorkerReached`. All `barrier` types use spinning. - - barrier.UntilReached(num_threads, workers, spin, epoch_); - - // Threads may already be waiting `UntilWoken`, which serves as the - // 'release' phase of the barrier. - } - - private: - Worker* const main_; - const Tasks* const tasks_; - uint32_t epoch_; -}; - -class WorkerAdapter { - public: - explicit WorkerAdapter(Worker* worker) : worker_(worker) {} - - void SetEpoch(uint32_t epoch) { epoch_ = epoch; } - - private: - template <class Spin, class Wait> - HWY_INLINE void CallImpl(hwy::SizeTag<1> /* second_param_type_tag */, - const Spin& spin, const Wait& wait) const { - wait.UntilWoken(worker_, spin, epoch_); - } - template <class Spin, class Barrier> - HWY_INLINE void CallImpl(hwy::SizeTag<2> /* second_param_type_tag */, - const Spin& spin, const Barrier& barrier) const { - barrier.WorkerReached(worker_, spin, epoch_); - } - - public: - // Split into separate wait/barrier functions because `ThreadFunc` latches - // the config in between them. - template <class Spin, class Param2> - hwy::EnableIf<hwy::IsSameEither< - hwy::RemoveCvRef<decltype(hwy::RemoveCvRef<Param2>().Type())>, WaitType, - BarrierType>()> - operator()(const Spin& spin, const Param2& wait_or_barrier) const { - // Use tag dispatch to work around template argument deduction error with - // MSVC 2019. - - constexpr size_t kType = - hwy::IsSame< - hwy::RemoveCvRef<decltype(hwy::RemoveCvRef<Param2>().Type())>, - WaitType>() ? 1 : 2; - - // Using this->CallImpl below ensures that WorkerAdapter::CallImpl is - // selected and avoids unwanted argument dependent lookup. 
- this->CallImpl(hwy::SizeTag<kType>(), spin, wait_or_barrier); - } - - private: - Worker* const worker_; - uint32_t epoch_; -}; - -// Could also be a lambda in ThreadPool ctor, but this allows annotating with -// `HWY_POOL_PROFILE` so we can more easily inspect the generated code. -class ThreadFunc { - public: - ThreadFunc(Worker* worker, Tasks* tasks, Config config) - : worker_(worker), - tasks_(tasks), - config_(config), - worker_adapter_(worker_) { - worker->LatchConfig(config); - } - - HWY_POOL_PROFILE void operator()() { - // Ensure main thread's writes are visible (synchronizes with fence in - // `WorkerLifecycle::Init`). - std::atomic_thread_fence(std::memory_order_acquire); - - HWY_DASSERT(worker_->Index() != 0); // main is 0 - SetThreadName("worker%03zu", static_cast<int>(worker_->Index() - 1)); - hwy::Profiler::InitThread(); - - // Initialization must match pre-increment in `MainAdapter::SetEpoch`. - // Loop termination is triggered by `~ThreadPool`. - for (uint32_t epoch = 1;; ++epoch) { - worker_adapter_.SetEpoch(epoch); - CallWithSpinWait(config_, worker_adapter_); - - // Must happen before `WorkerRun` because `SendConfig` writes it there. - config_ = worker_->LatchedConfig(); - - tasks_->WorkerRun(worker_); - - // Notify barrier after `WorkerRun`. - CallWithSpinBarrier(config_, worker_adapter_); - - // Check after notifying the barrier, otherwise the main thread deadlocks. - if (HWY_UNLIKELY(config_.exit)) break; - } - } - - private: - Worker* const worker_; - Tasks* const tasks_; - - Config config_; - WorkerAdapter worker_adapter_; + std::atomic_flag busy_ = ATOMIC_FLAG_INIT; }; } // namespace pool @@ -1034,11 +1278,11 @@ class ThreadFunc { // that threads do not schedule new work themselves. This allows us to avoid // queues and only store a counter plus the current task. The latter is a // pointer to a lambda function, without the allocation/indirection required for -// std::function. +// `std::function`. 
// // To reduce fork/join latency, we choose an efficient barrier, optionally -// enable spin-waits via SetWaitMode, and avoid any mutex/lock. We largely even -// avoid atomic RMW operations (LOCK prefix): currently for the wait and +// enable spin-waits via `SetWaitMode`, and avoid any mutex/lock. We largely +// even avoid atomic RMW operations (LOCK prefix): currently for the wait and // barrier, in future hopefully also for work stealing. // // To eliminate false sharing and enable reasoning about cache line traffic, the @@ -1046,6 +1290,17 @@ class ThreadFunc { // // For load-balancing, we use work stealing in random order. class alignas(HWY_ALIGNMENT) ThreadPool { + // Used to initialize `num_threads_` from the ctor argument. + static size_t ClampedNumThreads(size_t num_threads) { + // Upper bound is required for `worker_bytes_`. + if (HWY_UNLIKELY(num_threads > pool::kMaxThreads)) { + HWY_WARN("ThreadPool: clamping num_threads %zu to %zu.", num_threads, + pool::kMaxThreads); + num_threads = pool::kMaxThreads; + } + return num_threads; + } + public: // This typically includes hyperthreads, hence it is a loose upper bound. // -1 because these are in addition to the main thread. @@ -1061,49 +1316,61 @@ class alignas(HWY_ALIGNMENT) ThreadPool { // `num_threads` is the number of *additional* threads to spawn, which should // not exceed `MaxThreads()`. Note that the main thread also performs work. - explicit ThreadPool(size_t num_threads) - : have_timer_stop_(platform::HaveTimerStop(cpu100_)), - num_threads_(ClampedNumThreads(num_threads)), + // `mapping` indicates how to map local worker_idx to global. + ThreadPool(size_t num_threads, + PoolWorkerMapping mapping = PoolWorkerMapping()) + : num_threads_(ClampedNumThreads(num_threads)), div_workers_(1 + num_threads_), + shared_(pool::Shared::Get()), // on first call, calls ReserveWorker(0)! 
workers_(pool::WorkerLifecycle::Init(worker_bytes_, num_threads_, - div_workers_)), - // Assign main thread the first worker slot (it used to be the last). - main_adapter_(workers_ + 0, &tasks_) { + mapping, div_workers_, shared_)) { // Leaves the default wait mode as `kBlock`, which means futex, because // spinning only makes sense when threads are pinned and wake latency is // important, so it must explicitly be requested by calling `SetWaitMode`. for (PoolWaitMode mode : {PoolWaitMode::kSpin, PoolWaitMode::kBlock}) { wait_mode_ = mode; // for AutoTuner AutoTuner().SetCandidates( - pool::Config::AllCandidates(mode, num_threads_)); + pool::Config::AllCandidates(mode)); + } + + // Skip empty pools because they do not update stats anyway. + if (num_threads_ > 0) { + Profiler::Get().AddFunc(this, [this]() { PrintStats(); }); } - config_ = AutoTuner().Candidates()[0]; threads_.reserve(num_threads_); for (size_t thread = 0; thread < num_threads_; ++thread) { threads_.emplace_back( - pool::ThreadFunc(workers_ + 1 + thread, &tasks_, config_)); + ThreadFunc(workers_[1 + thread], tasks_, shared_, stats_)); } - // No barrier is required here because wakeup works regardless of the - // relative order of wake and wait. + // Threads' `Config` defaults to spinning. Change to `kBlock` (see above). + // This also ensures all threads have started before we return, so that + // startup latency is billed to the ctor, not the first `Run`. + SendConfig(AutoTuner().Candidates()[0]); } - // Waits for all threads to exit. + // If we created threads, waits for them all to exit. ~ThreadPool() { // There is no portable way to request threads to exit like `ExitThread` on // Windows, otherwise we could call that from `Run`. Instead, we must cause - // the thread to wake up and exit. We can use the same `SendConfig` - // mechanism as `SetWaitMode`. - pool::Config copy = config_; - copy.exit = true; - SendConfig(copy); + // the thread to wake up and exit. We can just use `Run`. 
+ (void)RunWithoutAutotune( + 0, NumWorkers(), shared_.dtor, + [this](HWY_MAYBE_UNUSED uint64_t task, size_t worker) { + HWY_DASSERT(task == worker); + workers_[worker].SetExit(Exit::kThread); + }); for (std::thread& thread : threads_) { HWY_DASSERT(thread.joinable()); thread.join(); } + if (num_threads_ > 0) { + Profiler::Get().RemoveFunc(this); + } + pool::WorkerLifecycle::Destroy(workers_, num_threads_); } @@ -1126,12 +1393,16 @@ class alignas(HWY_ALIGNMENT) ThreadPool { : AutoTuner().NextConfig()); } - // For printing which are in use. - pool::Config config() const { return config_; } + // For printing which is in use. + pool::Config config() const { return workers_[0].NextConfig(); } bool AutoTuneComplete() const { return AutoTuner().Best(); } Span<CostDistribution> AutoTuneCosts() { return AutoTuner().Costs(); } + static pool::Caller AddCaller(const char* name) { + return pool::Shared::Get().AddCaller(name); + } + // parallel-for: Runs `closure(task, worker)` on workers for every `task` in // `[begin, end)`. Note that the unit of work should be large enough to // amortize the function call overhead, but small enough that each worker @@ -1140,7 +1411,204 @@ class alignas(HWY_ALIGNMENT) ThreadPool { // Not thread-safe - concurrent parallel-for in the same `ThreadPool` are // forbidden unless `NumWorkers() == 1` or `end <= begin + 1`. template <class Closure> + void Run(uint64_t begin, uint64_t end, pool::Caller caller, + const Closure& closure) { + AutoTuneT& auto_tuner = AutoTuner(); + // Already finished tuning: run without time measurement. + if (HWY_LIKELY(auto_tuner.Best())) { + // Don't care whether threads ran, we are done either way. + (void)RunWithoutAutotune(begin, end, caller, closure); + return; + } + + // Not yet finished: measure time and notify autotuner. + Stopwatch stopwatch(shared_.MakeStopwatch()); + // Skip update if threads didn't actually run. 
+ if (!RunWithoutAutotune(begin, end, caller, closure)) return; + auto_tuner.NotifyCost(stopwatch.Elapsed()); + + pool::Config next = auto_tuner.NextConfig(); // may be overwritten below + if (auto_tuner.Best()) { // just finished + next = *auto_tuner.Best(); + HWY_IF_CONSTEXPR(pool::kVerbosity >= 1) { + const size_t idx_best = static_cast<size_t>( + auto_tuner.Best() - auto_tuner.Candidates().data()); + HWY_DASSERT(idx_best < auto_tuner.Costs().size()); + auto& AT = auto_tuner.Costs()[idx_best]; + const double best_cost = AT.EstimateCost(); + HWY_DASSERT(best_cost > 0.0); // will divide by this below + + Stats s_ratio; + for (size_t i = 0; i < auto_tuner.Costs().size(); ++i) { + if (i == idx_best) continue; + const double cost = auto_tuner.Costs()[i].EstimateCost(); + s_ratio.Notify(static_cast<float>(cost / best_cost)); + } + + fprintf(stderr, + "Pool %3zu: %s %8.0f +/- %6.0f. Gain %.2fx [%.2fx, %.2fx]\n", + NumWorkers(), auto_tuner.Best()->ToString().c_str(), best_cost, + AT.Stddev(), s_ratio.GeometricMean(), + static_cast<double>(s_ratio.Min()), + static_cast<double>(s_ratio.Max())); + } + } + SendConfig(next); + } + + // Backward-compatible version without Caller. + template <class Closure> void Run(uint64_t begin, uint64_t end, const Closure& closure) { + Run(begin, end, pool::Caller(), closure); + } + + private: + // Called via `CallWithConfig`. + struct MainWakeAndBarrier { + template <class Spin, class Wait> + void operator()(const Spin& spin, const Wait& wait, pool::Worker& main, + const pool::Tasks& tasks, const pool::Shared& shared, + pool::Stats& stats) const { + const pool::Barrier barrier; + pool::Worker* workers = main.AllWorkers(); + HWY_DASSERT(&main == main.AllWorkers()); // main is first. 
+ const size_t num_threads = main.NumThreads(); + const uint32_t epoch = main.AdvanceWorkerEpoch(); + + HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { + for (size_t i = 0; i < 1 + num_threads; ++i) { + HWY_DASSERT(!barrier.HasReached(workers + i, epoch)); + } + } + + Stopwatch stopwatch(shared.MakeStopwatch()); + const timer::Ticks t_before_wake = stopwatch.Origin(); + wait.WakeWorkers(workers, epoch); + const timer::Ticks d_wake = stopwatch.Elapsed(); + + // Also perform work on the main thread before the barrier. + tasks.WorkerRun(&main, shared, stats); + const timer::Ticks d_run = stopwatch.Elapsed(); + + // Spin-waits until all worker *threads* (not `main`, because it already + // knows it is here) called `WorkerReached`. + barrier.UntilReached(num_threads, workers, spin, epoch); + const timer::Ticks d_barrier = stopwatch.Elapsed(); + stats.NotifyMainRun(main.NumThreads(), t_before_wake, d_wake, d_run, + d_barrier); + + HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { + for (size_t i = 0; i < 1 + num_threads; ++i) { + HWY_DASSERT(barrier.HasReached(workers + i, epoch)); + } + } + + // Threads are or will soon be waiting `UntilWoken`, which serves as the + // 'release' phase of the barrier. + } + }; + + // Called by `std::thread`. Could also be a lambda. + class ThreadFunc { + // Functor called by `CallWithConfig`. Loops until `SendConfig` changes the + // Spin or Wait policy or the pool is destroyed. + struct WorkerLoop { + template <class Spin, class Wait> + void operator()(const Spin& spin, const Wait& wait, pool::Worker& worker, + pool::Tasks& tasks, const pool::Shared& shared, + pool::Stats& stats) const { + do { + // Main worker also calls this, so their epochs match. 
+ const uint32_t epoch = worker.AdvanceWorkerEpoch(); + + Stopwatch stopwatch(shared.MakeStopwatch()); + + const size_t wait_reps = wait.UntilWoken(worker, spin); + const timer::Ticks d_wait = stopwatch.Elapsed(); + const timer::Ticks t_before_run = stopwatch.Origin(); + + tasks.WorkerRun(&worker, shared, stats); + const timer::Ticks d_run = stopwatch.Elapsed(); + stats.NotifyThreadRun(worker.Index(), d_wait, wait_reps, t_before_run, + d_run); + + // Notify barrier after `WorkerRun`. Note that we cannot send an + // after-barrier timestamp, see above. + pool::Barrier().WorkerReached(worker, epoch); + // Check after `WorkerReached`, otherwise the main thread deadlocks. + } while (worker.GetExit() == Exit::kNone); + } + }; + + public: + ThreadFunc(pool::Worker& worker, pool::Tasks& tasks, + const pool::Shared& shared, pool::Stats& stats) + : worker_(worker), tasks_(tasks), shared_(shared), stats_(stats) {} + + void operator()() { + // Ensure main thread's writes are visible (synchronizes with fence in + // `WorkerLifecycle::Init`). + std::atomic_thread_fence(std::memory_order_acquire); + + HWY_DASSERT(worker_.Index() != 0); // main is 0 + SetThreadName("worker%03zu", static_cast<int>(worker_.Index() - 1)); + + worker_.SetStartTime(); + Profiler& profiler = Profiler::Get(); + profiler.SetGlobalIdx(worker_.GlobalIdx()); + // No Zone here because it would only exit after `GetExit`, which may be + // after the main thread's `PROFILER_END_ROOT_RUN`, and thus too late to + // be counted. Instead, `ProfilerFunc` records the elapsed time. + + // Loop termination via `GetExit` is triggered by `~ThreadPool`. + for (;;) { + // Uses the initial config, or the last one set during WorkerRun. + CallWithConfig(worker_.NextConfig(), WorkerLoop(), worker_, tasks_, + shared_, stats_); + + // Exit or reset the flag and return to WorkerLoop with a new config. 
+ if (worker_.GetExit() == Exit::kThread) break; + worker_.SetExit(Exit::kNone); + } + + profiler.SetGlobalIdx(~size_t{0}); + + // Defer `FreeWorker` until workers are destroyed to ensure the profiler + // is not still using the worker. + } + + private: + pool::Worker& worker_; + pool::Tasks& tasks_; + const pool::Shared& shared_; + pool::Stats& stats_; + }; + + void PrintStats() { + // Total run time from all non-main threads. + std::atomic<timer::Ticks> sum_thread_elapsed{0}; + (void)RunWithoutAutotune( + 0, NumWorkers(), shared_.print_stats, + [this, &sum_thread_elapsed](HWY_MAYBE_UNUSED uint64_t task, + size_t worker) { + HWY_DASSERT(task == worker); + // Skip any main thread(s) because they did not init the stopwatch. + if (worker != 0) { + sum_thread_elapsed.fetch_add(workers_[worker].ElapsedTime()); + } + }); + const timer::Ticks thread_total = + sum_thread_elapsed.load(std::memory_order_acquire); + stats_.PrintAndReset(num_threads_, thread_total); + } + + // Returns whether threads were used. If not, there is no need to update + // the autotuner config. + template <class Closure> + bool RunWithoutAutotune(uint64_t begin, uint64_t end, pool::Caller caller, + const Closure& closure) { + pool::Worker& main = workers_[0]; + const size_t num_tasks = static_cast<size_t>(end - begin); const size_t num_workers = NumWorkers(); @@ -1150,11 +1618,17 @@ class alignas(HWY_ALIGNMENT) ThreadPool { for (uint64_t task = begin; task < end; ++task) { closure(task, /*worker=*/0); } - return; + return false; } - SetBusy(); + busy_.Set(); + +#if PROFILER_ENABLED const bool is_root = PROFILER_IS_ROOT_RUN(); + Stopwatch stopwatch(shared_.MakeStopwatch()); + const timer::Ticks wait_before = + is_root ? 
shared_.LastRootEnd().Elapsed() : 0; +#endif tasks_.Set(begin, end, closure); @@ -1163,116 +1637,43 @@ class alignas(HWY_ALIGNMENT) ThreadPool { pool::Tasks::DivideRangeAmongWorkers(begin, end, div_workers_, workers_); } - main_adapter_.SetEpoch(++epoch_); - - AutoTuneT& auto_tuner = AutoTuner(); - if (HWY_LIKELY(auto_tuner.Best())) { - CallWithConfig(config_, main_adapter_); - if (is_root) { - PROFILER_END_ROOT_RUN(); - } - ClearBusy(); - } else { - const uint64_t t0 = timer::Start(); - CallWithConfig(config_, main_adapter_); - const uint64_t t1 = have_timer_stop_ ? timer::Stop() : timer::Start(); - auto_tuner.NotifyCost(t1 - t0); - if (is_root) { - PROFILER_END_ROOT_RUN(); - } - ClearBusy(); // before `SendConfig` - if (auto_tuner.Best()) { // just finished - HWY_IF_CONSTEXPR(pool::kVerbosity >= 1) { - const size_t idx_best = static_cast<size_t>( - auto_tuner.Best() - auto_tuner.Candidates().data()); - HWY_DASSERT(idx_best < auto_tuner.Costs().size()); - auto& AT = auto_tuner.Costs()[idx_best]; - const double best_cost = AT.EstimateCost(); - HWY_DASSERT(best_cost > 0.0); // will divide by this below - - Stats s_ratio; - for (size_t i = 0; i < auto_tuner.Costs().size(); ++i) { - if (i == idx_best) continue; - const double cost = auto_tuner.Costs()[i].EstimateCost(); - s_ratio.Notify(static_cast<float>(cost / best_cost)); - } - - fprintf(stderr, " %s %5.0f +/- %4.0f. Gain %.2fx [%.2fx, %.2fx]\n", - auto_tuner.Best()->ToString().c_str(), best_cost, AT.Stddev(), - s_ratio.GeometricMean(), s_ratio.Min(), s_ratio.Max()); - } - SendConfig(*auto_tuner.Best()); - } else { - HWY_IF_CONSTEXPR(pool::kVerbosity >= 2) { - fprintf(stderr, " %s %5lu\n", config_.ToString().c_str(), t1 - t0); - } - SendConfig(auto_tuner.NextConfig()); - } + // Runs `MainWakeAndBarrier` with the first worker slot. 
+ CallWithConfig(config(), MainWakeAndBarrier(), main, tasks_, shared_, + stats_); + +#if PROFILER_ENABLED + pool::CallerAccumulator& acc = + shared_.Cluster(main.ClusterIdx()).Get(caller.Idx()); + acc.Add(num_tasks, num_workers, is_root, wait_before, stopwatch.Elapsed()); + if (is_root) { + PROFILER_END_ROOT_RUN(); + shared_.LastRootEnd().Reset(); } - } +#else + (void)caller; +#endif - private: - // Used to initialize ThreadPool::num_threads_ from its ctor argument. - static size_t ClampedNumThreads(size_t num_threads) { - // Upper bound is required for `worker_bytes_`. - if (HWY_UNLIKELY(num_threads > pool::kMaxThreads)) { - HWY_WARN("ThreadPool: clamping num_threads %zu to %zu.", num_threads, - pool::kMaxThreads); - num_threads = pool::kMaxThreads; - } - return num_threads; + busy_.Clear(); + return true; } - // Debug-only re-entrancy detection. - void SetBusy() { HWY_DASSERT(!busy_.test_and_set()); } - void ClearBusy() { HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) busy_.clear(); } - - // Two-phase barrier protocol for sending `copy` to workers, similar to the - // 'quiescent state' used in RCU. - // - // Phase 1: - // - Main wakes threads using the old config. - // - Threads latch `copy` during `WorkerRun`. - // - Threads notify a barrier and wait for the next wake using the old config. - // - // Phase 2: - // - Main wakes threads still using the old config. - // - Threads switch their config to their latched `copy`. - // - Threads notify a barrier and wait, BOTH with the new config. - // - Main thread switches to `copy` for the next wake. - HWY_NOINLINE void SendConfig(pool::Config copy) { - if (NumWorkers() == 1) { - config_ = copy; - return; - } - - SetBusy(); - - const auto closure = [this, copy](uint64_t task, size_t worker) { - (void)task; - HWY_DASSERT(task == worker); // one task per worker - workers_[worker].LatchConfig(copy); - }; - - tasks_.Set(0, NumWorkers(), closure); - // Same config as workers are *currently* using. 
- main_adapter_.SetEpoch(++epoch_); - CallWithConfig(config_, main_adapter_); - // All workers have latched `copy` and are waiting with the old config. - - // No-op task; will not be called because begin == end. - tasks_.Set(0, 0, [](uint64_t /*task*/, size_t /*worker*/) {}); - // Threads are waiting using the old config, but will switch after waking, - // which means we must already use the new barrier. - pool::Config new_barrier = config_; - new_barrier.barrier_type = copy.barrier_type; - main_adapter_.SetEpoch(++epoch_); - CallWithConfig(new_barrier, main_adapter_); - // All have woken and are, or will be, waiting per the *new* config. Now we + // Sends `next_config` to workers: + // - Main wakes threads using the current config. + // - Threads copy `next_config` into their `Worker` during `WorkerRun`. + // - Threads notify the (same) barrier and already wait for the next wake + // using `next_config`. + HWY_NOINLINE void SendConfig(pool::Config next_config) { + (void)RunWithoutAutotune( + 0, NumWorkers(), shared_.send_config, + [this, next_config](HWY_MAYBE_UNUSED uint64_t task, size_t worker) { + HWY_DASSERT(task == worker); // one task per worker + workers_[worker].SetNextConfig(next_config); + workers_[worker].SetExit(Exit::kLoop); + }); + + // All have woken and are, or will be, waiting per `next_config`. Now we // can entirely switch the main thread's config for the next wake. 
- config_ = copy; - - ClearBusy(); + workers_[0].SetNextConfig(next_config); } using AutoTuneT = AutoTune<pool::Config, 30>; @@ -1284,21 +1685,21 @@ class alignas(HWY_ALIGNMENT) ThreadPool { return auto_tune_[static_cast<size_t>(wait_mode_) - 1]; } - char cpu100_[100]; - const bool have_timer_stop_; const size_t num_threads_; // not including main thread const Divisor64 div_workers_; + pool::Shared& shared_; pool::Worker* const workers_; // points into `worker_bytes_` - pool::MainAdapter main_adapter_; + alignas(HWY_ALIGNMENT) pool::Stats stats_; - // The only mutable state: - pool::Tasks tasks_; // written by `Run` and read by workers. - pool::Config config_; // for use by the next `Run`. Updated via `SendConfig`. - uint32_t epoch_ = 0; // passed to `MainAdapter`. + // This is written by the main thread and read by workers, via reference + // passed to `ThreadFunc`. Padding ensures that the workers' cache lines are + // not unnecessarily invalidated when the main thread writes other members. + alignas(HWY_ALIGNMENT) pool::Tasks tasks_; + HWY_MEMBER_VAR_MAYBE_UNUSED char + padding_[HWY_ALIGNMENT - sizeof(pool::Tasks)]; - // In debug builds, detects if functions are re-entered. - std::atomic_flag busy_ = ATOMIC_FLAG_INIT; + pool::BusyFlag busy_; // Unmodified after ctor, but cannot be const because we call thread::join(). 
std::vector<std::thread> threads_; diff --git a/third_party/highway/hwy/contrib/thread_pool/thread_pool_test.cc b/third_party/highway/hwy/contrib/thread_pool/thread_pool_test.cc @@ -31,7 +31,6 @@ #include "hwy/base.h" // PopCount #include "hwy/contrib/thread_pool/spin.h" #include "hwy/contrib/thread_pool/topology.h" -#include "hwy/profiler.h" #include "hwy/tests/hwy_gtest.h" #include "hwy/tests/test_util-inl.h" // AdjustedReps @@ -204,64 +203,57 @@ TEST(ThreadPoolTest, TestMultiplePermutations) { class DoWait { public: - DoWait(Worker* worker, uint32_t epoch) : worker_(worker), epoch_(epoch) {} + explicit DoWait(Worker& worker) : worker_(worker) {} - template <class Spin, class Wait, class Barrier> - void operator()(const Spin& spin, const Wait& wait, const Barrier&) const { - wait.UntilWoken(worker_, spin, epoch_); + template <class Spin, class Wait> + void operator()(const Spin& spin, const Wait& wait) const { + wait.UntilWoken(worker_, spin); } private: - Worker* const worker_; - const uint32_t epoch_; + Worker& worker_; }; class DoWakeWorkers { public: - DoWakeWorkers(Worker* workers, uint32_t epoch) - : workers_(workers), epoch_(epoch) {} + explicit DoWakeWorkers(Worker* workers) : workers_(workers) {} - template <class Spin, class Wait, class Barrier> - void operator()(const Spin&, const Wait& wait, const Barrier&) const { - wait.WakeWorkers(workers_, epoch_); + template <class Spin, class Wait> + void operator()(const Spin&, const Wait& wait) const { + wait.WakeWorkers(workers_, workers_[0].WorkerEpoch()); } private: Worker* const workers_; - const uint32_t epoch_; }; // Verifies that waiter(s) can be woken by another thread. TEST(ThreadPoolTest, TestWaiter) { if (!hwy::HaveThreadingSupport()) return; - const uint32_t epoch = 1; - // Not actual threads, but we allocate and loop over this many workers. 
for (size_t num_threads = 1; num_threads < 6; ++num_threads) { const size_t num_workers = 1 + num_threads; auto storage = hwy::AllocateAligned<uint8_t>(num_workers * sizeof(Worker)); HWY_ASSERT(storage); const Divisor64 div_workers(num_workers); + Shared& shared = Shared::Get(); // already calls ReserveWorker(0). for (WaitType wait_type : {WaitType::kBlock, WaitType::kSpin1, WaitType::kSpinSeparate}) { - Worker* workers = - pool::WorkerLifecycle::Init(storage.get(), num_threads, div_workers); + Worker* workers = pool::WorkerLifecycle::Init( + storage.get(), num_threads, PoolWorkerMapping(), div_workers, shared); - alignas(8) const Config config(SpinType::kPause, wait_type, - BarrierType::kGroup4); + alignas(8) const Config config(SpinType::kPause, wait_type); // This thread acts as the "main thread", which will wake the actual main // and all its worker instances. - std::thread thread([&]() { - hwy::Profiler::InitThread(); - CallWithConfig(config, DoWakeWorkers(workers, epoch)); - }); + std::thread thread( + [&]() { CallWithConfig(config, DoWakeWorkers(workers)); }); // main is 0 for (size_t worker = 1; worker < num_workers; ++worker) { - CallWithConfig(config, DoWait(workers + 1, epoch)); + CallWithConfig(config, DoWait(workers[1])); } thread.join(); @@ -277,8 +269,10 @@ TEST(ThreadPoolTest, TestTasks) { auto storage = hwy::AllocateAligned<uint8_t>(num_workers * sizeof(Worker)); HWY_ASSERT(storage); const Divisor64 div_workers(num_workers); - Worker* workers = - WorkerLifecycle::Init(storage.get(), num_threads, div_workers); + Shared& shared = Shared::Get(); + Stats stats; + Worker* workers = WorkerLifecycle::Init( + storage.get(), num_threads, PoolWorkerMapping(), div_workers, shared); constexpr uint64_t kMaxTasks = 20; uint64_t mementos[kMaxTasks]; // non-atomic, no threads involved. 
@@ -300,7 +294,7 @@ TEST(ThreadPoolTest, TestTasks) { Tasks::DivideRangeAmongWorkers(begin, end, div_workers, workers); // The `tasks < workers` special case requires running by all workers. for (size_t worker = 0; worker < num_workers; ++worker) { - tasks.WorkerRun(workers + worker); + tasks.WorkerRun(workers + worker, shared, stats); } // Ensure all tasks were run. @@ -320,8 +314,6 @@ TEST(ThreadPoolTest, TestTasks) { TEST(ThreadPoolTest, TestPool) { if (!hwy::HaveThreadingSupport()) return; - hwy::ThreadPool inner(0); - constexpr uint64_t kMaxTasks = 20; static std::atomic<uint64_t> mementos[kMaxTasks]; static std::atomic<uint64_t> a_begin; @@ -329,7 +321,7 @@ TEST(ThreadPoolTest, TestPool) { static std::atomic<uint64_t> a_num_workers; // Called by pool; sets mementos and runs a nested but serial Run. - const auto func = [&inner](uint64_t task, size_t worker) { + const auto func = [](uint64_t task, size_t worker) { HWY_ASSERT(worker < a_num_workers.load()); const uint64_t begin = a_begin.load(std::memory_order_acquire); const uint64_t end = a_end.load(std::memory_order_acquire); @@ -342,11 +334,14 @@ TEST(ThreadPoolTest, TestPool) { // Store mementos ensure we visited each task. mementos[task - begin].store(1000 + task); - // Re-entering Run is fine on a 0-worker pool. - inner.Run(begin, end, [begin, end](uint64_t task, size_t worker) { - HWY_ASSERT(worker == 0); - HWY_ASSERT(begin <= task && task < end); - }); + // Re-entering Run is fine on a 0-worker pool. Note that this must be + // per-thread so that it gets the `global_idx` it is running on. 
+ hwy::ThreadPool inner(0); + inner.Run(begin, end, + [begin, end](uint64_t inner_task, size_t inner_worker) { + HWY_ASSERT(inner_worker == 0); + HWY_ASSERT(begin <= inner_task && inner_task < end); + }); }; for (size_t num_threads = 0; num_threads <= 6; num_threads += 3) { @@ -451,7 +446,7 @@ TEST(ThreadPoolTest, TestWaitMode) { ThreadPool pool(9); RandomState rng; for (size_t i = 0; i < 100; ++i) { - pool.SetWaitMode(Random32(&rng) ? PoolWaitMode::kSpin + pool.SetWaitMode((Random32(&rng) & 1u) ? PoolWaitMode::kSpin : PoolWaitMode::kBlock); } } diff --git a/third_party/highway/hwy/contrib/thread_pool/topology.cc b/third_party/highway/hwy/contrib/thread_pool/topology.cc @@ -15,6 +15,7 @@ #include "hwy/contrib/thread_pool/topology.h" +#include <ctype.h> // isspace #include <stddef.h> #include <stdint.h> #include <stdio.h> @@ -113,7 +114,9 @@ bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { } HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER); // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding. - uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes)); + // `calloc` zero-initializes the `Reserved` field, part of which has been + // repurposed into `GroupCount` in SDKs, 10.0.22000.0 or possibly earlier. + uint8_t* buf = static_cast<uint8_t*>(calloc(1, buf_bytes)); HWY_ASSERT(buf); // Fill the buffer. @@ -560,6 +563,9 @@ std::vector<size_t> ExpandList(const char* list, size_t list_end, constexpr size_t kNotFound = ~size_t{0}; size_t pos = 0; + // Gracefully handle empty lists, happens on GH200 systems (#2668). + if (isspace(list[0]) && list_end <= 2) return expanded; + // Returns first `found_pos >= pos` where `list[found_pos] == c`, or // `kNotFound`. const auto find = [list, list_end, &pos](char c) -> size_t { @@ -654,6 +660,27 @@ void SetClusterCacheSizes(std::vector<Topology::Package>& packages) { #elif HWY_OS_WIN +// See #2734. 
GroupCount was added around Windows 10, but SDK docs do not +// mention the actual version required. It is known to be absent in 8.1 and +// MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also +// know about the field. Thus we zero-initialize the reserved field, assume it +// remains zero, and return 1 if zero (old style single GroupMask), otherwise +// the number of groups. There are two such structures, but note that +// `PROCESSOR_RELATIONSHIP` already had this field. +static size_t GroupCount(const CACHE_RELATIONSHIP& cr) { + // Added as the last u16 in the reserved area before GroupMask. We only read + // one byte because 256*64 processor bits are plenty. + const uint8_t* pcount = + reinterpret_cast<const uint8_t*>(&cr.GroupMask) - sizeof(uint16_t); + return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1); +} + +static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) { + const uint8_t* pcount = + reinterpret_cast<const uint8_t*>(&nn.GroupMask) - sizeof(uint16_t); + return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1); +} + // Also sets LP.core and LP.smt. size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) { size_t max_lps_per_core = 0; @@ -707,7 +734,7 @@ size_t MaxCoresPerCluster(const size_t max_lps_per_core, const CACHE_RELATIONSHIP& cr = info.Cache; if (cr.Type != CacheUnified && cr.Type != CacheData) return; if (cr.Level != 3) return; - foreach_cluster(cr.GroupCount, cr.GroupMasks); + foreach_cluster(GroupCount(cr), cr.GroupMasks); }; if (!ForEachSLPI(RelationProcessorDie, foreach_die)) { @@ -764,7 +791,7 @@ void SetNodes(std::vector<Topology::LP>& lps) { if (info.Relationship != RelationNumaNode) return; const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode; // This field was previously reserved/zero. There is at least one group. 
- const size_t num_groups = HWY_MAX(1, nn.GroupCount); + const size_t num_groups = HWY_MAX(1, GroupCount(nn)); const uint8_t node = static_cast<uint8_t>(nn.NodeNumber); ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__, [node](size_t lp, std::vector<Topology::LP>& lps) { @@ -991,7 +1018,7 @@ bool InitCachesSysfs(Caches& caches) { // and their properties. It's OK to return false; callers are responsible for // assuming reasonable defaults. #ifndef __ANDROID__ - HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib, + HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib, caches[2].size_kib, errno); #endif return false; @@ -1023,7 +1050,7 @@ bool InitCachesWin(Caches& caches) { : cr.Associativity; // How many cores share this cache? - size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks); + size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks); // Divide out hyperthreads. This core may have fewer than // `max_lps_per_core`, hence round up. shared_with = DivCeil(shared_with, max_lps_per_core); diff --git a/third_party/highway/hwy/detect_compiler_arch.h b/third_party/highway/hwy/detect_compiler_arch.h @@ -20,6 +20,11 @@ // inclusion by foreach_target.h. // Add to #if conditions to prevent IDE from graying out code. +// Note for clangd users: There is no predefined macro in clangd, so you must +// manually add these two lines (without the preceding '// ') to your project's +// `.clangd` file: +// CompileFlags: +// Add: [-D__CLANGD__] #if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \ (defined Q_CREATOR_RUN) || (defined __CLANGD__) || \ (defined GROK_ELLIPSIS_BUILD) @@ -65,15 +70,23 @@ #define HWY_COMPILER_GCC 0 #endif -// Clang or clang-cl, not GCC. -#ifdef __clang__ +#ifndef HWY_COMPILER_CLANG // Allow user override. +#ifdef __clang__ // Clang or clang-cl, not GCC. // In case of Apple LLVM (whose version number is unrelated to that of LLVM) or // an invalid version number, deduce it from the presence of warnings. 
// Originally based on // https://github.com/simd-everywhere/simde/blob/47d6e603de9d04ee05cdfbc57cf282a02be1bf2a/simde/simde-detect-clang.h#L59. // Please send updates below to them as well, thanks! #if defined(__apple_build_version__) || __clang_major__ >= 999 -#if __has_warning("-Woverriding-option") +#if __has_builtin(__builtin_elementwise_fshl) +#define HWY_COMPILER_CLANG 2201 +#elif __has_builtin(__builtin_structured_binding_size) +#define HWY_COMPILER_CLANG 2101 +#elif __has_builtin(__builtin_common_type) +#define HWY_COMPILER_CLANG 2001 +#elif __has_warning("-Wreturn-mismatch") +#define HWY_COMPILER_CLANG 1901 +#elif __has_warning("-Woverriding-option") #define HWY_COMPILER_CLANG 1801 // No new warnings in 17.0, and Apple LLVM 15.3, which should be 1600, already // has the unsafe_buffer_usage attribute, so we instead check for new builtins. @@ -108,7 +121,6 @@ #else // Anything older than 7.0 is not recommended for Highway. #define HWY_COMPILER_CLANG 600 #endif // __has_warning chain -#define HWY_COMPILER3_CLANG (HWY_COMPILER_CLANG * 100) #else // use normal version #define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) #define HWY_COMPILER3_CLANG \ @@ -117,6 +129,12 @@ #else // Not clang #define HWY_COMPILER_CLANG 0 #define HWY_COMPILER3_CLANG 0 +#endif // __clang__ +#endif // HWY_COMPILER_CLANG + +// User-defined or deduced HWY_COMPILER_CLANG: derive HWY_COMPILER3_CLANG. +#ifndef HWY_COMPILER3_CLANG +#define HWY_COMPILER3_CLANG (HWY_COMPILER_CLANG * 100) #endif #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC && \ diff --git a/third_party/highway/hwy/detect_targets.h b/third_party/highway/hwy/detect_targets.h @@ -92,7 +92,7 @@ #define HWY_SVE2 (1LL << 23) #define HWY_SVE (1LL << 24) // Bit 25 reserved for NEON -#define HWY_NEON_BF16 (1LL << 26) // fp16/dot/bf16 (e.g. Neoverse V2/N2/N3) +#define HWY_NEON_BF16 (1LL << 26) // fp16/dot/bf16/i8mm (e.g. 
Neoverse V2/N2) // Bit 27 reserved for NEON #define HWY_NEON (1LL << 28) // Implies support for AES #define HWY_NEON_WITHOUT_AES (1LL << 29) @@ -194,6 +194,17 @@ #endif #endif // HWY_BROKEN_MSVC +#ifndef HWY_BROKEN_AVX10_2 // allow override +// AVX10_2 requires clang >= 20.1 (postpone to 23 due to "avx10.2-512" remnant, +// only removed in https://github.com/llvm/llvm-project/pull/157034) or +// gcc >= 15.2 with binutils 2.44. +#if (HWY_COMPILER_CLANG < 2300) && (HWY_COMPILER_GCC_ACTUAL < 1502) +#define HWY_BROKEN_AVX10_2 HWY_AVX10_2 +#else +#define HWY_BROKEN_AVX10_2 0 +#endif +#endif // HWY_BROKEN_AVX10_2 + #ifndef HWY_BROKEN_AVX3_DL_ZEN4 // allow override // AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC // 2021. @@ -245,9 +256,10 @@ #endif // HWY_BROKEN_ARM7_WITHOUT_VFP4 #ifndef HWY_BROKEN_NEON_BF16 // allow override -// HWY_NEON_BF16 requires recent compilers. +// Broken on older compilers: #if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1700) || \ - (HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302) + (HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302) || \ + (defined(__apple_build_version__) && __apple_build_version__ <= 17000000) #define HWY_BROKEN_NEON_BF16 (HWY_NEON_BF16) #else #define HWY_BROKEN_NEON_BF16 0 @@ -257,9 +269,9 @@ // SVE[2] require recent clang or gcc versions. #ifndef HWY_BROKEN_SVE // allow override -// GCC 10+. Clang 19 still has many test failures for SVE. No Apple CPU (at -// least up to and including M4 and A18) has SVE. -#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2000) || \ +// GCC 10+. Clang 22 still has test failures for SVE, including MSAN. No Apple +// CPU (at least up to and including M4 and A18) has SVE. 
+#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \ HWY_OS_APPLE #define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE_256) @@ -269,8 +281,8 @@ #endif // HWY_BROKEN_SVE #ifndef HWY_BROKEN_SVE2 // allow override -// Clang 19 still has many test failures for SVE2. -#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2000) || \ +// Clang 21 still has test failures for SVE2, including MSAN. +#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \ HWY_OS_APPLE #define HWY_BROKEN_SVE2 (HWY_SVE2 | HWY_SVE2_128) @@ -332,9 +344,12 @@ #ifndef HWY_BROKEN_LOONGARCH // allow override // Using __loongarch_sx and __loongarch_asx macros to // check whether LSX/LASX targets are available. -#if !defined(__loongarch_sx) +// GCC does not work yet, see https://gcc.gnu.org/PR121875. +#if !defined(__loongarch_sx) && \ + !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800) #define HWY_BROKEN_LOONGARCH (HWY_LSX | HWY_LASX) -#elif !defined(__loongarch_asx) +#elif !defined(__loongarch_asx) && \ + !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800) #define HWY_BROKEN_LOONGARCH (HWY_LASX) #else #define HWY_BROKEN_LOONGARCH 0 @@ -360,12 +375,12 @@ // Allow the user to override this without any guarantee of success. 
#ifndef HWY_BROKEN_TARGETS -#define HWY_BROKEN_TARGETS \ - (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \ - HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ - HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ - HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_SVE2 | \ - HWY_BROKEN_PPC10 | HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | \ +#define HWY_BROKEN_TARGETS \ + (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \ + HWY_BROKEN_AVX10_2 | HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ + HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ + HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_SVE2 | \ + HWY_BROKEN_PPC10 | HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | \ HWY_BROKEN_LOONGARCH | HWY_BROKEN_Z14) #endif // HWY_BROKEN_TARGETS @@ -489,7 +504,8 @@ #if defined(__ARM_FEATURE_AES) && \ defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && \ defined(__ARM_FEATURE_DOTPROD) && \ - defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) + defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \ + defined(__ARM_FEATURE_MATMUL_INT8) #define HWY_BASELINE_NEON HWY_ALL_NEON #elif defined(__ARM_FEATURE_AES) #define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON) @@ -642,8 +658,7 @@ #define HWY_BASELINE_AVX3_SPR 0 #endif -#if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2__) && \ - (HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2001) +#if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2__) #define HWY_BASELINE_AVX10_2 HWY_AVX10_2 #else #define HWY_BASELINE_AVX10_2 0 @@ -669,7 +684,20 @@ #define HWY_BASELINE_LOONGARCH 0 #endif -// Allow the user to override this without any guarantee of success. +// Workaround for libaom, which unconditionally defines HWY_BASELINE_TARGETS +// even when that would be disabled/broken. If so, at least use AVX2. 
+#if defined(HWY_BASELINE_TARGETS)
+#if HWY_BASELINE_TARGETS == HWY_AVX3_DL && \
+    ((HWY_BROKEN_TARGETS | HWY_DISABLED_TARGETS) & HWY_AVX3_DL)
+#undef HWY_BASELINE_TARGETS
+#define HWY_BASELINE_TARGETS HWY_AVX2
+#endif
+#endif  // HWY_BASELINE_TARGETS
+
+// Allow the user to override this without any guarantee of success. If the
+// compiler invocation considers that target to be broken/disabled, then
+// `HWY_ENABLED_BASELINE` will be 0 and users will have to check for that and
+// skip their code.
 #ifndef HWY_BASELINE_TARGETS
 #define HWY_BASELINE_TARGETS \
   (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
@@ -686,7 +714,11 @@
 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
 #if HWY_ENABLED_BASELINE == 0
-#error "At least one baseline target must be defined and enabled"
+#pragma message \
+    "All baseline targets are disabled or considered broken. " \
+    "This is typically due to very restrictive HWY_BASELINE_TARGETS, or " \
+    "too expansive HWY_BROKEN_TARGETS or HWY_DISABLED_TARGETS. User code " \
+    "must also check for this and skip any usage of SIMD."
 #endif
 
 // Best baseline, used for static dispatch. This is the least-significant 1-bit
@@ -766,8 +798,8 @@
 #endif  // HWY_HAVE_RUNTIME_DISPATCH_APPLE
 
 #ifndef HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH  // allow override
-#if HWY_ARCH_LOONGARCH && HWY_HAVE_AUXV && (defined(__loongarch_sx) || \
-  defined(__loongarch_asx))
+#if HWY_ARCH_LOONGARCH && HWY_HAVE_AUXV && !defined(__loongarch_asx) && \
+    HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800
 #define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 1
 #else
 #define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 0
 #endif
@@ -930,7 +962,7 @@
 // HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
 // one of the dynamic targets. This also implies HWY_TARGETS != 0 and
 // (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
-#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0 +#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0 && HWY_ENABLED_BASELINE != 0 #error "Logic error: best baseline should be included in dynamic targets" #endif diff --git a/third_party/highway/hwy/highway.h b/third_party/highway/hwy/highway.h @@ -677,6 +677,19 @@ struct AddExport { #define HWY_HIGHWAY_PER_TARGET #endif +// No SIMD target enabled, skip header inclusion. +#if HWY_ENABLED_BASELINE == 0 + +// We would expect that HWY_TARGET and HWY_STATIC_TARGET are now both 0. +#if HWY_TARGET != 0 +#error "Why is HWY_TARGET not 0 when HWY_ENABLED_BASELINE == 0?" +#endif +#if HWY_STATIC_TARGET != 0 +#error "Why is HWY_STATIC_TARGET not 0 when HWY_ENABLED_BASELINE == 0?" +#endif + +#else + // These define ops inside namespace hwy::HWY_NAMESPACE. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 #include "hwy/ops/x86_128-inl.h" @@ -713,4 +726,6 @@ struct AddExport { #include "hwy/ops/generic_ops-inl.h" +#endif // HWY_ENABLED_BASELINE + #endif // HWY_HIGHWAY_PER_TARGET diff --git a/third_party/highway/hwy/highway_test.cc b/third_party/highway/hwy/highway_test.cc @@ -548,7 +548,7 @@ struct TestBlocks { }; HWY_NOINLINE void TestAllBlocks() { - ForAllTypes(ForPartialVectors<TestDFromV>()); + ForAllTypes(ForPartialVectors<TestBlocks>()); } struct TestBlockDFromD { diff --git a/third_party/highway/hwy/nanobenchmark.h b/third_party/highway/hwy/nanobenchmark.h @@ -132,8 +132,8 @@ HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg, // Calls operator() of the given closure (lambda function). 
template <class Closure> -static FuncOutput CallClosure(const Closure* f, const FuncInput input) { - return (*f)(input); +static FuncOutput CallClosure(const void* f, const FuncInput input) { + return (*reinterpret_cast<const Closure*>(f))(input); } // Same as Measure, except "closure" is typically a lambda function of @@ -143,7 +143,7 @@ static inline size_t MeasureClosure(const Closure& closure, const FuncInput* inputs, const size_t num_inputs, Result* results, const Params& p = Params()) { - return Measure(reinterpret_cast<Func>(&CallClosure<Closure>), + return Measure(static_cast<Func>(&CallClosure<Closure>), reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs, results, p); } diff --git a/third_party/highway/hwy/ops/arm_neon-inl.h b/third_party/highway/hwy/ops/arm_neon-inl.h @@ -7662,22 +7662,18 @@ HWY_API VFromD<DU32> SumOfMulQuadAccumulate( #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #endif -template <class DI32, HWY_IF_I32_D(DI32)> +template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)> HWY_API VFromD<DI32> SumOfMulQuadAccumulate( - DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u, + DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { - // TODO: use vusdot[q]_s32 on NEON targets that require support for NEON I8MM - - const RebindToUnsigned<decltype(di32)> du32; - const Repartition<uint8_t, decltype(di32)> du8; - - const auto b_u = BitCast(du8, b_i); - const auto result_sum0 = - SumOfMulQuadAccumulate(du32, a_u, b_u, BitCast(du32, sum)); - const auto result_sum1 = ShiftLeft<8>( - SumOfMulQuadAccumulate(du32, a_u, ShiftRight<7>(b_u), Zero(du32))); + return VFromD<DI32>(vusdot_s32(sum.raw, a_u.raw, b_i.raw)); +} - return BitCast(di32, Sub(result_sum0, result_sum1)); +template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)> +HWY_API VFromD<DI32> SumOfMulQuadAccumulate( + DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, + VFromD<Repartition<int8_t, 
DI32>> b_i, VFromD<DI32> sum) { + return VFromD<DI32>(vusdotq_s32(sum.raw, a_u.raw, b_i.raw)); } #endif // HWY_TARGET == HWY_NEON_BF16 diff --git a/third_party/highway/hwy/ops/arm_sve-inl.h b/third_party/highway/hwy/ops/arm_sve-inl.h @@ -1,4 +1,5 @@ // Copyright 2021 Google LLC +// Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -389,9 +390,18 @@ HWY_API svbool_t PFalse() { return svpfalse_b(); } // // This is used in functions that load/store memory; other functions (e.g. // arithmetic) can ignore d and use PTrue instead. +// +// Always use FirstN(N) for HWY_TARGET == HWY_SVE2_128 to avoid vector length +// information loss when using PTrue(d) predicates in memory intrinsics. +// +// SVE2_256 is untested due to unavailable hardware and cannot assume +// equal minimum and maximum vector lengths as SVE2_128 can. template <class D> svbool_t MakeMask(D d) { - return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d)); +#if HWY_TARGET != HWY_SVE2_128 + HWY_IF_CONSTEXPR(IsFull(d)) { return PTrue(d); } +#endif + return FirstN(d, Lanes(d)); } } // namespace detail @@ -407,6 +417,20 @@ HWY_API svbool_t MaskFalse(const D /*d*/) { return detail::PFalse(); } +#ifdef HWY_NATIVE_SET_MASK +#undef HWY_NATIVE_SET_MASK +#else +#define HWY_NATIVE_SET_MASK +#endif + +template <class D> +HWY_API svbool_t SetMask(D d, bool val) { + // The SVE svdup_n_b* intrinsics are equivalent to the FirstN op below if + // detail::IsFull(d) is true since svdup_n_b* is simply a wrapper around the + // SVE whilelo instruction. + return FirstN(d, size_t{0} - static_cast<size_t>(val)); +} + // ================================================== INIT // ------------------------------ Set @@ -5304,7 +5328,7 @@ HWY_API V AverageRound(const V a, const V b) { // `p` points to at least 8 readable bytes, not all of which need be valid. 
template <class D, HWY_IF_T_SIZE_D(D, 1)> HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { -#if HWY_COMPILER_CLANG >= 1901 || HWY_COMPILER_GCC_ACTUAL >= 1200 +#if HWY_COMPILER_CLANG >= 2200 || HWY_COMPILER_GCC_ACTUAL >= 1200 typedef svbool_t UnalignedSveMaskT __attribute__((__aligned__(1), __may_alias__)); (void)d; @@ -6531,9 +6555,10 @@ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(DU32 /*du32*/, svuint8_t a, template <class DI32, HWY_IF_I32_D(DI32)> HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, svuint8_t a_u, svint8_t b_i, svint32_t sum) { - // TODO: use svusdot_u32 on SVE targets that require support for both SVE2 - // and SVE I8MM. - +#if HWY_SVE_HAVE_2 + (void)di32; + return svusdot_s32(sum, a_u, b_i); +#else const RebindToUnsigned<decltype(di32)> du32; const Repartition<uint8_t, decltype(di32)> du8; @@ -6543,6 +6568,7 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, svuint8_t a_u, ShiftLeft<8>(svdot_u32(Zero(du32), a_u, ShiftRight<7>(b_u))); return BitCast(di32, Sub(result_sum0, result_sum1)); +#endif } #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE diff --git a/third_party/highway/hwy/ops/generic_ops-inl.h b/third_party/highway/hwy/ops/generic_ops-inl.h @@ -245,6 +245,22 @@ HWY_API Mask<D> MaskFalse(D d) { #endif // HWY_NATIVE_MASK_FALSE +// ------------------------------ SetMask +#if (defined(HWY_NATIVE_SET_MASK) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_SET_MASK +#undef HWY_NATIVE_SET_MASK +#else +#define HWY_NATIVE_SET_MASK +#endif + +template <class D> +HWY_API Mask<D> SetMask(D d, bool val) { + const Repartition<int32_t, decltype(d)> di32; + return MaskFromVec(ResizeBitCast(d, Set(di32, -static_cast<int32_t>(val)))); +} + +#endif // HWY_NATIVE_SET_MASK + // ------------------------------ IfNegativeThenElseZero #if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO @@ -4525,43 +4541,46 @@ HWY_API V CLMulUpper(V a, V b) { #define 
HWY_NATIVE_POPCNT #endif -// This overload requires vectors to be at least 16 bytes, which is the case -// for LMUL >= 2. -#undef HWY_IF_POPCNT -#if HWY_TARGET == HWY_RVV -#define HWY_IF_POPCNT(D) \ - hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr -#else -// Other targets only have these two overloads which are mutually exclusive, so -// no further conditions are required. -#define HWY_IF_POPCNT(D) void* = nullptr -#endif // HWY_TARGET == HWY_RVV - -template <class V, class D = DFromV<V>, HWY_IF_U8_D(D), - HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> +template <class V, class D = DFromV<V>, HWY_IF_U8_D(D)> HWY_API V PopulationCount(V v) { const D d; - const V lookup = - Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); - const auto lo = And(v, Set(d, uint8_t{0xF})); - const auto hi = ShiftRight<4>(v); - return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); -} -// RVV has a specialization that avoids the Set(). -#if HWY_TARGET != HWY_RVV -// Slower fallback for capped vectors. 
-template <class V, class D = DFromV<V>, HWY_IF_U8_D(D), - HWY_IF_V_SIZE_LE_D(D, 8)> -HWY_API V PopulationCount(V v) { - const D d; +#if HWY_TARGET == HWY_SSE2 + // TableLookupBytes is slow on SSE2 + // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 const V k33 = Set(d, uint8_t{0x33}); v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); +#else // HWY_TARGET != HWY_SSE2 + +#if HWY_TARGET == HWY_RVV + // Need at least LMUL=1 on RVV to ensure that Lanes(d_tbl) is at least 16 + const ScalableTag<uint8_t, HWY_MAX(HWY_POW2_D(D), 0)> d_tbl; +#else + const FixedTag<uint8_t, HWY_MAX(HWY_MAX_LANES_D(D), 16)> d_tbl; +#endif + + const auto lookup = Dup128VecFromValues(d_tbl, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, + 2, 3, 2, 3, 3, 4); + const auto lo = And(v, Set(d, uint8_t{0xF})); + const auto hi = ShiftRight<4>(v); + +#if HWY_TARGET == HWY_RVV + // On RVV, use TableLookupLanes to avoid unnecessary overhead + const auto hi_popcnt = + ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, hi))); + const auto lo_popcnt = + ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, lo))); +#else // HWY_TARGET != HWY_RVV + const auto hi_popcnt = TableLookupBytes(lookup, hi); + const auto lo_popcnt = TableLookupBytes(lookup, lo); +#endif // HWY_TARGET == HWY_RVV + + return Add(hi_popcnt, lo_popcnt); +#endif // HWY_TARGET == HWY_SSE2 } -#endif // HWY_TARGET != HWY_RVV template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)> HWY_API V PopulationCount(V v) { @@ -5474,17 +5493,15 @@ HWY_API V RoundingShiftRight(V v) { template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> HWY_API V RoundingShiftRightSame(V v, int shift_amt) { const DFromV<V> d; - using T = TFromD<decltype(d)>; - - const int shift_amt_is_zero_mask = -static_cast<int>(shift_amt == 0); + const bool shift_amt_is_zero = (shift_amt == 0); const auto scaled_down_v = ShiftRightSame( v, 
static_cast<int>(static_cast<unsigned>(shift_amt) + - static_cast<unsigned>(~shift_amt_is_zero_mask))); + static_cast<unsigned>(shift_amt_is_zero) - 1u)); return AverageRound( scaled_down_v, - And(scaled_down_v, Set(d, static_cast<T>(shift_amt_is_zero_mask)))); + IfThenElseZero(SetMask(d, shift_amt_is_zero), scaled_down_v)); } template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> diff --git a/third_party/highway/hwy/ops/loongarch_lasx-inl.h b/third_party/highway/hwy/ops/loongarch_lasx-inl.h @@ -15,11 +15,28 @@ // 256-bit LASX vectors and operations. // External include guard in highway.h - see comment there. -#include <lasxintrin.h> - #include "hwy/ops/loongarch_lsx-inl.h" #include "hwy/ops/shared-inl.h" +#ifndef __loongarch_asx +// If LASX is to be runtime dispatched (instead of in baseline), we need +// to enable it *and* define __loongarch_asx or the intrinsic header will +// fail to compile. +// +// For consistency, the same pattern as the lsxintrin.h handling in +// loongarch_lsx-inl.h is used (instead of moving lasxintrin.h after +// HWY_BEFORE_NAMESPACE). +HWY_PUSH_ATTRIBUTES("lsx,lasx") +#define __loongarch_asx +#include <lasxintrin.h> +#undef __loongarch_asx +// Prevent "unused push_attribute" warning from Clang. +HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lasx_dummy, __COUNTER__) () {} +HWY_POP_ATTRIBUTES +#else +#include <lasxintrin.h> +#endif + HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { diff --git a/third_party/highway/hwy/ops/loongarch_lsx-inl.h b/third_party/highway/hwy/ops/loongarch_lsx-inl.h @@ -13,9 +13,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <lsxintrin.h> #include <stdio.h> +#ifndef __loongarch_sx +// If LSX is to be runtime dispatched (instead of in baseline), we need +// to enable it *and* define __loongarch_sx or the intrinsic header will +// fail to compile. 
+// +// We cannot simply move lsxintrin.h after HWY_BEFORE_NAMESPACE because +// doing so may cause the first (the only effective) inclusion of +// lsxintrin.h to be compiled with both LSX and LASX enabled. Then when +// we call the inline functions in the header with only LSX enabled, +// we'll get an "always_inline function requires lasx but would be inlined +// into a function that is compiled without suport for lasx" error. +HWY_PUSH_ATTRIBUTES("lsx") +#define __loongarch_sx +#include <lsxintrin.h> +#undef __loongarch_sx +// Prevent "unused push_attribute" warning from Clang. +HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lsx_dummy, __COUNTER__) () {} +HWY_POP_ATTRIBUTES +#else +#include <lsxintrin.h> +#endif + #include "hwy/base.h" #include "hwy/ops/shared-inl.h" diff --git a/third_party/highway/hwy/ops/ppc_vsx-inl.h b/third_party/highway/hwy/ops/ppc_vsx-inl.h @@ -560,6 +560,19 @@ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) { return Xor(a, b); } +// ------------------------------ PopulationCount + +#ifdef HWY_NATIVE_POPCNT +#undef HWY_NATIVE_POPCNT +#else +#define HWY_NATIVE_POPCNT +#endif + +template <typename T, size_t N, HWY_IF_UNSIGNED(T)> +HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { + return Vec128<T, N>{vec_popcnt(v.raw)}; +} + // ================================================== SIGN // ------------------------------ Neg diff --git a/third_party/highway/hwy/ops/rvv-inl.h b/third_party/highway/hwy/ops/rvv-inl.h @@ -5266,18 +5266,6 @@ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}}))); } -// ------------------------------ PopulationCount (ShiftRight) - -// Handles LMUL < 2 or capped vectors, which generic_ops-inl cannot. 
-template <typename V, class D = DFromV<V>, HWY_IF_U8_D(D), - hwy::EnableIf<D().Pow2() < 1 || D().MaxLanes() < 16>* = nullptr> -HWY_API V PopulationCount(V v) { - // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 - v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55)); - v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33)); - return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F); -} - // ------------------------------ LoadDup128 template <class D> @@ -5565,6 +5553,15 @@ constexpr int SufficientPow2ForMask() { return HWY_MAX( D().Pow2() - 3 - static_cast<int>(FloorLog2(sizeof(TFromD<D>))), -3); } + +template <class M> +static HWY_INLINE HWY_MAYBE_UNUSED M RvvVmmv(M mask) { + // The below And operation is equivalent to the RVV vmmv instruction and + // ensures that mask is not in the same register as a vector operand when used + // in RVV instructions that take both a vector operand and a mask operand. + return And(mask, mask); +} + } // namespace detail template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)> @@ -5573,8 +5570,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>(); #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 - return detail::U8MaskBitsVecToMask( - d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits))); + const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( + d, detail::ChangeLMUL(ScalableTag<uint8_t>(), + Set(du8, static_cast<uint8_t>(mask_bits))))); #else const RebindToUnsigned<decltype(d)> du8; const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>> @@ -5594,10 +5593,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; const ScalableTag<uint16_t, detail::SufficientPow2ForMask<D>()> du16; // There are exactly 16 mask bits for 128 vector bits of 8-bit 
lanes. - return detail::U8MaskBitsVecToMask( + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( d, detail::ChangeLMUL( ScalableTag<uint8_t>(), - BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))))); + BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)))))); #else // Slow fallback for completeness; the above bits to mask cast is preferred. const RebindToUnsigned<decltype(d)> du8; @@ -5626,9 +5625,9 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes. - return detail::U8MaskBitsVecToMask( + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( d, detail::ChangeLMUL(ScalableTag<uint8_t>(), - Set(du8, static_cast<uint8_t>(mask_bits)))); + Set(du8, static_cast<uint8_t>(mask_bits))))); #else // Slow fallback for completeness; the above bits to mask cast is preferred. const RebindToUnsigned<D> du; @@ -5645,9 +5644,9 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; - return detail::U8MaskBitsVecToMask( + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( d, detail::ChangeLMUL(ScalableTag<uint8_t>(), - Set(du8, static_cast<uint8_t>(mask_bits * 0x11)))); + Set(du8, static_cast<uint8_t>(mask_bits * 0x11))))); #else // Slow fallback for completeness; the above bits to mask cast is preferred. 
const RebindToUnsigned<D> du; @@ -5663,9 +5662,9 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; - return detail::U8MaskBitsVecToMask( + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( d, detail::ChangeLMUL(ScalableTag<uint8_t>(), - Set(du8, static_cast<uint8_t>(mask_bits * 0x55)))); + Set(du8, static_cast<uint8_t>(mask_bits * 0x55))))); #else // Slow fallback for completeness; the above bits to mask cast is preferred. const RebindToUnsigned<D> du; @@ -5674,6 +5673,27 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { #endif } +// ------------------------------ SetMask + +#ifdef HWY_NATIVE_SET_MASK +#undef HWY_NATIVE_SET_MASK +#else +#define HWY_NATIVE_SET_MASK +#endif + +template <class D> +HWY_API MFromD<D> SetMask(D d, bool val) { + const uint8_t u8_mask_val = static_cast<uint8_t>(-static_cast<int>(val)); +#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 + const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; + return detail::RvvVmmv(detail::U8MaskBitsVecToMask( + d, detail::ChangeLMUL(ScalableTag<uint8_t>(), Set(du8, u8_mask_val)))); +#else + const Rebind<uint8_t, DFromV<VFromD<decltype(d)>>> du8; + return MaskFromVec(Set(du8, u8_mask_val)); +#endif +} + // ------------------------------ Abs (Max, Neg) template <class V, HWY_IF_SIGNED_V(V)> @@ -5947,7 +5967,7 @@ HWY_API V64 BitShuffle(V64 values, VI idx) { template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)), class D = DFromV<V>, class DW = RepartitionToWide<D>> HWY_API VFromD<DW> MulEven(const V a, const V b) { - constexpr int maskVal = sizeof(TFromD<D>) == 4 ? 5 + constexpr int maskVal = sizeof(TFromD<D>) == 4 ? 5 : sizeof(TFromD<D>) == 2 ? 
0x55 : 0x5555; const auto mask = Dup128MaskFromMaskBits(D(), maskVal); diff --git a/third_party/highway/hwy/ops/scalar-inl.h b/third_party/highway/hwy/ops/scalar-inl.h @@ -310,6 +310,17 @@ HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) { return Mask1<T>::FromBool(n != 0); } +#ifdef HWY_NATIVE_SET_MASK +#undef HWY_NATIVE_SET_MASK +#else +#define HWY_NATIVE_SET_MASK +#endif + +template <class D> +HWY_API MFromD<D> SetMask(D /*d*/, bool val) { + return MFromD<D>::FromBool(val); +} + // ------------------------------ IfVecThenElse template <typename T> HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) { diff --git a/third_party/highway/hwy/ops/set_macros-inl.h b/third_party/highway/hwy/ops/set_macros-inl.h @@ -44,6 +44,7 @@ #undef HWY_MEM_OPS_MIGHT_FAULT #undef HWY_NATIVE_FMA #undef HWY_NATIVE_DOT_BF16 +#undef HWY_NATIVE_MASK #undef HWY_CAP_GE256 #undef HWY_CAP_GE512 @@ -141,57 +142,65 @@ #define HWY_TARGET_STR_AVX2 \ HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C -#if (HWY_COMPILER_GCC_ACTUAL >= 1400 && HWY_COMPILER_GCC_ACTUAL < 1600) || \ - HWY_COMPILER_CLANG >= 1800 +#ifndef HWY_HAVE_EVEX512 // allow override +// evex512 has been removed from clang 22, see +// https://github.com/llvm/llvm-project/pull/157034 +#if (1400 <= HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \ + (1800 <= HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2200) +#define HWY_HAVE_EVEX512 1 +#else +#define HWY_HAVE_EVEX512 0 +#endif +#endif + +#if (HWY_HAVE_EVEX512 == 1) #define HWY_TARGET_STR_AVX3_VL512 ",evex512" #else #define HWY_TARGET_STR_AVX3_VL512 #endif -#define HWY_TARGET_STR_AVX3_256 \ - HWY_TARGET_STR_AVX2 \ +#define HWY_TARGET_STR_AVX3 \ + HWY_TARGET_STR_AVX2 \ ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512 -#define HWY_TARGET_STR_AVX3 HWY_TARGET_STR_AVX3_256 HWY_TARGET_STR_AVX3_VL512 - -#define HWY_TARGET_STR_AVX3_DL_256 \ - HWY_TARGET_STR_AVX3_256 \ +#define HWY_TARGET_STR_AVX3_DL \ + 
HWY_TARGET_STR_AVX3 \ ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \ "avx512vpopcntdq,gfni" -#define HWY_TARGET_STR_AVX3_DL \ - HWY_TARGET_STR_AVX3_DL_256 HWY_TARGET_STR_AVX3_VL512 - -// Force-disable for compilers that do not properly support avx512bf16. -#if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \ +// Opt-out for compilers that do not properly support avx512bf16. +#ifndef HWY_AVX3_ENABLE_AVX512BF16 // allow override +// Default is to disable if the DISABLE macro is defined, or if old compiler. +// clang-cl 21.1.4 reportedly works; feel free to define this to 1 there. +#if defined(HWY_AVX3_DISABLE_AVX512BF16) || \ (HWY_COMPILER_CLANGCL || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900)) -#define HWY_AVX3_DISABLE_AVX512BF16 +#define HWY_AVX3_ENABLE_AVX512BF16 0 +#else +#define HWY_AVX3_ENABLE_AVX512BF16 1 #endif +#endif // HWY_AVX3_ENABLE_AVX512BF16 -#if !defined(HWY_AVX3_DISABLE_AVX512BF16) -#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL ",avx512bf16" +#if HWY_AVX3_ENABLE_AVX512BF16 +#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16" #else -#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL +#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL #endif -#define HWY_TARGET_STR_AVX3_ZEN4 \ - HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_VL512 - #if HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1400 -#define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256 ",avx512fp16" +#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16" #else -#define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256 +#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 #endif -#define HWY_TARGET_STR_AVX3_SPR \ - HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_VL512 - -#if HWY_COMPILER_GCC_ACTUAL >= 1500 -#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2" -#elif HWY_COMPILER_CLANG >= 2000 +// Support 
for avx10.2-512 was removed between clang 22 and 23 without a +// feature test macro. +#if HWY_COMPILER_CLANG >= 2200 && HWY_HAVE_EVEX512 #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2-512" +// Recent compilers drop the -512 suffix because 512 bits are always available. +#elif HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2200 +#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2" #else #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR #endif @@ -243,6 +252,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 // a few actually are #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -263,6 +273,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 // a few actually are #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -284,6 +295,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 // a few actually are #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -310,6 +322,7 @@ #define HWY_NATIVE_FMA 1 #endif #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 // a few actually are #define HWY_CAP_GE256 1 #define HWY_CAP_GE512 0 @@ -317,10 +330,8 @@ #define HWY_TARGET_STR HWY_TARGET_STR_AVX2 //----------------------------------------------------------------------------- -// AVX3[_DL]/AVX10 -#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ - HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR || \ - HWY_TARGET == HWY_AVX10_2 +// AVX3[_DL/ZEN4/SPR]/AVX10 +#elif HWY_TARGET <= HWY_AVX3 #define HWY_ALIGN alignas(64) #define HWY_MAX_BYTES 64 @@ -329,7 +340,7 @@ #define HWY_HAVE_SCALABLE 0 #define HWY_HAVE_INTEGER64 1 #if HWY_TARGET <= HWY_AVX3_SPR && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1901) && \ + (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 2200) && \ HWY_HAVE_SCALAR_F16_TYPE #define 
HWY_HAVE_FLOAT16 1 #else @@ -338,11 +349,12 @@ #define HWY_HAVE_FLOAT64 1 #define HWY_MEM_OPS_MIGHT_FAULT 0 #define HWY_NATIVE_FMA 1 -#if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16) +#if (HWY_TARGET <= HWY_AVX3_ZEN4) && HWY_AVX3_ENABLE_AVX512BF16 #define HWY_NATIVE_DOT_BF16 1 #else #define HWY_NATIVE_DOT_BF16 0 #endif +#define HWY_NATIVE_MASK 1 #define HWY_CAP_GE256 1 #if HWY_MAX_BYTES >= 64 @@ -395,6 +407,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 1 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -432,6 +445,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 1 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -508,6 +522,8 @@ #define HWY_NATIVE_DOT_BF16 0 #endif +#define HWY_NATIVE_MASK 0 + #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -553,12 +569,20 @@ #define HWY_TARGET_STR_FP16 "+fp16" #endif +#define HWY_TARGET_STR_I8MM "+i8mm" + #if HWY_TARGET == HWY_NEON_WITHOUT_AES +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400 +// Prevents inadvertent use of SVE by GCC 13.4 and earlier, see #2689. +#define HWY_TARGET_STR "+nosve" +#else // Do not define HWY_TARGET_STR (no pragma). 
+#endif // HWY_COMPILER_GCC_ACTUAL #elif HWY_TARGET == HWY_NEON #define HWY_TARGET_STR HWY_TARGET_STR_NEON #elif HWY_TARGET == HWY_NEON_BF16 -#define HWY_TARGET_STR HWY_TARGET_STR_FP16 "+bf16+dotprod" HWY_TARGET_STR_NEON +#define HWY_TARGET_STR \ + HWY_TARGET_STR_FP16 HWY_TARGET_STR_I8MM "+bf16+dotprod" HWY_TARGET_STR_NEON #else #error "Logic error, missing case" #endif // HWY_TARGET @@ -589,6 +613,7 @@ #else #define HWY_NATIVE_DOT_BF16 0 #endif +#define HWY_NATIVE_MASK 1 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -610,15 +635,17 @@ #define HWY_HAVE_SCALABLE 1 #endif +#define HWY_TARGET_STR_I8MM "+i8mm" + // Can use pragmas instead of -march compiler flag #if HWY_HAVE_RUNTIME_DISPATCH #if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 // Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic // dispatch, which checks for AES support at runtime. #if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0) -#define HWY_TARGET_STR "+sve2+sve2-aes,+sve" +#define HWY_TARGET_STR "+sve2+sve2-aes,+sve" HWY_TARGET_STR_I8MM #else // SVE2 without AES -#define HWY_TARGET_STR "+sve2,+sve" +#define HWY_TARGET_STR "+sve2,+sve" HWY_TARGET_STR_I8MM #endif #else // not SVE2 target #define HWY_TARGET_STR "+sve" @@ -642,6 +669,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -664,6 +692,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 1 #define HWY_CAP_GE512 0 @@ -692,6 +721,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 0 #define HWY_NATIVE_FMA 1 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 1 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -717,9 +747,15 @@ #if HWY_TARGET == HWY_LSX #define HWY_ALIGN alignas(16) #define HWY_MAX_BYTES 16 +#ifndef __loongarch_sx +#define HWY_TARGET_STR "lsx" +#endif #else #define HWY_ALIGN 
alignas(32) #define HWY_MAX_BYTES 32 +#ifndef __loongarch_asx +#define HWY_TARGET_STR "lsx,lasx" +#endif #endif #define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T)) @@ -731,6 +767,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 1 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #if HWY_TARGET == HWY_LSX #define HWY_CAP_GE256 0 @@ -763,6 +800,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 1 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 @@ -785,6 +823,7 @@ #define HWY_MEM_OPS_MIGHT_FAULT 0 #define HWY_NATIVE_FMA 0 #define HWY_NATIVE_DOT_BF16 0 +#define HWY_NATIVE_MASK 0 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 diff --git a/third_party/highway/hwy/ops/x86_128-inl.h b/third_party/highway/hwy/ops/x86_128-inl.h @@ -57,7 +57,7 @@ namespace detail { #undef HWY_AVX3_HAVE_F32_TO_BF16C #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \ (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \ - !defined(HWY_AVX3_DISABLE_AVX512BF16) + HWY_AVX3_ENABLE_AVX512BF16 #define HWY_AVX3_HAVE_F32_TO_BF16C 1 #else #define HWY_AVX3_HAVE_F32_TO_BF16C 0 @@ -71,8 +71,9 @@ namespace detail { #endif #undef HWY_X86_HAVE_AVX10_2_OPS -#if HWY_TARGET_IS_AVX10_2 && \ - (HWY_COMPILER_GCC_ACTUAL >= 1501 || HWY_COMPILER3_CLANG >= 200103) +#if HWY_TARGET_IS_AVX10_2 && \ + (HWY_COMPILER_GCC_ACTUAL >= 1501 || \ + (HWY_COMPILER3_CLANG >= 200103 && HWY_COMPILER_CLANG != 2100)) #define HWY_X86_HAVE_AVX10_2_OPS 1 #else #define HWY_X86_HAVE_AVX10_2_OPS 0 @@ -1004,6 +1005,23 @@ HWY_API MFromD<D> MaskFalse(D /*d*/) { return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)}; } +// ------------------------------ SetMask +#ifdef HWY_NATIVE_SET_MASK +#undef HWY_NATIVE_SET_MASK +#else +#define HWY_NATIVE_SET_MASK +#endif + +template <class D> +HWY_API MFromD<D> SetMask(D /*d*/, bool val) { + constexpr uint64_t kMask = (HWY_MAX_LANES_D(D) < 64) + ? 
((1ULL << (HWY_MAX_LANES_D(D) & 63)) - 1ULL) + : LimitsMax<uint64_t>(); + + return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>( + static_cast<uint64_t>(-static_cast<int64_t>(val)) & kMask)}; +} + // ------------------------------ IsNegative (MFromD) #ifdef HWY_NATIVE_IS_NEGATIVE #undef HWY_NATIVE_IS_NEGATIVE @@ -6962,7 +6980,11 @@ HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { template <int kLane, typename T, size_t N, HWY_IF_UI32(T)> HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; + HWY_IF_CONSTEXPR(N == 1){ + return Vec128<T, N>{v}; // Workaround for MSVC compiler bug on single lane integer broadcast + }else{ + return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; + } } template <int kLane, typename T, size_t N, HWY_IF_UI64(T)> @@ -10003,12 +10025,21 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate( #else #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #endif + +#if HWY_X86_HAVE_AVX10_2_OPS +template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> +HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, + VFromD<Repartition<int8_t, DI32>> a, + VFromD<Repartition<int8_t, DI32>> b, + VFromD<DI32> sum) { + return VFromD<DI32>{_mm_dpbssd_epi32(sum.raw, a.raw, b.raw)}; +} +#else // !HWY_X86_HAVE_AVX10_2_OPS template <class DI32, HWY_IF_I32_D(DI32)> HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, VFromD<Repartition<int8_t, DI32>> a, VFromD<Repartition<int8_t, DI32>> b, VFromD<DI32> sum) { - // TODO(janwas): AVX-VNNI-INT8 has dpbssd. 
const Repartition<uint8_t, decltype(di32)> du8; const auto a_u = BitCast(du8, a); @@ -10017,17 +10048,26 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32))); return result_sum_0 - result_sum_1; } +#endif // HWY_X86_HAVE_AVX10_2_OPS #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #endif + +#if HWY_X86_HAVE_AVX10_2_OPS +template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16)> +HWY_API VFromD<DU32> SumOfMulQuadAccumulate( + DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, + VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { + return VFromD<DU32>{_mm_dpbuud_epi32(sum.raw, a.raw, b.raw)}; +} +#else // !HWY_X86_HAVE_AVX10_2_OPS template <class DU32, HWY_IF_U32_D(DU32)> HWY_API VFromD<DU32> SumOfMulQuadAccumulate( DU32 du32, VFromD<Repartition<uint8_t, DU32>> a, VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { - // TODO(janwas): AVX-VNNI-INT8 has dpbuud. 
const Repartition<uint8_t, decltype(du32)> du8; const RebindToSigned<decltype(du8)> di8; const RebindToSigned<decltype(du32)> di32; @@ -10040,6 +10080,7 @@ HWY_API VFromD<DU32> SumOfMulQuadAccumulate( return BitCast(du32, result_sum_0 - result_sum_1); } +#endif // HWY_X86_HAVE_AVX10_2_OPS #endif // HWY_TARGET <= HWY_AVX3_DL diff --git a/third_party/highway/hwy/ops/x86_256-inl.h b/third_party/highway/hwy/ops/x86_256-inl.h @@ -2113,8 +2113,8 @@ HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/, // ------------------------------ SumsOfAdjQuadAbsDiff template <int kAOffset, int kBOffset> -static Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a, - Vec256<uint8_t> b) { +HWY_API Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a, + Vec256<uint8_t> b) { static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be between 0 and 1"); static_assert(0 <= kBOffset && kBOffset <= 3, @@ -6424,7 +6424,24 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate( return VFromD<DI32>{_mm256_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; } -#endif +#if HWY_X86_HAVE_AVX10_2_OPS +template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)> +HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, + VFromD<Repartition<int8_t, DI32>> a, + VFromD<Repartition<int8_t, DI32>> b, + VFromD<DI32> sum) { + return VFromD<DI32>{_mm256_dpbssd_epi32(sum.raw, a.raw, b.raw)}; +} + +template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 32)> +HWY_API VFromD<DU32> SumOfMulQuadAccumulate( + DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, + VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { + return VFromD<DU32>{_mm256_dpbuud_epi32(sum.raw, a.raw, b.raw)}; +} +#endif // HWY_X86_HAVE_AVX10_2_OPS + +#endif // HWY_TARGET <= HWY_AVX3_DL // ================================================== CONVERT diff --git a/third_party/highway/hwy/ops/x86_512-inl.h b/third_party/highway/hwy/ops/x86_512-inl.h @@ -6744,22 +6744,30 @@ HWY_API Vec512<uint64_t> 
CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) { // SumsOfAdjShufQuadAbsDiff) template <int kAOffset, int kBOffset> -static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a, - Vec512<uint8_t> b) { +HWY_API Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a, + Vec512<uint8_t> b) { static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be between 0 and 1"); static_assert(0 <= kBOffset && kBOffset <= 3, "kBOffset must be between 0 and 3"); +#if HWY_X86_HAVE_AVX10_2_OPS + // AVX10.2 now has the _mm512_mpsadbw_epu8 intrinsic available + return Vec512<uint16_t>{_mm512_mpsadbw_epu8( + a.raw, b.raw, + (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)}; +#else const DFromV<decltype(a)> d; const RepartitionToWideX2<decltype(d)> du32; - // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the - // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on - // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast. + // The _mm512_mpsadbw_epu8 intrinsic is not available prior to AVX10.2. + // The SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on + // pre-AVX10.2 targets that support AVX3 using SumsOfShuffledQuadAbsDiff and + // U32 Broadcast. 
return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1, kAOffset>( a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b)))); +#endif } #if !HWY_IS_MSAN @@ -7636,6 +7644,23 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate( return VFromD<DI32>{_mm512_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; } +#if HWY_X86_HAVE_AVX10_2_OPS +template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)> +HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, + VFromD<Repartition<int8_t, DI32>> a, + VFromD<Repartition<int8_t, DI32>> b, + VFromD<DI32> sum) { + return VFromD<DI32>{_mm512_dpbssd_epi32(sum.raw, a.raw, b.raw)}; +} + +template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 64)> +HWY_API VFromD<DU32> SumOfMulQuadAccumulate( + DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, + VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { + return VFromD<DU32>{_mm512_dpbuud_epi32(sum.raw, a.raw, b.raw)}; +} +#endif // HWY_X86_HAVE_AVX10_2_OPS + #endif // ------------------------------ Reductions diff --git a/third_party/highway/hwy/perf_counters.cc b/third_party/highway/hwy/perf_counters.cc @@ -343,7 +343,7 @@ class PMU { // Monostate, see header. 
PMU& GetPMU() { - static PMU pmu; + static PMU& pmu = *new PMU(); // avoids exit-dtor warning (no dtor required) return pmu; } diff --git a/third_party/highway/hwy/perf_counters.h b/third_party/highway/hwy/perf_counters.h @@ -82,7 +82,7 @@ class PerfCounters { case kMigrations: return "migration"; default: - HWY_ABORT("Bug: unknown counter %d", c); + HWY_UNREACHABLE; } } diff --git a/third_party/highway/hwy/print-inl.h b/third_party/highway/hwy/print-inl.h @@ -15,6 +15,8 @@ // Print() function +#include <stddef.h> + #include "hwy/highway.h" #include "hwy/print.h" diff --git a/third_party/highway/hwy/print.cc b/third_party/highway/hwy/print.cc @@ -37,6 +37,8 @@ HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) { } } +// The NOLINT are to suppress the warning about passing 100 instead of +// `sizeof(string100)`, which is a pointer. HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, char* string100) { if (info.sizeof_t == 1) { @@ -52,12 +54,14 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, } else if (info.sizeof_t == 2) { if (info.is_bf16) { const double value = static_cast<double>(F32FromBF16Mem(ptr)); - const char* fmt = hwy::ScalarAbs(value) < 1E-3 ? "%.3E" : "%.3f"; - snprintf(string100, 100, fmt, value); // NOLINT + // NOLINTNEXTLINE + snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-3 ? "%.3E" : "%.3f", + value); } else if (info.is_float) { const double value = static_cast<double>(F32FromF16Mem(ptr)); - const char* fmt = hwy::ScalarAbs(value) < 1E-4 ? "%.4E" : "%.4f"; - snprintf(string100, 100, fmt, value); // NOLINT + // NOLINTNEXTLINE + snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-4 ? "%.4E" : "%.4f", + value); } else { uint16_t bits; CopyBytes<2>(ptr, &bits); @@ -67,8 +71,9 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, if (info.is_float) { float value; CopyBytes<4>(ptr, &value); - const char* fmt = hwy::ScalarAbs(value) < 1E-6 ? 
"%.9E" : "%.9f"; - snprintf(string100, 100, fmt, static_cast<double>(value)); // NOLINT + // NOLINTNEXTLINE + snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-6f ? "%.9E" : "%.9f", + static_cast<double>(value)); } else if (info.is_signed) { int32_t value; CopyBytes<4>(ptr, &value); @@ -82,8 +87,9 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, if (info.is_float) { double value; CopyBytes<8>(ptr, &value); - const char* fmt = hwy::ScalarAbs(value) < 1E-9 ? "%.18E" : "%.18f"; - snprintf(string100, 100, fmt, value); // NOLINT + // NOLINTNEXTLINE + snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-9 ? "%.18E" : "%.18f", + value); } else { const uint8_t* ptr8 = reinterpret_cast<const uint8_t*>(ptr); uint32_t lo, hi; diff --git a/third_party/highway/hwy/profiler.cc b/third_party/highway/hwy/profiler.cc @@ -33,15 +33,15 @@ namespace hwy { #if PROFILER_ENABLED -constexpr bool kPrintOverhead = true; +static constexpr bool kPrintOverhead = true; -// Initialize to an invalid value to detect when `InitThread` was not called. -// `Profiler()` does so for the main thread and `ThreadPool()` for all workers. -/*static*/ thread_local size_t Profiler::s_thread = ~size_t{0}; -/*static*/ std::atomic<size_t> Profiler::s_num_threads{0}; +// Must zero-init because `ThreadFunc` calls `SetGlobalIdx()` potentially after +// this is first used in the `pool::Worker` ctor. +/*static*/ thread_local size_t Profiler::s_global_idx = 0; // Detects duration of a zero-length zone: timer plus packet overhead. 
-static uint64_t DetectSelfOverhead(Profiler& profiler, size_t thread) { +static uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { + static const profiler::ZoneHandle zone = profiler.AddZone("DetectSelf"); profiler::Results results; const size_t kNumSamples = 25; uint32_t samples[kNumSamples]; @@ -52,13 +52,10 @@ static uint64_t DetectSelfOverhead(Profiler& profiler, size_t thread) { for (size_t idx_duration = 0; idx_duration < kNumDurations; ++idx_duration) { { - static const profiler::ZoneHandle zone = - profiler.AddZone("DetectSelfOverhead"); - PROFILER_ZONE3(profiler, /*thread=*/0, zone); + PROFILER_ZONE3(profiler, global_idx, zone); } - durations[idx_duration] = static_cast<uint32_t>( - profiler.GetThread(thread).GetFirstDurationAndReset( - thread, profiler.Accumulators())); + durations[idx_duration] = + static_cast<uint32_t>(profiler.GetFirstDurationAndReset(global_idx)); } samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); } @@ -68,8 +65,9 @@ static uint64_t DetectSelfOverhead(Profiler& profiler, size_t thread) { // Detects average duration of a zero-length zone, after deducting self // overhead. This accounts for the delay before/after capturing start/end // timestamps, for example due to fence instructions in timer::Start/Stop. -static uint64_t DetectChildOverhead(Profiler& profiler, size_t thread, +static uint64_t DetectChildOverhead(Profiler& profiler, size_t global_idx, uint64_t self_overhead) { + static const profiler::ZoneHandle zone = profiler.AddZone("DetectChild"); // Enough for stable measurements, but only about 50 ms startup cost. 
const size_t kMaxSamples = 30; uint32_t samples[kMaxSamples]; @@ -83,20 +81,17 @@ static uint64_t DetectChildOverhead(Profiler& profiler, size_t thread, HWY_FENCE; const uint64_t t0 = timer::Start(); for (size_t r = 0; r < kReps; ++r) { - static const profiler::ZoneHandle zone = - profiler.AddZone("DetectChildOverhead"); - PROFILER_ZONE3(profiler, /*thread=*/0, zone); + PROFILER_ZONE3(profiler, global_idx, zone); } const uint64_t t1 = timer::Stop(); HWY_FENCE; // We are measuring the total, not individual zone durations, to include // cross-zone overhead. - (void)profiler.GetThread(thread).GetFirstDurationAndReset( - thread, profiler.Accumulators()); + (void)profiler.GetFirstDurationAndReset(global_idx); const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; durations[d] = static_cast<uint32_t>( - profiler::PerThread::ClampedSubtract(avg_duration, self_overhead)); + profiler::PerWorker::ClampedSubtract(avg_duration, self_overhead)); } samples[num_samples] = robust_statistics::Mode(durations, kNumDurations); // Overhead is nonzero, but we often measure zero; skip them to prevent @@ -109,21 +104,26 @@ static uint64_t DetectChildOverhead(Profiler& profiler, size_t thread, Profiler::Profiler() { const uint64_t t0 = timer::Start(); - InitThread(); - char cpu[100]; if (HWY_UNLIKELY(!platform::HaveTimerStop(cpu))) { HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu); } + // `ThreadPool` calls `Profiler::Get()` before it creates threads, hence this + // is guaranteed to be running on the main thread. + constexpr size_t kMain = 0; + // Must be called before any use of `PROFILER_ZONE*/PROFILER_FUNC*`. This runs + // only once because `Profiler` is a singleton. + ReserveWorker(kMain); + SetGlobalIdx(kMain); + profiler::Overheads overheads; - // WARNING: must pass in Profiler& and use `PROFILER_ZONE3` to avoid calling - // `Profiler::Get()` here, because that would re-enter the magic static init. 
- constexpr size_t kThread = 0; - overheads.self = DetectSelfOverhead(*this, kThread); - overheads.child = DetectChildOverhead(*this, kThread, overheads.self); - for (size_t thread = 0; thread < profiler::kMaxThreads; ++thread) { - threads_[thread].SetOverheads(overheads); + // WARNING: must pass in `*this` and use `PROFILER_ZONE3` to avoid calling + // `Profiler::Get()`, because that would re-enter the magic static init. + overheads.self = DetectSelfOverhead(*this, kMain); + overheads.child = DetectChildOverhead(*this, kMain, overheads.self); + for (size_t worker = 0; worker < profiler::kMaxWorkers; ++worker) { + workers_[worker].SetOverheads(overheads); } HWY_IF_CONSTEXPR(kPrintOverhead) { @@ -139,8 +139,8 @@ Profiler::Profiler() { // Even if disabled, we want to export the symbol. HWY_DLLEXPORT Profiler& Profiler::Get() { - static Profiler profiler; - return profiler; + static Profiler* profiler = new Profiler(); + return *profiler; } } // namespace hwy diff --git a/third_party/highway/hwy/profiler.h b/third_party/highway/hwy/profiler.h @@ -17,21 +17,29 @@ #include <stddef.h> #include <stdint.h> +#include <string.h> // strcmp, strlen + +#include <atomic> +#include <functional> +#include "hwy/base.h" #include "hwy/highway_export.h" // High precision, low overhead time measurements. Returns exact call counts and // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). // // Uses RAII to capture begin/end timestamps, with user-specified zone names: -// { PROFILER_ZONE("name"); /*code*/ } or -// the name of the current function: -// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. -// You can reduce the overhead by passing a thread ID: -// `PROFILER_ZONE2(thread, name)`. The new and preferred API also allows -// passing flags, such as requesting inclusive time: +// `{ PROFILER_ZONE("name"); /*code*/ }` or the name of the current function: +// `void FuncToMeasure() { PROFILER_FUNC; /*code*/ }`. 
+// +// You can reduce the overhead by passing `global_idx`, which can be taken from +// the argument to the `ThreadPool::Run` lambda (if the pool was constructed +// with non-default `PoolWorkerMapping`), or from a saved copy of the +// thread-local `Profiler::Thread`: `PROFILER_ZONE2(global_idx, name)`. +// +// The preferred API allows passing flags, such as requesting inclusive time: // `static const auto zone = profiler.AddZone("name", flags);` and then -// `PROFILER_ZONE3(profiler, thread, zone)`. +// `PROFILER_ZONE3(profiler, global_idx, zone)`. // // After all threads exit all zones, call `Profiler::Get().PrintResults()` to // print call counts and average durations [CPU cycles] to stdout, sorted in @@ -44,14 +52,12 @@ #if PROFILER_ENABLED #include <stdio.h> -#include <string.h> // strcmp, strlen #include <algorithm> // std::sort -#include <atomic> +#include <utility> #include <vector> #include "hwy/aligned_allocator.h" -#include "hwy/base.h" #include "hwy/bit_set.h" #include "hwy/timer.h" #endif // PROFILER_ENABLED @@ -60,8 +66,8 @@ namespace hwy { // Flags: we want type-safety (enum class) to catch mistakes such as confusing // zone with flags. Base type (`uint32_t`) ensures it is safe to cast. Defined -// outside the `#if` because callers pass them to `PROFILER_ZONE3`. Keep in -// sync with `kNumFlags` below. +// outside the `#if` because callers pass them to `PROFILER_ZONE3`. When adding +// flags, also update `kNumFlags` and `ChildTotalMask`. enum class ProfilerFlags : uint32_t { kDefault = 0, // The zone should report cumulative time, including all child zones. If not @@ -69,6 +75,78 @@ enum class ProfilerFlags : uint32_t { kInclusive = 1 }; +// Called during `PrintResults` to print results from other modules. 
+using ProfilerFunc = std::function<void(void)>; + +template <size_t kMaxStrings> +class StringTable { + static constexpr std::memory_order kRelaxed = std::memory_order_relaxed; + static constexpr std::memory_order kAcq = std::memory_order_acquire; + static constexpr std::memory_order kRel = std::memory_order_release; + + public: + // Returns a copy of the `name` passed to `Add` that returned the + // given `idx`. + const char* Name(size_t idx) const { + // `kRelaxed` is sufficient because pointers are immutable once published + // via a `kRelease` store. + return ptrs_[idx].load(kRelaxed); + } + + // Returns `idx < kMaxStrings`. Can be called concurrently. Calls with the + // same `name` return the same `idx`. + size_t Add(const char* name) { + // Linear search if it already exists. `kAcq` ensures we see prior stores. + const size_t num_strings = next_ptr_.load(kAcq); + HWY_ASSERT(num_strings < kMaxStrings); + for (size_t idx = 1; idx < num_strings; ++idx) { + const char* existing = ptrs_[idx].load(kAcq); + // `next_ptr_` was published after writing `ptr_`, hence it is non-null. + HWY_ASSERT(existing != nullptr); + if (HWY_UNLIKELY(!strcmp(existing, name))) { + return idx; + } + } + + // Copy `name` into `chars_` before publishing the pointer. + const size_t len = strlen(name) + 1; + const size_t pos = next_char_.fetch_add(len, kRelaxed); + HWY_ASSERT(pos + len <= sizeof(chars_)); + strcpy(chars_ + pos, name); // NOLINT + + for (;;) { + size_t idx = next_ptr_.load(kRelaxed); + HWY_ASSERT(idx < kMaxStrings); + + // Attempt to claim the next `idx` via CAS. + const char* expected = nullptr; + if (HWY_LIKELY(ptrs_[idx].compare_exchange_weak(expected, chars_ + pos, + kRel, kRelaxed))) { + // Publish the new count and make the `ptrs_` write visible. + next_ptr_.store(idx + 1, kRel); + HWY_DASSERT(!strcmp(Name(idx), name)); + return idx; + } + + // We lost the race. `expected` has been updated. 
+ if (HWY_UNLIKELY(!strcmp(expected, name))) { + // Done, another thread added the same name. Note that we waste the + // extra space in `chars_`, which is fine because it is rare. + HWY_DASSERT(!strcmp(Name(idx), name)); + return idx; + } + + // Other thread added a different name. Retry with the next slot. + } + } + + private: + std::atomic<const char*> ptrs_[kMaxStrings]; + std::atomic<size_t> next_ptr_{1}; // next idx + std::atomic<size_t> next_char_{0}; + char chars_[kMaxStrings * 55]; +}; + #if PROFILER_ENABLED // Implementation details. @@ -78,13 +156,13 @@ HWY_INLINE_VAR constexpr size_t kNumFlags = 1; // Upper bounds for fixed-size data structures, guarded via HWY_DASSERT: -// Maximum nesting of zones, chosen such that PerThread is 256 bytes. +// Maximum nesting of zones, chosen such that `PerWorker` is 256 bytes. HWY_INLINE_VAR constexpr size_t kMaxDepth = 13; // Reports with more than ~50 are anyway difficult to read. HWY_INLINE_VAR constexpr size_t kMaxZones = 128; -// Upper bound on threads that call `InitThread`, and `thread` arguments. Note -// that fiber libraries can spawn hundreds of threads. Enough for Turin cores. -HWY_INLINE_VAR constexpr size_t kMaxThreads = 256; +// Upper bound on global worker_idx across all pools. Note that fiber libraries +// can spawn hundreds of threads. Turin has 128-192 cores. +HWY_INLINE_VAR constexpr size_t kMaxWorkers = 256; // Type-safe wrapper for zone index plus flags, returned by `AddZone`. class ZoneHandle { @@ -119,8 +197,11 @@ class ZoneHandle { // Returns a mask to zero/ignore child totals for inclusive zones. uint64_t ChildTotalMask() const { - // Without this function, clang tends to generate a branch. - return IsInclusive() ? 0 : ~uint64_t{0}; + // With a ternary operator, clang tends to generate a branch. + // return IsInclusive() ? 
0 : ~uint64_t{0}; + const uint32_t bit = + bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive); + return uint64_t{bit} - 1; } private: @@ -128,48 +209,96 @@ class ZoneHandle { }; // Storage for zone names. -class Names { - static constexpr std::memory_order kRel = std::memory_order_relaxed; - +class Zones { public: // Returns a copy of the `name` passed to `AddZone` that returned the // given `zone`. - const char* Get(ZoneHandle zone) const { return ptrs_[zone.ZoneIdx()]; } + const char* Name(ZoneHandle zone) const { + return strings_.Name(zone.ZoneIdx()); + } + // Can be called concurrently. Calls with the same `name` return the same + // `ZoneHandle.ZoneIdx()`. ZoneHandle AddZone(const char* name, ProfilerFlags flags) { - // Linear search whether it already exists. - const size_t num_zones = next_ptr_.load(kRel); - HWY_ASSERT(num_zones < kMaxZones); - for (size_t zone_idx = 1; zone_idx < num_zones; ++zone_idx) { - if (!strcmp(ptrs_[zone_idx], name)) { - return ZoneHandle(zone_idx, flags); + return ZoneHandle(strings_.Add(name), flags); + } + + private: + StringTable<kMaxZones> strings_; +}; + +// Allows other classes such as `ThreadPool` to register/unregister a function +// to call during `PrintResults`. This allows us to gather data from the worker +// threads without having to wait until they exit, and decouples the profiler +// from other modules. Thread-safe. +class Funcs { + static constexpr auto kAcq = std::memory_order_acquire; + static constexpr auto kRel = std::memory_order_release; + + public: + // Can be called concurrently with distinct keys. + void Add(intptr_t key, ProfilerFunc func) { + HWY_ASSERT(key != 0 && key != kPending); // reserved values + HWY_ASSERT(func); // not empty + + for (size_t i = 0; i < kMaxFuncs; ++i) { + intptr_t expected = 0; + // Lost a race with a concurrent `Add`, try the next slot. + if (!keys_[i].compare_exchange_strong(expected, kPending, kRel)) { + continue; } + // We own the slot: move func there. 
+ funcs_[i] = std::move(func); + keys_[i].store(key, kRel); // publishes the `func` write. + return; } - // Reserve the next `zone_idx` (index in `ptrs_`). - const size_t zone_idx = next_ptr_.fetch_add(1, kRel); - - // Copy into `name` into `chars_`. - const size_t len = strlen(name) + 1; - const size_t pos = next_char_.fetch_add(len, kRel); - HWY_ASSERT(pos + len <= sizeof(chars_)); - strcpy(chars_ + pos, name); // NOLINT + HWY_ABORT("Funcs::Add: no free slot, increase kMaxFuncs."); + } + + // Can be called concurrently with distinct keys. It is an error to call this + // without a prior `Add` of the same key. + void Remove(intptr_t key) { + HWY_ASSERT(key != 0 && key != kPending); // reserved values + + for (size_t i = 0; i < kMaxFuncs; ++i) { + intptr_t actual = keys_[i].load(kAcq); + if (actual == key) { + // In general, concurrent removal is fine, but in this specific context, + // owners are expected to remove their key exactly once, from the same + // thread that added it. In that case, CAS should not fail. + if (!keys_[i].compare_exchange_strong(actual, kPending, kRel)) { + HWY_WARN("Funcs: CAS failed, why is there a concurrent Remove?"); + } + funcs_[i] = ProfilerFunc(); + keys_[i].store(0, kRel); // publishes the `func` write. + return; + } + } + HWY_ABORT("Funcs::Remove: failed to find key %p.", + reinterpret_cast<void*>(key)); + } - ptrs_[zone_idx] = chars_ + pos; - const ZoneHandle zone(zone_idx, flags); - HWY_DASSERT(!strcmp(Get(zone), name)); - return zone; + void CallAll() const { + for (size_t i = 0; i < kMaxFuncs; ++i) { + intptr_t key = keys_[i].load(kAcq); // ensures `funcs_` is visible. + // Safely handles concurrent Add/Remove. 
+ if (key != 0 && key != kPending) { + funcs_[i](); + } + } } private: - const char* ptrs_[kMaxZones]; - std::atomic<size_t> next_ptr_{1}; // next zone_idx - char chars_[kMaxZones * 70]; - std::atomic<size_t> next_char_{0}; + static constexpr size_t kMaxFuncs = 64; + static constexpr intptr_t kPending = -1; + + ProfilerFunc funcs_[kMaxFuncs]; // non-atomic + std::atomic<intptr_t> keys_[kMaxFuncs] = {}; }; -// Holds total duration and number of calls. "Which thread entered it" is -// unnecessary because these are per-thread. +// Holds total duration and number of calls. Worker index is implicit in the +// index of this class within the `Accumulators` array. struct Accumulator { void Add(ZoneHandle new_zone, uint64_t self_duration) { duration += self_duration; @@ -183,7 +312,7 @@ struct Accumulator { num_calls += 1; } - void Assimilate(Accumulator& other) { + void Take(Accumulator& other) { duration += other.duration; other.duration = 0; @@ -203,94 +332,38 @@ struct Accumulator { }; static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size"); -// Modified from `hwy::BitSet4096`. Avoids the second-level `BitSet64`, because -// we only need `kMaxZones` = 128. -class ZoneSet { - public: - // No harm if `i` is already set. - void Set(size_t i) { - HWY_DASSERT(i < kMaxZones); - const size_t idx = i / 64; - const size_t mod = i % 64; - bits_[idx].Set(mod); - HWY_DASSERT(Get(i)); - } - - void Clear(size_t i) { - HWY_DASSERT(i < kMaxZones); - const size_t idx = i / 64; - const size_t mod = i % 64; - bits_[idx].Clear(mod); - HWY_DASSERT(!Get(i)); - } - - bool Get(size_t i) const { - HWY_DASSERT(i < kMaxZones); - const size_t idx = i / 64; - const size_t mod = i % 64; - return bits_[idx].Get(mod); - } - - // Returns lowest i such that Get(i). Caller must ensure Any() beforehand! - size_t First() const { - HWY_DASSERT(bits_[0].Any() || bits_[1].Any()); - const size_t idx = bits_[0].Any() ? 
0 : 1; - return idx * 64 + bits_[idx].First(); - } - - // Calls `func(i)` for each `i` in the set. It is safe for `func` to modify - // the set, but the current Foreach call is only affected if changing one of - // the not yet visited BitSet64 for which Any() is true. - template <class Func> - void Foreach(const Func& func) const { - bits_[0].Foreach([&func](size_t mod) { func(mod); }); - bits_[1].Foreach([&func](size_t mod) { func(64 + mod); }); - } - - size_t Count() const { return bits_[0].Count() + bits_[1].Count(); } - - private: - static_assert(kMaxZones == 128, "Update ZoneSet"); - BitSet64 bits_[2]; -}; - -// Modified from `ZoneSet`. -class ThreadSet { - public: - // No harm if `i` is already set. - void Set(size_t i) { - HWY_DASSERT(i < kMaxThreads); - const size_t idx = i / 64; - const size_t mod = i % 64; - bits_[idx].Set(mod); - } - - size_t Count() const { - size_t total = 0; - for (const BitSet64& bits : bits_) { - total += bits.Count(); - } - return total; - } - - private: - BitSet64 bits_[DivCeil(kMaxThreads, size_t{64})]; -}; +using ZoneSet = hwy::BitSet<kMaxZones>; +using WorkerSet = hwy::BitSet<kMaxWorkers>; +using AtomicWorkerSet = hwy::AtomicBitSet<kMaxWorkers>; // Durations are per-CPU, but end to end performance is defined by wall time. // Assuming fork-join parallelism, zones are entered by multiple threads // concurrently, which means the total number of unique threads is also the // degree of concurrency, so we can estimate wall time as CPU time divided by -// the number of unique threads seen, tracked via `ThreadSet`. +// the number of unique threads seen. This is facilitated by unique `global_idx` +// passed in by callers, or taken from thread-local `GlobalIdx()`. // // We also want to support varying thread counts per call site, because the same // function/zone may be called from multiple pools. 
`EndRootRun` calls -// `CountThreadsAndReset` after each top-level `ThreadPool::Run`, which +// `CountWorkersAndReset` after each top-level `ThreadPool::Run`, which // generates one data point summarized via descriptive statistics. Here we -// implement a simpler version of `hwy::Stats` because we do not require +// implement a simpler version of `Stats` because we do not require // geomean/variance/kurtosis/skewness. Because concurrency is a small integer, // we can simply compute sums rather than online moments. There is also only one -// instance across all threads, hence we do not require `Assimilate`. +// instance across all threads, hence we do not require a `Take`. +// +// Note that subsequently discovered prior work estimates the number of active +// and idle processors by updating atomic counters whenever they start/finish a +// task: https://homes.cs.washington.edu/~tom/pubs/quartz.pdf and "Effective +// performance measurement and analysis of multithreaded applications". We +// instead accumulate zone durations into per-thread storage. +// `CountWorkersAndReset` then checks how many were nonzero, which avoids +// expensive atomic updates and ensures accurate counts per-zone, rather than +// estimates of current activity at each sample. +// D. Vyukov's https://github.com/dvyukov/perf-load, also integrated into Linux +// perf, also corrects for parallelism without using atomic counters by tracing +// context switches. Note that we often pin threads, which avoids migrations, +// but reduces the number of context switch events to mainly preemptions. class ConcurrencyStats { public: ConcurrencyStats() { Reset(); } @@ -324,45 +397,42 @@ class ConcurrencyStats { }; static_assert(sizeof(ConcurrencyStats) == (8 + 3 * sizeof(size_t)), ""); -// Holds the final results across all threads, including `ConcurrencyStats`. -// There is only one instance because this is updated by the main thread. 
+// Holds the final results across all threads, including `ConcurrencyStats` +// and `PoolStats`, updated/printed by the main thread. class Results { public: - void Assimilate(const size_t thread, const size_t zone_idx, - Accumulator& other) { - HWY_DASSERT(thread < kMaxThreads); + void TakeAccumulator(const size_t global_idx, const size_t zone_idx, + Accumulator& other) { + HWY_DASSERT(global_idx < kMaxWorkers); HWY_DASSERT(zone_idx < kMaxZones); HWY_DASSERT(other.zone.ZoneIdx() == zone_idx); visited_zones_.Set(zone_idx); - totals_[zone_idx].Assimilate(other); - threads_[zone_idx].Set(thread); + totals_[zone_idx].Take(other); + workers_[zone_idx].Set(global_idx); } // Moves the total number of threads seen during the preceding root-level // `ThreadPool::Run` into one data point for `ConcurrencyStats`. - void CountThreadsAndReset(const size_t zone_idx) { + void CountWorkersAndReset(const size_t zone_idx) { HWY_DASSERT(zone_idx < kMaxZones); - const size_t num_threads = threads_[zone_idx].Count(); - // Although threads_[zone_idx] at one point was non-empty, it is reset + const size_t num_workers = workers_[zone_idx].Count(); + // Although workers_[zone_idx] at one point was non-empty, it is reset // below, and so can be empty on the second call to this via `PrintResults`, // after one from `EndRootRun`. Do not add a data point if empty. 
- if (num_threads != 0) { - concurrency_[zone_idx].Notify(num_threads); + if (num_workers != 0) { + concurrency_[zone_idx].Notify(num_workers); } - threads_[zone_idx] = ThreadSet(); + workers_[zone_idx] = WorkerSet(); } - void CountThreadsAndReset() { + void CountWorkersAndReset() { visited_zones_.Foreach( - [&](size_t zone_idx) { CountThreadsAndReset(zone_idx); }); + [&](size_t zone_idx) { CountWorkersAndReset(zone_idx); }); } - void AddAnalysisTime(uint64_t t0) { analyze_elapsed_ += timer::Stop() - t0; } - - void Print(const Names& names) { - const uint64_t t0 = timer::Start(); - const double inv_freq = 1.0 / platform::InvariantTicksPerSecond(); + void PrintAndReset(const Zones& zones) { + const double inv_freq = 1.0 / hwy::platform::InvariantTicksPerSecond(); // Sort by decreasing total (self) cost. `totals_` are sparse, so sort an // index vector instead. @@ -371,7 +441,7 @@ class Results { visited_zones_.Foreach([&](size_t zone_idx) { indices.push_back(static_cast<uint32_t>(zone_idx)); // In case the zone exited after `EndRootRun` and was not yet added. - CountThreadsAndReset(zone_idx); + CountWorkersAndReset(zone_idx); }); std::sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) { return totals_[a].duration > totals_[b].duration; @@ -391,29 +461,23 @@ class Results { // Avoid division by zero. const double concurrency_divisor = HWY_MAX(1.0, avg_concurrency); printf("%s%-40s: %10.0f x %15.0f / %5.1f (%5zu %3zu-%3zu) = %9.6f\n", - total.zone.IsInclusive() ? "(I)" : " ", names.Get(total.zone), + total.zone.IsInclusive() ? "(I)" : " ", zones.Name(total.zone), static_cast<double>(total.num_calls), per_call, avg_concurrency, concurrency.Count(), concurrency.Min(), concurrency.Max(), duration * inv_freq / concurrency_divisor); total = Accumulator(); concurrency.Reset(); - // `threads_` was already reset by `CountThreadsAndReset`. + // `workers_` was already reset by `CountWorkersAndReset`. 
} visited_zones_ = ZoneSet(); - - AddAnalysisTime(t0); - printf("Total analysis [s]: %f\n", - static_cast<double>(analyze_elapsed_) * inv_freq); - analyze_elapsed_ = 0; } private: - uint64_t analyze_elapsed_ = 0; // Indicates which of the array entries are in use. ZoneSet visited_zones_; Accumulator totals_[kMaxZones]; - ThreadSet threads_[kMaxZones]; + WorkerSet workers_[kMaxZones]; ConcurrencyStats concurrency_[kMaxZones]; }; @@ -421,33 +485,33 @@ class Results { // with frequency throttling disabled, this has a multimodal distribution, // including 32, 34, 48, 52, 59, 62. struct Overheads { - uint32_t self = 0; - uint32_t child = 0; + uint64_t self = 0; + uint64_t child = 0; }; -static_assert(sizeof(Overheads) == 8, "Wrong Overheads size"); +static_assert(sizeof(Overheads) == 16, "Wrong Overheads size"); class Accumulators { // We generally want to group threads together because they are often // accessed together during a zone, but also want to avoid threads sharing a - // cache line. Hence interleave 8 zones per thread. + // cache line. Hence interleave 8 zones per worker. static constexpr size_t kPerLine = HWY_ALIGNMENT / sizeof(Accumulator); public: - Accumulator& Get(const size_t thread, const size_t zone_idx) { - HWY_DASSERT(thread < kMaxThreads); + Accumulator& Get(const size_t global_idx, const size_t zone_idx) { + HWY_DASSERT(global_idx < kMaxWorkers); HWY_DASSERT(zone_idx < kMaxZones); const size_t line = zone_idx / kPerLine; const size_t offset = zone_idx % kPerLine; - return zones_[(line * kMaxThreads + thread) * kPerLine + offset]; + return zones_[(line * kMaxWorkers + global_idx) * kPerLine + offset]; } private: - Accumulator zones_[kMaxZones * kMaxThreads]; + Accumulator zones_[kMaxZones * kMaxWorkers]; }; // Reacts to zone enter/exit events. Builds a stack of active zones and // accumulates self/child duration for each. 
-class PerThread { +class PerWorker { public: template <typename T> static T ClampedSubtract(const T minuend, const T subtrahend) { @@ -467,12 +531,11 @@ class PerThread { t_enter_[depth] = t_enter; child_total_[1 + depth] = 0; depth_ = 1 + depth; - HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { any_ = 1; } } // Exiting the most recently entered zone (top of stack). - void Exit(const uint64_t t_exit, const size_t thread, const ZoneHandle zone, - Accumulators& accumulators) { + void Exit(const uint64_t t_exit, const size_t global_idx, + const ZoneHandle zone, Accumulators& accumulators) { HWY_DASSERT(depth_ > 0); const size_t depth = depth_ - 1; const size_t zone_idx = zone.ZoneIdx(); @@ -484,8 +547,8 @@ class PerThread { const uint64_t self_duration = ClampedSubtract( duration, overheads_.self + overheads_.child + child_total); - accumulators.Get(thread, zone_idx).Add(zone, self_duration); - // For faster Assimilate() - not all zones are encountered. + accumulators.Get(global_idx, zone_idx).Add(zone, self_duration); + // For faster TakeAccumulator() - not all zones are encountered. visited_zones_.Set(zone_idx); // Adding this nested time to the parent's `child_total` will @@ -495,11 +558,10 @@ class PerThread { depth_ = depth; } - bool HadAnyZones() const { return HWY_IS_DEBUG_BUILD ? (any_ != 0) : false; } - // Returns the duration of one enter/exit pair and resets all state. Called // via `DetectSelfOverhead`. 
- uint64_t GetFirstDurationAndReset(size_t thread, Accumulators& accumulators) { + uint64_t GetFirstDurationAndReset(size_t global_idx, + Accumulators& accumulators) { HWY_DASSERT(depth_ == 0); HWY_DASSERT(visited_zones_.Count() == 1); @@ -508,32 +570,28 @@ class PerThread { HWY_DASSERT(visited_zones_.Get(zone_idx)); visited_zones_.Clear(zone_idx); - Accumulator& zone = accumulators.Get(thread, zone_idx); + Accumulator& zone = accumulators.Get(global_idx, zone_idx); const uint64_t duration = zone.duration; zone = Accumulator(); return duration; } // Adds all data to `results` and resets it here. Called from the main thread. - void MoveTo(const size_t thread, Accumulators& accumulators, + void MoveTo(const size_t global_idx, Accumulators& accumulators, Results& results) { - const uint64_t t0 = timer::Start(); - visited_zones_.Foreach([&](size_t zone_idx) { - results.Assimilate(thread, zone_idx, accumulators.Get(thread, zone_idx)); + results.TakeAccumulator(global_idx, zone_idx, + accumulators.Get(global_idx, zone_idx)); }); // OK to reset even if we have active zones, because we set `visited_zones_` // when exiting the zone. visited_zones_ = ZoneSet(); - - results.AddAnalysisTime(t0); } private: // 40 bytes: - ZoneSet visited_zones_; // Which `zones_` have been active on this thread. + ZoneSet visited_zones_; // Which `zones_` have been active on this worker. uint64_t depth_ = 0; // Current nesting level for active zones. - uint64_t any_ = 0; Overheads overheads_; uint64_t t_enter_[kMaxDepth]; @@ -541,9 +599,8 @@ class PerThread { // Shifting by one avoids bounds-checks for depth_ = 0 (root zone). uint64_t child_total_[1 + kMaxDepth] = {0}; }; - // Enables shift rather than multiplication. 
-static_assert(sizeof(PerThread) == 256, "Wrong size"); +static_assert(sizeof(PerWorker) == 256, "Wrong size"); } // namespace profiler @@ -551,37 +608,59 @@ class Profiler { public: static HWY_DLLEXPORT Profiler& Get(); - // Assigns the next counter value to the `thread_local` that `Thread` reads. - // Must be called exactly once on each thread before any `PROFILER_ZONE` - // (without a thread argument) are re-entered by multiple threads. - // `Profiler()` takes care of calling this for the main thread. It is fine not - // to call it for other threads as long as they only use `PROFILER_ZONE2` or - // `PROFILER_ZONE3`, which take a thread argument and do not call `Thread`. - static void InitThread() { s_thread = s_num_threads.fetch_add(1); } + // Returns `global_idx` from thread-local storage (0 for the main thread). + // Used by `PROFILER_ZONE/PROFILER_FUNC`. It is faster to instead pass the + // global_idx from `ThreadPool::Run` (if constructed with non-default + // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`. + // DEPRECATED: use `GlobalIdx` instead. + static size_t Thread() { return s_global_idx; } + static size_t GlobalIdx() { return s_global_idx; } + // Must be called from all worker threads, and once also on the main thread, + // before any use of `PROFILER_ZONE/PROFILER_FUNC`. + static void SetGlobalIdx(size_t global_idx) { s_global_idx = global_idx; } + + void ReserveWorker(size_t global_idx) { + HWY_ASSERT(!workers_reserved_.Get(global_idx)); + workers_reserved_.Set(global_idx); + } + + void FreeWorker(size_t global_idx) { + HWY_ASSERT(workers_reserved_.Get(global_idx)); + workers_reserved_.Clear(global_idx); + } - // Used by `PROFILER_ZONE/PROFILER_FUNC` to read the `thread` argument from - // thread_local storage. It is faster to instead pass the ThreadPool `thread` - // argument to `PROFILER_ZONE2/PROFILER_ZONE3`. 
Note that the main thread - // calls `InitThread` first, hence its `Thread` returns zero, which matches - // the main-first worker numbering used by `ThreadPool`. - static size_t Thread() { return s_thread; } + // Called by `Zone` from any thread. + void Enter(uint64_t t_enter, size_t global_idx) { + GetWorker(global_idx).Enter(t_enter); + } - // Speeds up `UpdateResults` by providing an upper bound on the number of - // threads tighter than `profiler::kMaxThreads`. It is not required to be - // tight, and threads less than this can still be unused. - void SetMaxThreads(size_t max_threads) { - HWY_ASSERT(max_threads <= profiler::kMaxThreads); - max_threads_ = max_threads; + // Called by `~Zone` from any thread. + void Exit(uint64_t t_exit, size_t global_idx, profiler::ZoneHandle zone) { + GetWorker(global_idx).Exit(t_exit, global_idx, zone, accumulators_); } - const char* Name(profiler::ZoneHandle zone) const { return names_.Get(zone); } + uint64_t GetFirstDurationAndReset(size_t global_idx) { + return GetWorker(global_idx) + .GetFirstDurationAndReset(global_idx, accumulators_); + } + + const char* Name(profiler::ZoneHandle zone) const { + return zones_.Name(zone); + } // Copies `name` into the string table and returns its unique `zone`. Uses // linear search, which is fine because this is called during static init. // Called via static initializer and the result is passed to the `Zone` ctor. profiler::ZoneHandle AddZone(const char* name, ProfilerFlags flags = ProfilerFlags::kDefault) { - return names_.AddZone(name, flags); + return zones_.AddZone(name, flags); + } + + void AddFunc(void* owner, ProfilerFunc func) { + funcs_.Add(reinterpret_cast<intptr_t>(owner), func); + } + void RemoveFunc(void* owner) { + funcs_.Remove(reinterpret_cast<intptr_t>(owner)); } // For reporting average concurrency. Called by `ThreadPool::Run` on the main @@ -602,9 +681,9 @@ class Profiler { // broadcasts to "all cores", but there is no universal guarantee. 
// // Under the assumption that all concurrency is via our `ThreadPool`, we can - // record all `thread` for each outermost (root) `ThreadPool::Run`. This + // record all `global_idx` for each outermost (root) `ThreadPool::Run`. This // collapses all nested pools into one 'invocation'. We then compute per-zone - // concurrency as the number of unique `thread` seen per invocation. + // concurrency as the number of unique `global_idx` seen per invocation. bool IsRootRun() { // We are not the root if a Run was already active. return !run_active_.test_and_set(std::memory_order_acquire); @@ -616,7 +695,7 @@ class Profiler { // when `PrintResults` is called. void EndRootRun() { UpdateResults(); - results_.CountThreadsAndReset(); + results_.CountWorkersAndReset(); run_active_.clear(std::memory_order_release); } @@ -625,58 +704,59 @@ class Profiler { // zones. Resets all state, can be called again after more zones. void PrintResults() { UpdateResults(); - // `CountThreadsAndReset` is fused into `Print`, so do not call it here. + // `CountWorkersAndReset` is fused into `Print`, so do not call it here. - results_.Print(names_); - } + results_.PrintAndReset(zones_); - // Only for use by Zone; called from any thread. - profiler::PerThread& GetThread(size_t thread) { - HWY_DASSERT(thread < profiler::kMaxThreads); - return threads_[thread]; + funcs_.CallAll(); } - profiler::Accumulators& Accumulators() { return accumulators_; } + + // TODO: remove when no longer called. + void SetMaxThreads(size_t) {} private: // Sets main thread index, computes self-overhead, and checks timer support. Profiler(); - // Called from the main thread. + profiler::PerWorker& GetWorker(size_t global_idx) { + HWY_DASSERT(workers_reserved_.Get(global_idx)); + return workers_[global_idx]; + } + + // Moves accumulators into Results. Called from the main thread. 
void UpdateResults() { - for (size_t thread = 0; thread < max_threads_; ++thread) { - threads_[thread].MoveTo(thread, accumulators_, results_); - } + // Ensure we see all writes from before the workers' release fence. + std::atomic_thread_fence(std::memory_order_acquire); - // Check that all other threads did not have any zones. - HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { - for (size_t thread = max_threads_; thread < profiler::kMaxThreads; - ++thread) { - HWY_ASSERT(!threads_[thread].HadAnyZones()); - } - } + workers_reserved_.Foreach([&](size_t global_idx) { + workers_[global_idx].MoveTo(global_idx, accumulators_, results_); + }); } - static thread_local size_t s_thread; - static std::atomic<size_t> s_num_threads; - size_t max_threads_ = profiler::kMaxThreads; + static thread_local size_t s_global_idx; + + // These are atomic because `ThreadFunc` reserves its slot(s) and even + // `ThreadPool::ThreadPool` may be called concurrently. Both have bit `i` set + // between calls to `Reserve*(i)` and `Free*(i)`. They are consulted in + // `UpdateResults` and to validate arguments in debug builds, and only updated + // in the pool/thread init/shutdown. + profiler::AtomicWorkerSet workers_reserved_; std::atomic_flag run_active_ = ATOMIC_FLAG_INIT; - // To avoid locking, each thread has its own working set. We could access this + profiler::Funcs funcs_; + + // To avoid locking, each worker has its own working set. We could access this // through `thread_local` pointers, but that is slow to read on x86. Because - // our `ThreadPool` anyway passes a `thread` argument, we can instead pass + // our `ThreadPool` anyway passes a `global_idx` argument, we can instead pass // that through the `PROFILER_ZONE2/PROFILER_ZONE3` macros. - profiler::PerThread threads_[profiler::kMaxThreads]; + profiler::PerWorker workers_[profiler::kMaxWorkers]; profiler::Accumulators accumulators_; - // Updated by the main thread after the root `ThreadPool::Run` and during - // `PrintResults`. 
- profiler::ConcurrencyStats concurrency_[profiler::kMaxZones]; - - profiler::Names names_; - profiler::Results results_; + + profiler::Zones zones_; }; namespace profiler { @@ -684,35 +764,33 @@ namespace profiler { // RAII for zone entry/exit. class Zone { public: - // Thread-compatible; must not be called concurrently with the same `thread`. - // `thread` must be < `HWY_MIN(kMaxThreads, max_threads_)`, and is typically: - // - passed from `ThreadPool` via `PROFILER_ZONE2/PROFILER_ZONE3`. NOTE: - // this value must be unique across all pools, which requires an offset to - // a nested pool's `thread` argument. - // - obtained from `Profiler::Thread()`, or - // - 0 if only a single thread is active. - Zone(Profiler& profiler, size_t thread, ZoneHandle zone) + // Thread-compatible; must not call concurrently with the same `global_idx`, + // which is either: + // - passed from `ThreadPool::Run` (if it was constructed with non-default + // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`; + // - obtained from `Profiler::GlobalIdx()`; or + // - 0 if running on the main thread. + Zone(Profiler& profiler, size_t global_idx, ZoneHandle zone) : profiler_(profiler) { HWY_FENCE; const uint64_t t_enter = timer::Start(); HWY_FENCE; - thread_ = static_cast<uint32_t>(thread); + global_idx_ = static_cast<uint32_t>(global_idx); zone_ = zone; - profiler.GetThread(thread).Enter(t_enter); + profiler.Enter(t_enter, global_idx); HWY_FENCE; } ~Zone() { HWY_FENCE; const uint64_t t_exit = timer::Stop(); - profiler_.GetThread(thread_).Exit(t_exit, thread_, zone_, - profiler_.Accumulators()); + profiler_.Exit(t_exit, static_cast<size_t>(global_idx_), zone_); HWY_FENCE; } private: Profiler& profiler_; - uint32_t thread_; + uint32_t global_idx_; ZoneHandle zone_; }; @@ -726,9 +804,15 @@ struct ZoneHandle {}; struct Profiler { static HWY_DLLEXPORT Profiler& Get(); - static void InitThread() {} + // DEPRECATED: use `GlobalIdx` instead. 
static size_t Thread() { return 0; } - void SetMaxThreads(size_t) {} + static size_t GlobalIdx() { return 0; } + static void SetGlobalIdx(size_t) {} + void ReserveWorker(size_t) {} + void FreeWorker(size_t) {} + void Enter(uint64_t, size_t) {} + void Exit(uint64_t, size_t, profiler::ZoneHandle) {} + uint64_t GetFirstDurationAndReset(size_t) { return 0; } const char* Name(profiler::ZoneHandle) const { return nullptr; } profiler::ZoneHandle AddZone(const char*, @@ -736,10 +820,15 @@ struct Profiler { return profiler::ZoneHandle(); } + void AddFunc(void*, ProfilerFunc) {} + void RemoveFunc(void*) {} + bool IsRootRun() { return false; } void EndRootRun() {} - void PrintResults() {} + + // TODO: remove when no longer called. + void SetMaxThreads(size_t) {} }; namespace profiler { @@ -754,26 +843,26 @@ struct Zone { // Creates a `Zone` lvalue with a line-dependent name, which records the elapsed // time from here until the end of the current scope. `p` is from -// `Profiler::Get()` or a cached reference. `thread` is < `kMaxThreads`. `zone` +// `Profiler::Get()` or a cached reference. `global_idx < kMaxWorkers`. `zone` // is the return value of `AddZone`. Separating its static init from the `Zone` // may be more efficient than `PROFILER_ZONE2`. -#define PROFILER_ZONE3(p, thread, zone) \ - HWY_FENCE; \ - const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, thread, zone); \ +#define PROFILER_ZONE3(p, global_idx, zone) \ + HWY_FENCE; \ + const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, global_idx, zone); \ HWY_FENCE // For compatibility with old callers that do not pass `p` nor `flags`. -// Also calls AddZone. Usage: `PROFILER_ZONE2(thread, "MyZone");` -#define PROFILER_ZONE2(thread, name) \ +// Also calls AddZone. 
Usage: `PROFILER_ZONE2(global_idx, "MyZone");` +#define PROFILER_ZONE2(global_idx, name) \ static const hwy::profiler::ZoneHandle HWY_CONCAT(zone, __LINE__) = \ hwy::Profiler::Get().AddZone(name); \ - PROFILER_ZONE3(hwy::Profiler::Get(), thread, HWY_CONCAT(zone, __LINE__)) -#define PROFILER_FUNC2(thread) PROFILER_ZONE2(thread, __func__) + PROFILER_ZONE3(hwy::Profiler::Get(), global_idx, HWY_CONCAT(zone, __LINE__)) +#define PROFILER_FUNC2(global_idx) PROFILER_ZONE2(global_idx, __func__) -// OBSOLETE: it is more efficient to pass `thread` from `ThreadPool` to +// OBSOLETE: it is more efficient to pass `global_idx` from `ThreadPool` to // `PROFILER_ZONE2/PROFILER_ZONE3`. Here we get it from thread_local storage. -#define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::Thread(), name) -#define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::Thread()) +#define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::GlobalIdx(), name) +#define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::GlobalIdx()) // DEPRECATED: Use `hwy::Profiler::Get()` directly instead. #define PROFILER_ADD_ZONE(name) hwy::Profiler::Get().AddZone(name) diff --git a/third_party/highway/hwy/stats.cc b/third_party/highway/hwy/stats.cc @@ -69,7 +69,7 @@ std::string Stats::ToString(int exclude) const { if (Count() == 0) return std::string("(none)"); char buf[300]; - int pos = 0; + size_t pos = 0; int ret; // snprintf - bytes written or negative for error. 
if ((exclude & kNoCount) == 0) { @@ -93,8 +93,8 @@ std::string Stats::ToString(int exclude) const { } if ((exclude & kNoMinMax) == 0) { - ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ", Min(), - Max()); + ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ", + static_cast<double>(Min()), static_cast<double>(Max())); HWY_ASSERT(ret > 0); pos += ret; } @@ -113,7 +113,7 @@ std::string Stats::ToString(int exclude) const { pos += ret; } - HWY_ASSERT(pos < static_cast<int>(sizeof(buf))); + HWY_ASSERT(pos < sizeof(buf)); return buf; } diff --git a/third_party/highway/hwy/targets.cc b/third_party/highway/hwy/targets.cc @@ -395,7 +395,7 @@ static int64_t DetectTargets() { // https://github.com/simdutf/simdutf/pull/236. // In addition to the bug that is there on macOS 12.1 or earlier, bits 5, 6, - // and 7 can be set to 0 on x86_64 CPU's with AVX3 support on macOS until + // and 7 can be set to 0 on x86_64 CPUs with AVX3 support on macOS until // the first AVX512 instruction is executed as macOS only preserves // ZMM16-ZMM31, the upper 256 bits of the ZMM registers, and K0-K7 across a // context switch on threads that have executed an AVX512 instruction. 
@@ -445,6 +445,10 @@ static int64_t DetectTargets() { #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH namespace arm { +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif + #if HWY_ARCH_ARM_A64 && !HWY_OS_APPLE && \ (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \ ((HWY_TARGETS & HWY_ALL_SVE) != 0) @@ -490,7 +494,8 @@ static int64_t DetectTargets() { if ((HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") || HasCpuFeature("hw.optional.arm.AdvSIMD_HPFPCvt")) && HasCpuFeature("hw.optional.arm.FEAT_DotProd") && - HasCpuFeature("hw.optional.arm.FEAT_BF16")) { + HasCpuFeature("hw.optional.arm.FEAT_BF16") && + HasCpuFeature("hw.optional.arm.FEAT_I8MM")) { bits |= HWY_NEON_BF16; } } @@ -502,8 +507,10 @@ static int64_t DetectTargets() { #if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16) const CapBits hw2 = getauxval(AT_HWCAP2); - const int64_t kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP; - if ((hw & kGroupF16Dot) == kGroupF16Dot && (hw2 & HWCAP2_BF16)) { + constexpr CapBits kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP; + constexpr CapBits kGroupBF16 = HWCAP2_BF16 | HWCAP2_I8MM; + if ((hw & kGroupF16Dot) == kGroupF16Dot && + (hw2 & kGroupBF16) == kGroupBF16) { bits |= HWY_NEON_BF16; } #endif // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16 @@ -522,8 +529,13 @@ static int64_t DetectTargets() { #ifndef HWCAP2_SVEAES #define HWCAP2_SVEAES (1 << 2) #endif +#ifndef HWCAP2_SVEI8MM +#define HWCAP2_SVEI8MM (1 << 9) +#endif + constexpr CapBits kGroupSVE2 = + HWCAP2_SVE2 | HWCAP2_SVEAES | HWCAP2_SVEI8MM | HWCAP2_I8MM; const CapBits hw2 = getauxval(AT_HWCAP2); - if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) { + if ((hw2 & kGroupSVE2) == kGroupSVE2) { bits |= HWY_SVE2; } diff --git a/third_party/highway/hwy/timer.h b/third_party/highway/hwy/timer.h @@ -232,6 +232,50 @@ static HWY_INLINE Ticks Stop() { } // namespace timer +// Wrapper around Start/Stop that checks whether the CPU supports Stop. 
+class Timer { + public: + Timer() { + char cpu100[100]; + have_timer_stop_ = platform::HaveTimerStop(cpu100); + } + + // Before/After have fences to prevent the measured code 'leaking out'. + timer::Ticks Before() const { return timer::Start(); } + timer::Ticks After() const { + return have_timer_stop_ ? timer::Stop() : timer::Start(); + } + + private: + bool have_timer_stop_; +}; + +static inline double Seconds(timer::Ticks ticks) { + return static_cast<double>(ticks) / platform::InvariantTicksPerSecond(); +} + +// Measures elapsed time since construction, with automatic reset. +class Stopwatch { + public: + explicit Stopwatch(const Timer& timestamps) : timer_(timestamps) { Reset(); } + + timer::Ticks Origin() const { return t0_; } + void Reset() { t0_ = timer_.Before(); } + + // Also resets the start time to the current time to enable reuse without a + // second call to the timer. + timer::Ticks Elapsed() { + const timer::Ticks t1 = timer_.After(); + const timer::Ticks elapsed = t1 - t0_; + t0_ = t1; + return elapsed; + } + + private: + const Timer& timer_; + timer::Ticks t0_; +}; + } // namespace hwy #endif // HIGHWAY_HWY_TIMER_H_