commit 5ce1cf127034ed8ac82ae2910cd2865aae2aedcc parent eb8e7f929baf5979ed6defb9986b3f45c4a9e747 Author: Updatebot <updatebot@mozilla.com> Date: Fri, 21 Nov 2025 21:28:59 +0000 Bug 1999339 - Update libvpx to 9a7674e1a83d1261a49776c8794b87c9bccc85d7 r=chunmin Differential Revision: https://phabricator.services.mozilla.com/D272057 Diffstat:
23 files changed, 1130 insertions(+), 345 deletions(-)
diff --git a/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc @@ -69,6 +69,7 @@ #include <algorithm> #include <memory> +#include "third_party/nalloc/nalloc.h" #include "vpx/vp8dx.h" #include "vpx/vpx_decoder.h" #include "vpx_ports/mem_ops.h" @@ -85,25 +86,40 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { if (size <= IVF_FILE_HDR_SZ) { return 0; } + nalloc_init(nullptr); vpx_codec_ctx_t codec; // Set thread count in the range [1, 64]. const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; vpx_codec_dec_cfg_t cfg = { threads, 0, 0 }; - if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, 0)) { + vpx_codec_flags_t flags = 0; + if ((data[IVF_FILE_HDR_SZ] & 0x40) != 0) { + flags |= VPX_CODEC_USE_POSTPROC; + } + vpx_codec_err_t err = + vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags); + if (err == VPX_CODEC_INCAPABLE) { + // vpx_codec_dec_init may fail with VPX_CODEC_USE_POSTPROC + // if the library is configured with --disable-postproc. + flags = 0; + if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags)) { + return 0; + } + } else if (err != 0) { return 0; } + nalloc_start(data, size); + if (threads > 1) { const int enable = (data[IVF_FILE_HDR_SZ] & 0xa0) != 0; - const vpx_codec_err_t err = - vpx_codec_control(&codec, VP9D_SET_LOOP_FILTER_OPT, enable); - static_cast<void>(err); + err = vpx_codec_control(&codec, VP9D_SET_LOOP_FILTER_OPT, enable); } data += IVF_FILE_HDR_SZ; size -= IVF_FILE_HDR_SZ; + int frame_cnt = 0; while (size > IVF_FRAME_HDR_SZ) { size_t frame_size = mem_get_le32(data); size -= IVF_FRAME_HDR_SZ; @@ -112,9 +128,20 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { vpx_codec_stream_info_t stream_info; stream_info.sz = sizeof(stream_info); - vpx_codec_err_t err = vpx_codec_peek_stream_info(VPXD_INTERFACE(DECODER), - data, size, &stream_info); - static_cast<void>(err); + err = vpx_codec_peek_stream_info(VPXD_INTERFACE(DECODER), data, size, + &stream_info); + + ++frame_cnt; + if (flags & VPX_CODEC_USE_POSTPROC) { + if (frame_cnt % 16 == 4) { + vp8_postproc_cfg_t pp = { 0, 0, 0 }; + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail; + } else if (frame_cnt % 16 == 12) { + vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, + 0 }; + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail; + } + } err = vpx_codec_decode(&codec, data, frame_size, nullptr, 0); static_cast<void>(err); @@ -125,6 +152,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += frame_size; size -= frame_size; } +fail: vpx_codec_destroy(&codec); + nalloc_end(); return 0; } diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc @@ -1816,6 +1816,18 @@ WRAP12TAP(convolve12_vert_neon, 12) WRAP12TAP(convolve12_neon, 12) #endif // HAVE_NEON +#if HAVE_SVE2 +WRAP12TAP(convolve12_horiz_sve2, 8) +WRAP12TAP(convolve12_vert_sve2, 8) +WRAP12TAP(convolve12_sve2, 8) +WRAP12TAP(convolve12_horiz_sve2, 10) +WRAP12TAP(convolve12_vert_sve2, 10) +WRAP12TAP(convolve12_sve2, 10) +WRAP12TAP(convolve12_horiz_sve2, 12) +WRAP12TAP(convolve12_vert_sve2, 12) +WRAP12TAP(convolve12_sve2, 12) +#endif // HAVE_SVE2 + WRAP12TAP(convolve12_horiz_c, 8) WRAP12TAP(convolve12_vert_c, 8) WRAP12TAP(convolve12_c, 8) @@ -2180,6 +2192,29 @@ const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2), ALL_SIZES(convolve12_sve2) }; INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_sve2)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +const ConvolveFunctions12Tap convolve12tap_8bit_sve2( + wrap_convolve12_horiz_sve2_8, wrap_convolve12_vert_sve2_8, + wrap_convolve12_sve2_8, 8); + +const ConvolveFunctions12Tap convolve12tap_10bit_sve2( + wrap_convolve12_horiz_sve2_10, wrap_convolve12_vert_sve2_10, + wrap_convolve12_sve2_10, 10); + +const ConvolveFunctions12Tap convolve12tap_12bit_sve2( + wrap_convolve12_horiz_sve2_12, wrap_convolve12_vert_sve2_12, + wrap_convolve12_sve2_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_sve2[] = { + ALL_SIZES_12TAP(convolve12tap_8bit_sve2), + ALL_SIZES_12TAP(convolve12tap_10bit_sve2), + ALL_SIZES_12TAP(convolve12tap_12bit_sve2) +}; + +INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_sve2)); +#endif // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SVE2 diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc @@ -1786,11 +1786,11 @@ TEST(EncodeAPI, Buganizer441668134) { // Encode a few frames, with realtime mode and tile_rows set to 1, // with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in -// function write_modes()), as in the issue:42105459. In this test it happens on -// very first encoded frame since lag_in_frames = 0. Issue is due to enabling -// TILE_ROWS: passes if tile_rows is disabled (set to 0), or if height is above -// 64 (so both row-tiles are non-empty). -TEST(EncodeAPI, DISABLED_Buganizer442105459) { +// function write_modes()), as in the issue:442105459. In this test it happens +// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling +// TILE_ROWS, with number of tile_rows more than the number of superblocks. +// This test sets 2 tile_rows with height corresponding to 1 superblock (sb). +TEST(EncodeAPI, Buganizer442105459_2RowTiles) { // Initialize VP9 encoder interface vpx_codec_iface_t *iface = vpx_codec_vp9_cx(); // Get default encoder configuration @@ -1798,7 +1798,7 @@ TEST(EncodeAPI, DISABLED_Buganizer442105459) { ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); // Configure encoder cfg.g_w = 946u; - cfg.g_h = 64u; + cfg.g_h = 64u; // 1 sb row, 2 tile_rows set below. cfg.g_threads = 1; cfg.g_profile = 0; cfg.g_bit_depth = VPX_BITS_8; @@ -1815,16 +1815,99 @@ TEST(EncodeAPI, DISABLED_Buganizer442105459) { // Set control parameters vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5); vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 1); + vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1); vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1); // Image format selection vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420; // Allocate image with varied alignment vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1); + for (unsigned int y = 0; y < img->d_h; y++) { + for (unsigned int x = 0; x < img->d_w; x++) { + img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF; + } + } + const unsigned int uv_height = (img->d_h + 1) >> 1; + for (int i : { VPX_PLANE_U, VPX_PLANE_V }) { + memset(img->planes[i], 0, img->stride[i] * uv_height); + } + // Encode with dynamic configuration changes + int num_frames = 2; + // Per-frame constants captured from the original run (indices consumed per + // frame) + const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL }; + const unsigned long frame_durations[] = { 33333UL, 33333UL }; + const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME, + VPX_DL_REALTIME }; + for (int frame = 0; frame < num_frames; frame++) { + // Encode frame + vpx_codec_pts_t pts = frame * frame_pts_mul[frame]; + unsigned long duration = frame_durations[frame]; + vpx_enc_deadline_t deadline = frame_deadlines[frame]; + ASSERT_EQ(vpx_codec_encode(&ctx, img, pts, duration, /*flags*/ 0, deadline), + VPX_CODEC_OK); + } + // Flush encoder. + ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0); + // Get remaining data + vpx_codec_iter_t iter = nullptr; + while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) { + // Process remaining packets + } + vpx_img_free(img); + vpx_codec_destroy(&ctx); +} + +// Encode a few frames, with realtime mode and tile_rows set to 1, +// with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in +// function write_modes()), as in the issue:442105459. In this test it happens +// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling +// TILE_ROWS, with number of tile_rows more than the number of superblocks. +// This test sets 4 tile_rows with height corresponding to 3 superblocks. +TEST(EncodeAPI, Buganizer442105459_4RowTiles) { + // Initialize VP9 encoder interface + vpx_codec_iface_t *iface = vpx_codec_vp9_cx(); + // Get default encoder configuration + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Configure encoder + cfg.g_w = 946u; + cfg.g_h = 192u; // 3 sb rows, 4 tile_rows set below. + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_bit_depth = VPX_BITS_8; + // Rate control targeting deeper encoding paths + cfg.rc_target_bitrate = 100; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.ss_number_layers = 1; + cfg.g_lag_in_frames = 0; + // Initialize encoder context + vpx_codec_ctx_t ctx; + ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, 0), VPX_CODEC_OK); + // Set control parameters + vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5); + vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 2); + vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1); + vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1); + // Image format selection + vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420; + // Allocate image with varied alignment + vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1); + for (unsigned int y = 0; y < img->d_h; y++) { + for (unsigned int x = 0; x < img->d_w; x++) { + img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF; + } + } + const unsigned int uv_height = (img->d_h + 1) >> 1; + for (int i : { VPX_PLANE_U, VPX_PLANE_V }) { + memset(img->planes[i], 0, img->stride[i] * uv_height); + } // Encode with dynamic configuration changes int num_frames = 2; // Per-frame constants captured from the original run (indices consumed per // frame) - const unsigned long frame_pts_mul[] = { 33333UL, 33333UL }; + const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL }; const unsigned long frame_durations[] = { 33333UL, 33333UL }; const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME, VPX_DL_REALTIME }; @@ -1837,10 +1920,10 @@ TEST(EncodeAPI, DISABLED_Buganizer442105459) { VPX_CODEC_OK); } // Flush encoder. - ASSERT_EQ(vpx_codec_encode(&ctx, NULL, 0, 0, 0, VPX_DL_REALTIME), 0); + ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0); // Get remaining data - vpx_codec_iter_t iter = NULL; - while (vpx_codec_get_cx_data(&ctx, &iter) != NULL) { + vpx_codec_iter_t iter = nullptr; + while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) { // Process remaining packets } vpx_img_free(img); diff --git a/media/libvpx/libvpx/third_party/nalloc/LICENSE b/media/libvpx/libvpx/third_party/nalloc/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Catena cyber + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/media/libvpx/libvpx/third_party/nalloc/README.libvpx b/media/libvpx/libvpx/third_party/nalloc/README.libvpx @@ -0,0 +1,11 @@ +Name: nalloc +URL: https://github.com/catenacyber/nallocfuzz +Version: dc351a94bbded5ede5b7550d6d08e78e0cc6dcef +License: MIT +License File: LICENSE + +Description: +Nalloc is a tool to inject allocation failures while fuzzing. + +Local Modifications: +None diff --git a/media/libvpx/libvpx/third_party/nalloc/nalloc.h b/media/libvpx/libvpx/third_party/nalloc/nalloc.h @@ -0,0 +1,330 @@ +/* + MIT License + + Copyright (c) 2025 Catena cyber + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef NALLOC_H_ +#define NALLOC_H_ + +#if defined(__clang__) && defined(__has_feature) +#if __has_feature(address_sanitizer) +#define NALLOC_ASAN 1 +#endif +#endif + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static const uint32_t nalloc_crc32_table[] = { + 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, + 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, + 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, + 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, + 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, + 0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, + 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef, + 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, + 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, + 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, + 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, + 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, + 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, + 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, + 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, + 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, + 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, + 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, + 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, + 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, + 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, + 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, + 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, + 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, + 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, + 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, + 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, + 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, + 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, + 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, + 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, + 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, + 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, + 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, + 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, + 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 +}; + +// Nallocfuzz data to take a decision +uint32_t nalloc_random_state = 0; +__thread unsigned int nalloc_running = 0; +bool nalloc_initialized = false; +uint32_t nalloc_runs = 0; + +// Nalloc fuzz parameters +uint32_t nalloc_bitmask = 0xFF; +bool nalloc_random_bitmask = true; +uint32_t nalloc_magic = 0x294cee63; +bool nalloc_verbose = false; + +#ifdef NALLOC_ASAN +extern void __sanitizer_print_stack_trace(void); +#endif + +// Generic init, using env variables to get parameters +void nalloc_init(const char *prog) { + if (nalloc_initialized) { + return; + } + nalloc_initialized = true; + char *bitmask = getenv("NALLOC_FREQ"); + if (bitmask) { + int shift = atoi(bitmask); + if (shift > 0 && shift < 31) { + nalloc_bitmask = 1 << shift; + nalloc_random_bitmask = false; + } else if (shift == 0) { + nalloc_random_bitmask = false; + nalloc_bitmask = 0; + } + } else if (prog == NULL || strstr(prog, "nalloc") == NULL) { + nalloc_random_bitmask = false; + nalloc_bitmask = 0; + return; + } + + char *magic = getenv("NALLOC_MAGIC"); + if (magic) { + nalloc_magic = (uint32_t)strtol(magic, NULL, 0); + } + + char *verbose = getenv("NALLOC_VERBOSE"); + if (verbose) { + nalloc_verbose = true; + } +} + +// add one byte to the CRC +static inline void nalloc_random_update(uint8_t b) { + nalloc_random_state = + ((uint32_t)((uint32_t)nalloc_random_state << 8)) ^ + nalloc_crc32_table[((nalloc_random_state >> 24) ^ b) & 0xFF]; +} + +// Start the failure injections, using a buffer as seed +static int nalloc_start(const uint8_t *data, size_t size) { + if (nalloc_random_bitmask) { + if (nalloc_random_state & 0x10) { + nalloc_bitmask = 0xFFFFFFFF; + } else { + nalloc_bitmask = 1 << (5 + (nalloc_random_state & 0xF)); + } + } else if (nalloc_bitmask == 0) { + // nalloc disabled + return 0; + } + nalloc_random_state = 0; + for (size_t i = 0; i < size; i++) { + nalloc_random_update(data[i]); + } + if (__sync_fetch_and_add(&nalloc_running, 1)) { + __sync_fetch_and_sub(&nalloc_running, 1); + return 0; + } + nalloc_runs++; + return 1; +} + +// Stop the failure injections +static void nalloc_end() { __sync_fetch_and_sub(&nalloc_running, 1); } + +static bool nalloc_backtrace_exclude(size_t size, const char *op) { + if (nalloc_verbose) { + fprintf(stderr, "failed %s(%zu) \n", op, size); +#ifdef NALLOC_ASAN + __sanitizer_print_stack_trace(); +#endif + } + + return false; +} + +// +static bool nalloc_fail(size_t size, const char *op) { + // do not fail before thread init + if (nalloc_runs == 0) { + return false; + } + if (__sync_fetch_and_add(&nalloc_running, 1) != 1) { + // do not fail allocations outside of fuzzer input + // and do not fail inside of this function + __sync_fetch_and_sub(&nalloc_running, 1); + return false; + } + nalloc_random_update((uint8_t)size); + if (size >= 0x100) { + nalloc_random_update((uint8_t)(size >> 8)); + if (size >= 0x10000) { + nalloc_random_update((uint8_t)(size >> 16)); + // bigger may already fail or oom + } + } + if (((nalloc_random_state ^ nalloc_magic) & nalloc_bitmask) == 0) { + if (nalloc_backtrace_exclude(size, op)) { + __sync_fetch_and_sub(&nalloc_running, 1); + return false; + } + __sync_fetch_and_sub(&nalloc_running, 1); + return true; + } + __sync_fetch_and_sub(&nalloc_running, 1); + return false; +} + +// ASAN interceptor for libc routines +#ifdef NALLOC_ASAN +extern void *__interceptor_malloc(size_t); +extern void *__interceptor_calloc(size_t, size_t); +extern void *__interceptor_realloc(void *, size_t); +extern void *__interceptor_reallocarray(void *, size_t, size_t); + +extern ssize_t __interceptor_read(int, void *, size_t); +extern ssize_t __interceptor_write(int, const void *, size_t); +extern ssize_t __interceptor_recv(int, void *, size_t, int); +extern ssize_t __interceptor_send(int, const void *, size_t, int); + +#define nalloc_malloc(s) __interceptor_malloc(s) +#define nalloc_calloc(s, n) __interceptor_calloc(s, n) +#define nalloc_realloc(p, s) __interceptor_realloc(p, s) +#define nalloc_reallocarray(p, s, n) __interceptor_reallocarray(p, s, n) + +#define nalloc_read(f, b, s) __interceptor_read(f, b, s) +#define nalloc_write(f, b, s) __interceptor_write(f, b, s) +#define nalloc_recv(f, b, s, x) __interceptor_recv(f, b, s, x) +#define nalloc_send(f, b, s, x) __interceptor_send(f, b, s, x) + +#else +extern void *__libc_malloc(size_t); +extern void *__libc_calloc(size_t, size_t); +extern void *__libc_realloc(void *, size_t); +extern void *__libc_reallocarray(void *, size_t, size_t); + +extern ssize_t __read(int, void *, size_t); +extern ssize_t __write(int, const void *, size_t); +extern ssize_t __recv(int, void *, size_t, int); +extern ssize_t __send(int, const void *, size_t, int); + +#define nalloc_malloc(s) __libc_malloc(s) +#define nalloc_calloc(s, n) __libc_calloc(s, n) +#define nalloc_realloc(p, s) __libc_realloc(p, s) +#define nalloc_reallocarray(p, s, n) __libc_reallocarray(p, s, n) + +#define nalloc_read(f, b, s) __read(f, b, s) +#define nalloc_write(f, b, s) __write(f, b, s) +#define nalloc_recv(f, b, s, x) __recv(f, b, s, x) +#define nalloc_send(f, b, s, x) __send(f, b, s, x) +#endif + +// nalloc standard function overwrites with pseudo-random failures +ssize_t read(int fd, void *buf, size_t count) { + if (nalloc_fail(count, "read")) { + errno = EIO; + return -1; + } + return nalloc_read(fd, buf, count); +} + +ssize_t write(int fd, const void *buf, size_t count) { + if (nalloc_fail(count, "write")) { + errno = EIO; + return -1; + } + return nalloc_write(fd, buf, count); +} + +ssize_t recv(int fd, void *buf, size_t count, int flags) { + if (nalloc_fail(count, "recv")) { + errno = EIO; + return -1; + } + return nalloc_recv(fd, buf, count, flags); +} + +ssize_t send(int fd, const void *buf, size_t count, int flags) { + if (nalloc_fail(count, "send")) { + errno = EIO; + return -1; + } + return nalloc_send(fd, buf, count, flags); +} + +void *calloc(size_t nmemb, size_t size) { + if (nalloc_fail(size, "calloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_calloc(nmemb, size); +} + +void *malloc(size_t size) { + if (nalloc_fail(size, "malloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_malloc(size); +} + +void *realloc(void *ptr, size_t size) { + if (nalloc_fail(size, "realloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_realloc(ptr, size); +} + +void *reallocarray(void *ptr, size_t nmemb, size_t size) { + if (nalloc_fail(size, "reallocarray")) { + errno = ENOMEM; + return NULL; + } + return nalloc_reallocarray(ptr, nmemb, size); +} + +#ifdef __cplusplus +} // extern "C" { +#endif + +#endif // NALLOC_H_ diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -1022,19 +1022,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, - &sd, dst_time_stamp, dst_end_time_stamp)) { - VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; - res = update_error_state(ctx, &cpi->common.error); - } + if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { + VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; + res = update_error_state(ctx, &cpi->common.error); } /* reset for next frame */ diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -206,13 +206,13 @@ if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_convolve12_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve12_vert ssse3 avx2 neon/; + specialize qw/vpx_highbd_convolve12_vert ssse3 avx2 neon sve2/; add_proto qw/void vpx_highbd_convolve12_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve12_horiz ssse3 avx2 neon/; + specialize qw/vpx_highbd_convolve12_horiz ssse3 avx2 neon sve2/; add_proto qw/void vpx_highbd_convolve12/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve12 ssse3 avx2 neon/; + specialize qw/vpx_highbd_convolve12 ssse3 avx2 neon sve2/; } } diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <arm_neon_sve_bridge.h> +#include <arm_sve.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, + 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, + 6, 7, 0, 1, 7, 0, 1, 2, + // clang-format on +}; + +static INLINE uint16x8_t highbd_convolve12_8_h( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t filter_0_7, const int16x8_t filter_4_11, + const uint16x8x4_t perm_tbl, const uint16x8_t max) { + int16x8_t perm_samples[8]; + + perm_samples[0] = vpx_tbl_s16(s0, perm_tbl.val[0]); + perm_samples[1] = vpx_tbl_s16(s0, perm_tbl.val[1]); + perm_samples[2] = vpx_tbl2_s16(s0, s1, perm_tbl.val[2]); + perm_samples[3] = vpx_tbl2_s16(s0, s1, perm_tbl.val[3]); + perm_samples[4] = vpx_tbl_s16(s1, perm_tbl.val[0]); + perm_samples[5] = vpx_tbl_s16(s1, perm_tbl.val[1]); + perm_samples[6] = vpx_tbl2_s16(s1, s2, perm_tbl.val[2]); + perm_samples[7] = vpx_tbl2_s16(s1, s2, perm_tbl.val[3]); + + int64x2_t sum01 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[0], filter_0_7, 0); + sum01 = vpx_dotq_lane_s16(sum01, perm_samples[2], filter_0_7, 1); + sum01 = vpx_dotq_lane_s16(sum01, perm_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[1], filter_0_7, 0); + sum23 = vpx_dotq_lane_s16(sum23, perm_samples[3], filter_0_7, 1); + sum23 = vpx_dotq_lane_s16(sum23, perm_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[2], filter_0_7, 0); + sum45 = vpx_dotq_lane_s16(sum45, perm_samples[4], filter_0_7, 1); + sum45 = vpx_dotq_lane_s16(sum45, perm_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[3], filter_0_7, 0); + sum67 = vpx_dotq_lane_s16(sum67, perm_samples[5], filter_0_7, 1); + sum67 = vpx_dotq_lane_s16(sum67, perm_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +void vpx_highbd_convolve12_horiz_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (x_step_q4 != 16) { + vpx_highbd_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x8_t filter_4_11 = vld1q_s16(filter[x0_q4] + 4); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdPermuteTbl); + + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + permute_tbl.val[2] = vsetq_lane_u16(svcnth(), permute_tbl.val[2], 7); + permute_tbl.val[3] = vsetq_lane_u16(svcnth(), permute_tbl.val[3], 5); + uint16x8_t permute_tbl_3_offsets = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + permute_tbl.val[3] = + vaddq_u16(permute_tbl.val[3], permute_tbl_3_offsets); // 2, 3, 6, 7 + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[3], s1[3]; + + load_s16_8x3(s + 0 * src_stride, 8, &s0[0], &s0[1], &s0[2]); + load_s16_8x3(s + 1 * src_stride, 8, &s1[0], &s1[1], &s1[2]); + + uint16x8_t d0 = highbd_convolve12_8_h(s0[0], s0[1], s0[2], filter_0_7, + filter_4_11, permute_tbl, max); + uint16x8_t d1 = highbd_convolve12_8_h(s1[0], s1[1], s1[2], filter_0_7, + filter_4_11, permute_tbl, max); + + vst1q_u16(d + 0 * dst_stride, d0); + vst1q_u16(d + 1 * dst_stride, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); +} + +static INLINE uint16x4_t highbd_convolve12_4_v(const int16x8_t s0[2], + const int16x8_t s1[2], + const int16x8_t s2[2], + const int16x8_t filter_0_7, + const int16x8_t filter_4_11, + const uint16x4_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); + sum01 = vpx_dotq_lane_s16(sum01, s1[0], filter_0_7, 1); + sum01 = vpx_dotq_lane_s16(sum01, s2[0], filter_4_11, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); + sum23 = vpx_dotq_lane_s16(sum23, s1[1], filter_0_7, 1); + sum23 = vpx_dotq_lane_s16(sum23, s2[1], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +void vpx_highbd_convolve12_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (y_step_q4 != 16) { + vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]); + const int16x8_t filter_4_11 = vld1q_s16(filter[y0_q4] + 4); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); + transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]); + transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]); + transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]); + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_s16_4x4(s8, s9, sA, sB, &s89AB[0], &s89AB[1]); + transpose_concat_s16_4x4(s9, sA, sB, sC, &s9ABC[0], &s9ABC[1]); + transpose_concat_s16_4x4(sA, sB, sC, sD, &sABCD[0], &sABCD[1]); + transpose_concat_s16_4x4(sB, sC, sD, sE, &sBCDE[0], &sBCDE[1]); + + uint16x4_t d0 = highbd_convolve12_4_v(s0123, s4567, s89AB, filter_0_7, + filter_4_11, max); + uint16x4_t d1 = highbd_convolve12_4_v(s1234, s5678, s9ABC, filter_0_7, + filter_4_11, max); + uint16x4_t d2 = highbd_convolve12_4_v(s2345, s6789, sABCD, filter_0_7, + filter_4_11, max); + uint16x4_t d3 = highbd_convolve12_4_v(s3456, s789A, sBCDE, filter_0_7, + filter_4_11, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - reusing as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s8 = sC; + s9 = sD; + sA = sE; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 4; + dst += 4; + w -= 4; + } while (w != 0); +} + +void vpx_highbd_convolve12_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint16_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + MAX_FILTER_TAP; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_highbd_convolve12_horiz_sve2( + src - src_stride * border_offset, src_stride, im_block, im_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd); + + vpx_highbd_convolve12_vert_sve2(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vp9/encoder/vp9_temporal_filter.h" DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { @@ -145,32 +146,6 @@ static INLINE uint8x8_t convolve12_8_v( return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, 04, 05, 06, 07 - // a1: 10, 11, 12, 13, 14, 15, 16, 17 - // a2: 20, 21, 22, 23, 24, 25, 26, 27 - // a3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - - int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); - int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); - int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); - int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); - - int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; - int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; - - int8x16x2_t a0123 = vzipq_s8(a02, a13); - - *b0 = a0123.val[0]; - *b1 = a0123.val[1]; -} - void vpx_convolve12_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, @@ -221,14 +196,14 @@ void vpx_convolve12_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t tB, tC, tD, tE; @@ -241,7 +216,7 @@ void vpx_convolve12_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + transpose_concat_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vp9/encoder/vp9_temporal_filter.h" DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { @@ -139,32 +140,6 @@ static INLINE uint8x8_t convolve12_8_v( return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, 04, 05, 06, 07 - // a1: 10, 11, 12, 13, 14, 15, 16, 17 - // a2: 20, 21, 22, 23, 24, 25, 26, 27 - // a3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - - uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); - uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); - uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); - uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); - - uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; - uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; - - uint8x16x2_t a0123 = vzipq_u8(a02, a13); - - *b0 = a0123.val[0]; - *b1 = a0123.val[1]; -} - void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, @@ -202,14 +177,14 @@ void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t sB, sC, sD, sE; @@ -217,7 +192,7 @@ void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + transpose_concat_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -1420,7 +1420,14 @@ static void set_tile_limits(VP9_COMP *cpi) { cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; + + // Max allowed number of tile_rows is 4 (so log2_tile_rows = 2), and each + // tile_row contains a multiple of superblocks. + const int sb64_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> 3; + const int max_log2_tile_rows = (sb64_rows >= 4) ? 2 + : (sb64_rows >= 2) ? 1 + : 0; + cm->log2_tile_rows = VPXMIN(cpi->oxcf.tile_rows, max_log2_tile_rows); if (cpi->oxcf.target_level == LEVEL_AUTO) { const int level_tile_cols = diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -1478,22 +1478,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, timebase_units_to_ticks(timebase_in_ts, pts_end); res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - // Store the original flags in to the frame buffer. Will extract the - // key frame flag when we actually encode this frame. - if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. + if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, dst_time_stamp, dst_end_time_stamp)) { - res = update_error_state(ctx, &cpi->common.error); - } - ctx->next_frame_flags = 0; + res = update_error_state(ctx, &cpi->common.error); } + ctx->next_frame_flags = 0; } cx_data = ctx->cx_data; diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -129,6 +129,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/highbd_temporal_filter_ssse3.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/highbd_temporal_filter_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_SVE2) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -179,6 +180,7 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filt VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c @@ -36,72 +36,6 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6, 1, 3, 5, 7 }; -static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - int16x8_t res[2]) { - // Transpose 16-bit elements: - // s0: 00, 01, 02, 03 - // s1: 10, 11, 12, 13 - // s2: 20, 21, 22, 23 - // s3: 30, 31, 32, 33 - // - // res[0]: 00 10 20 30 01 11 21 31 - // res[1]: 02 12 22 32 03 13 23 33 - - int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); - int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); - int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); - int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); - - int16x8_t s02 = vzip1q_s16(s0q, s2q); - int16x8_t s13 = vzip1q_s16(s1q, s3q); - - int16x8x2_t s0123 = vzipq_s16(s02, s13); - - res[0] = s0123.val[0]; - res[1] = s0123.val[1]; -} - -static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - int16x8_t res[4]) { - // Transpose 16-bit elements: - // s0: 00, 01, 02, 03, 04, 05, 06, 07 - // s1: 10, 11, 12, 13, 14, 15, 16, 17 - // s2: 20, 21, 22, 23, 24, 25, 26, 27 - // s3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // res[0]: 00 10 20 30 01 11 21 31 - // res[1]: 02 12 22 32 03 13 23 33 - // res[2]: 04 14 24 34 05 15 25 35 - // res[3]: 06 16 26 36 07 17 27 37 - - int16x8x2_t s02 = vzipq_s16(s0, s2); - int16x8x2_t s13 = vzipq_s16(s1, s3); - - int16x8x2_t s0123_lo = vzipq_s16(s02.val[0], s13.val[0]); - int16x8x2_t s0123_hi = vzipq_s16(s02.val[1], s13.val[1]); - - res[0] = s0123_lo.val[0]; - res[1] = s0123_lo.val[1]; - res[2] = s0123_hi.val[0]; - res[3] = s0123_hi.val[1]; -} - -static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4], - int16x8_t res[4], uint16x8_t idx) { - res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); - res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); - res[2] = vpx_tbl2_s16(s0[2], s1[2], idx); - res[3] = vpx_tbl2_s16(s0[3], s1[3], idx); -} - -static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2], - int16x8_t res[2], uint16x8_t idx) { - res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); - res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); -} - static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2], int16x8_t s_hi[2], int16x8_t filter, @@ -169,10 +103,10 @@ static INLINE void highbd_convolve8_8tap_vert_sve2( s += 7 * src_stride; int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; - transpose_concat_4x4(s0, s1, s2, s3, s0123); - transpose_concat_4x4(s1, s2, s3, s4, s1234); - transpose_concat_4x4(s2, s3, s4, s5, s2345); - transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); do { int16x4_t s7, s8, s9, sA; @@ -180,7 +114,7 @@ static INLINE void highbd_convolve8_8tap_vert_sve2( load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; - transpose_concat_4x4(s7, s8, s9, sA, s789A); + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); @@ -219,17 +153,22 @@ static INLINE void highbd_convolve8_8tap_vert_sve2( s += 7 * src_stride; int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; - transpose_concat_8x4(s0, s1, s2, s3, s0123); - transpose_concat_8x4(s1, s2, s3, s4, s1234); - transpose_concat_8x4(s2, s3, s4, s5, s2345); - transpose_concat_8x4(s3, s4, s5, s6, s3456); + transpose_concat_s16_8x4(s0, s1, s2, s3, &s0123[0], &s0123[1], &s0123[2], + &s0123[3]); + transpose_concat_s16_8x4(s1, s2, s3, s4, &s1234[0], &s1234[1], &s1234[2], + &s1234[3]); + transpose_concat_s16_8x4(s2, s3, s4, s5, &s2345[0], &s2345[1], &s2345[2], + &s2345[3]); + transpose_concat_s16_8x4(s3, s4, s5, s6, &s3456[0], &s3456[1], &s3456[2], + &s3456[3]); do { int16x8_t s7, s8, s9, sA; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); int16x8_t s4567[4], s5678[5], s6789[4], s789A[4]; - transpose_concat_8x4(s7, s8, s9, sA, s789A); + transpose_concat_s16_8x4(s7, s8, s9, sA, &s789A[0], &s789A[1], + &s789A[2], &s789A[3]); vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); @@ -343,10 +282,10 @@ void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, s += 7 * src_stride; int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; - transpose_concat_4x4(s0, s1, s2, s3, s0123); - transpose_concat_4x4(s1, s2, s3, s4, s1234); - transpose_concat_4x4(s2, s3, s4, s5, s2345); - transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); do { int16x4_t s7, s8, s9, sA; @@ -354,7 +293,7 @@ void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; - transpose_concat_4x4(s7, s8, s9, sA, s789A); + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); @@ -398,17 +337,22 @@ void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, s += 7 * src_stride; int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; - transpose_concat_8x4(s0, s1, s2, s3, s0123); - transpose_concat_8x4(s1, s2, s3, s4, s1234); - transpose_concat_8x4(s2, s3, s4, s5, s2345); - transpose_concat_8x4(s3, s4, s5, s6, s3456); + transpose_concat_s16_8x4(s0, s1, s2, s3, &s0123[0], &s0123[1], &s0123[2], + &s0123[3]); + transpose_concat_s16_8x4(s1, s2, s3, s4, &s1234[0], &s1234[1], &s1234[2], + &s1234[3]); + transpose_concat_s16_8x4(s2, s3, s4, s5, &s2345[0], &s2345[1], &s2345[2], + &s2345[3]); + transpose_concat_s16_8x4(s3, s4, s5, s6, &s3456[0], &s3456[1], &s3456[2], + &s3456[3]); do { int16x8_t s7, s8, s9, sA; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); int16x8_t s4567[4], s5678[5], s6789[4], s789A[4]; - transpose_concat_8x4(s7, s8, s9, sA, s789A); + transpose_concat_s16_8x4(s7, s8, s9, sA, &s789A[0], &s789A[1], + &s789A[2], &s789A[3]); vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -40,11 +40,11 @@ static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) { static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, const int16_t c2, const int16_t c3) { return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | - ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48)); + ((uint64_t)(uint16_t)c2 << 32) | ((uint64_t)c3 << 48)); } static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) { - return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32)); + return vcreate_s32((uint32_t)c0 | ((uint64_t)(uint32_t)c1 << 32)); } static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1, @@ -499,6 +499,34 @@ static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p, *s3 = vld1_s16(s); } +static INLINE void load_s16_4x11(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6, int16x4_t *s7, int16x4_t *s8, + int16x4_t *s9, int16x4_t *s10) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); + s += p; + *s8 = vld1_s16(s); + s += p; + *s9 = vld1_s16(s); + s += p; + *s10 = vld1_s16(s); +} + static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3) { diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -1549,4 +1549,158 @@ static INLINE void load_and_transpose_s32_8x8( transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); } + +static INLINE void transpose_concat_s16_4x4(const int16x4_t a0, + const int16x4_t a1, + const int16x4_t a2, + const int16x4_t a3, int16x8_t *b0, + int16x8_t *b1) { + // Transpose 16-bit elements: + // a0: 00, 01, 02, 03 + // a1: 10, 11, 12, 13 + // a2: 20, 21, 22, 23 + // a3: 30, 31, 32, 33 + // + // b0: 00 10 20 30 01 11 21 31 + // b1: 02 12 22 32 03 13 23 33 + + int16x8_t a0q = vcombine_s16(a0, vdup_n_s16(0)); + int16x8_t a1q = vcombine_s16(a1, vdup_n_s16(0)); + int16x8_t a2q = vcombine_s16(a2, vdup_n_s16(0)); + int16x8_t a3q = vcombine_s16(a3, vdup_n_s16(0)); + + int16x8_t a02 = vzipq_s16(a0q, a2q).val[0]; + int16x8_t a13 = vzipq_s16(a1q, a3q).val[0]; + + int16x8x2_t a0123 = vzipq_s16(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_s16_8x4(const int16x8_t a0, + const int16x8_t a1, + const int16x8_t a2, + const int16x8_t a3, int16x8_t *b0, + int16x8_t *b1, int16x8_t *b2, + int16x8_t *b3) { + // Transpose 16-bit elements: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00 10 20 30 01 11 21 31 + // b1: 02 12 22 32 03 13 23 33 + // b2: 04 14 24 34 05 15 25 35 + // b3: 06 16 26 36 07 17 27 37 + + int16x8x2_t a02 = vzipq_s16(a0, a2); + int16x8x2_t a13 = vzipq_s16(a1, a3); + + int16x8x2_t a0123_lo = vzipq_s16(a02.val[0], a13.val[0]); + int16x8x2_t a0123_hi = vzipq_s16(a02.val[1], a13.val[1]); + + *b0 = a0123_lo.val[0]; + *b1 = a0123_lo.val[1]; + *b2 = a0123_hi.val[0]; + *b3 = a0123_hi.val[1]; +} + +static INLINE void transpose_concat_s8_8x4(int8x8_t a0, int8x8_t a1, + int8x8_t a2, int8x8_t a3, + int8x16_t *b0, int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; + int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; + + int8x16x2_t a0123 = vzipq_s8(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_u8_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; + uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; + + uint8x16x2_t a0123 = vzipq_u8(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_s8_4x4(int8x8_t a0, int8x8_t a1, + int8x8_t a2, int8x8_t a3, + int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; + int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; + + *b = vzipq_s8(a02, a13).val[0]; +} + +static INLINE void transpose_concat_u8_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; + uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; + + *b = vzipq_u8(a02, a13).val[0]; +} + #endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -360,53 +360,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, } } -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, XX, XX, XX, XX - // a1: 10, 11, 12, 13, XX, XX, XX, XX - // a2: 20, 21, 22, 23, XX, XX, XX, XX - // a3: 30, 31, 32, 33, XX, XX, XX, XX - // - // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - - int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); - int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); - int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); - int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); - - int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; - int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; - - *b = vzipq_s8(a02, a13).val[0]; -} - -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, 04, 05, 06, 07 - // a1: 10, 11, 12, 13, 14, 15, 16, 17 - // a2: 20, 21, 22, 23, 24, 25, 26, 27 - // a3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - - int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); - int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); - int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); - int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); - - int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; - int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; - - int8x16x2_t a0123 = vzipq_s8(a02, a13); - - *b0 = a0123.val[0]; - *b1 = a0123.val[1]; -} - static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, const int8x16_t samples_hi, const int8x8_t filters) { @@ -464,10 +417,10 @@ static INLINE void convolve_8tap_vert_neon_dotprod( // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. int8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; @@ -479,7 +432,7 @@ static INLINE void convolve_8tap_vert_neon_dotprod( int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT = { { s3456, s78910 } }; @@ -531,10 +484,10 @@ static INLINE void convolve_8tap_vert_neon_dotprod( // (see horizontal case) required before computing the dot product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; @@ -546,7 +499,7 @@ static INLINE void convolve_8tap_vert_neon_dotprod( int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; @@ -655,10 +608,10 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. int8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; @@ -670,7 +623,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT = { { s3456, s78910 } }; @@ -728,10 +681,10 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, // (see horizontal case) required before computing the dot product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; @@ -743,7 +696,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -340,54 +340,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, XX, XX, XX, XX - // a1: 10, 11, 12, 13, XX, XX, XX, XX - // a2: 20, 21, 22, 23, XX, XX, XX, XX - // a3: 30, 31, 32, 33, XX, XX, XX, XX - // - // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - - uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); - uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); - uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); - uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); - - uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; - uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; - - *b = vzipq_u8(a02, a13).val[0]; -} - -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1) { - // Transpose 8-bit elements and concatenate result rows as follows: - // a0: 00, 01, 02, 03, 04, 05, 06, 07 - // a1: 10, 11, 12, 13, 14, 15, 16, 17 - // a2: 20, 21, 22, 23, 24, 25, 26, 27 - // a3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - - uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); - uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); - uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); - uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); - - uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; - uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; - - uint8x16x2_t a0123 = vzipq_u8(a02, a13); - - *b0 = a0123.val[0]; - *b1 = a0123.val[1]; -} - static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { @@ -432,17 +384,17 @@ static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src, // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s3456, s78910 } }; @@ -485,17 +437,17 @@ static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src, // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; @@ -594,17 +546,17 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s3456, s78910 } }; @@ -653,17 +605,17 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h @@ -29,4 +29,18 @@ static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1, svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); } +static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4], + int16x8_t res[4], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); + res[2] = vpx_tbl2_s16(s0[2], s1[2], idx); + res[3] = vpx_tbl2_s16(s0[3], s1[3], idx); +} + +static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t res[2], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); +} + #endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h @@ -48,4 +48,9 @@ static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) { svset_neonq_u16(svundef_u16(), indices))); } +static INLINE int16x8_t vpx_tbl_s16(int16x8_t data, uint16x8_t indices) { + return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), data), + svset_neonq_u16(svundef_u16(), indices))); +} + #endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c @@ -16,7 +16,7 @@ #include <sys/sysctl.h> #endif -#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) +#if !CONFIG_RUNTIME_CPU_DETECT static int arm_get_cpu_caps(void) { // This function should actually be a no-op. There is no way to adjust any of @@ -29,7 +29,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT // sysctlbyname() parameter documentation for instruction set characteristics: // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: f32182fc9455d7979236dffca35c8baf232a74ec (Mon Oct 06 20:24:46 2025). + release: 9a7674e1a83d1261a49776c8794b87c9bccc85d7 (Tue Nov 04 19:55:43 2025). # Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: f32182fc9455d7979236dffca35c8baf232a74ec + revision: 9a7674e1a83d1261a49776c8794b87c9bccc85d7 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/