differ_vector_sse2.cc (3219B)
1 /* 2 * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/desktop_capture/differ_vector_sse2.h" 12 13 #include <cstdint> 14 15 #if defined(_MSC_VER) 16 #include <intrin.h> 17 #else 18 #include <emmintrin.h> 19 #include <mmintrin.h> 20 #endif 21 22 namespace webrtc { 23 24 extern bool VectorDifference_SSE2_W16(const uint8_t* image1, 25 const uint8_t* image2) { 26 __m128i acc = _mm_setzero_si128(); 27 __m128i v0; 28 __m128i v1; 29 __m128i sad; 30 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); 31 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); 32 v0 = _mm_loadu_si128(i1); 33 v1 = _mm_loadu_si128(i2); 34 sad = _mm_sad_epu8(v0, v1); 35 acc = _mm_adds_epu16(acc, sad); 36 v0 = _mm_loadu_si128(i1 + 1); 37 v1 = _mm_loadu_si128(i2 + 1); 38 sad = _mm_sad_epu8(v0, v1); 39 acc = _mm_adds_epu16(acc, sad); 40 v0 = _mm_loadu_si128(i1 + 2); 41 v1 = _mm_loadu_si128(i2 + 2); 42 sad = _mm_sad_epu8(v0, v1); 43 acc = _mm_adds_epu16(acc, sad); 44 v0 = _mm_loadu_si128(i1 + 3); 45 v1 = _mm_loadu_si128(i2 + 3); 46 sad = _mm_sad_epu8(v0, v1); 47 acc = _mm_adds_epu16(acc, sad); 48 49 // This essential means sad = acc >> 64. We only care about the lower 16 50 // bits. 51 sad = _mm_shuffle_epi32(acc, 0xEE); 52 sad = _mm_adds_epu16(sad, acc); 53 return _mm_cvtsi128_si32(sad) != 0; 54 } 55 56 extern bool VectorDifference_SSE2_W32(const uint8_t* image1, 57 const uint8_t* image2) { 58 __m128i acc = _mm_setzero_si128(); 59 __m128i v0; 60 __m128i v1; 61 __m128i sad; 62 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); 63 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); 64 v0 = _mm_loadu_si128(i1); 65 v1 = _mm_loadu_si128(i2); 66 sad = _mm_sad_epu8(v0, v1); 67 acc = _mm_adds_epu16(acc, sad); 68 v0 = _mm_loadu_si128(i1 + 1); 69 v1 = _mm_loadu_si128(i2 + 1); 70 sad = _mm_sad_epu8(v0, v1); 71 acc = _mm_adds_epu16(acc, sad); 72 v0 = _mm_loadu_si128(i1 + 2); 73 v1 = _mm_loadu_si128(i2 + 2); 74 sad = _mm_sad_epu8(v0, v1); 75 acc = _mm_adds_epu16(acc, sad); 76 v0 = _mm_loadu_si128(i1 + 3); 77 v1 = _mm_loadu_si128(i2 + 3); 78 sad = _mm_sad_epu8(v0, v1); 79 acc = _mm_adds_epu16(acc, sad); 80 v0 = _mm_loadu_si128(i1 + 4); 81 v1 = _mm_loadu_si128(i2 + 4); 82 sad = _mm_sad_epu8(v0, v1); 83 acc = _mm_adds_epu16(acc, sad); 84 v0 = _mm_loadu_si128(i1 + 5); 85 v1 = _mm_loadu_si128(i2 + 5); 86 sad = _mm_sad_epu8(v0, v1); 87 acc = _mm_adds_epu16(acc, sad); 88 v0 = _mm_loadu_si128(i1 + 6); 89 v1 = _mm_loadu_si128(i2 + 6); 90 sad = _mm_sad_epu8(v0, v1); 91 acc = _mm_adds_epu16(acc, sad); 92 v0 = _mm_loadu_si128(i1 + 7); 93 v1 = _mm_loadu_si128(i2 + 7); 94 sad = _mm_sad_epu8(v0, v1); 95 acc = _mm_adds_epu16(acc, sad); 96 97 // This essential means sad = acc >> 64. We only care about the lower 16 98 // bits. 99 sad = _mm_shuffle_epi32(acc, 0xEE); 100 sad = _mm_adds_epu16(sad, acc); 101 return _mm_cvtsi128_si32(sad) != 0; 102 } 103 104 } // namespace webrtc