; idctdsp.asm (3918B)
;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_80

SECTION .text

;--------------------------------------------------------------------------
; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                   ptrdiff_t line_size)
;
; Narrows 64 int16 coefficients to bytes with signed saturation, adds the
; byte constant held in m0 to every byte, and stores the result as 8 rows
; of 8 bytes, consecutive rows being line_size bytes apart.
;
; NOTE(review): pb_80 is defined outside this file; presumably it is
; sixteen 0x80 bytes, so the add re-biases the signed bytes into unsigned
; pixel range -- confirm against its definition.
;--------------------------------------------------------------------------

; Emits four output rows.
;   %1 = byte offset into block of the first of the four rows
; Expects m0 = per-byte bias and lsize3q = 3*lsizeq; overwrites m1 and m2.
; All reads from block are issued before any write to pixels.
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova      m1, [blockq+%1]               ; first row of this half
    packsswb  m1, [blockq+%1+mmsize]        ; ... joined by the second row
    mova      m2, [blockq+%1+mmsize*2]      ; third row
    packsswb  m2, [blockq+%1+mmsize*3]      ; ... joined by the fourth row
    paddb     m1, m0
    paddb     m2, m0
    movq      [pixelsq], m1                 ; low  8 bytes -> row +0
    movhps    [pixelsq+lsizeq], m1          ; high 8 bytes -> row +1
    movq      [pixelsq+lsizeq*2], m2        ; low  8 bytes -> row +2
    movhps    [pixelsq+lsize3q], m2         ; high 8 bytes -> row +3
%endmacro

INIT_XMM sse2
cglobal put_signed_pixels_clamped, 3, 4, 3, block, pixels, lsize, lsize3
    lea       lsize3q, [lsizeq+lsizeq*2]    ; lsize3 = 3 * line_size
    mova      m0, [pb_80]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0        ; rows 0-3
    lea       pixelsq, [pixelsq+lsizeq*4]   ; step down four rows
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64       ; rows 4-7
    RET

;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;
; Narrows 64 int16 coefficients to bytes with unsigned saturation and
; stores them as 8 rows of 8 bytes, consecutive rows being line_size
; bytes apart.
;--------------------------------------------------------------------------

; Emits four output rows.
;   %1 = byte offset into block of the first of the four rows
; Expects lsize3q = 3*lsizeq; overwrites m0 and m1.
; All reads from block are issued before any write to pixels.
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova      m0, [blockq+%1]               ; first row of this half
    packuswb  m0, [blockq+%1+mmsize]        ; ... joined by the second row
    mova      m1, [blockq+%1+mmsize*2]      ; third row
    packuswb  m1, [blockq+%1+mmsize*3]      ; ... joined by the fourth row
    movq      [pixelsq], m0                 ; low  8 bytes -> row +0
    movhps    [pixelsq+lsizeq], m0          ; high 8 bytes -> row +1
    movq      [pixelsq+lsizeq*2], m1        ; low  8 bytes -> row +2
    movhps    [pixelsq+lsize3q], m1         ; high 8 bytes -> row +3
%endmacro

INIT_XMM sse2
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea       lsize3q, [lsizeq+lsizeq*2]    ; lsize3 = 3 * line_size
    PUT_PIXELS_CLAMPED_HALF 0               ; rows 0-3
    lea       pixelsq, [pixelsq+lsizeq*4]   ; step down four rows
    PUT_PIXELS_CLAMPED_HALF 64              ; rows 4-7
    RET

;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;
; Adds 64 int16 coefficients to an 8x8 area of existing bytes: each byte
; is widened to 16 bits, summed with its coefficient using signed
; saturation, narrowed back with unsigned saturation and written in place.
;--------------------------------------------------------------------------

; Processes two rows.
;   %1 = byte offset into block of the first of the two rows
; Expects m4 = 0 (supplies the zero high bytes when widening);
; overwrites m0-m3.  All loads are issued before either store.
%macro ADD_PIXELS_CLAMPED 1
    mova      m0, [blockq+%1]               ; coefficients, first row
    movq      m2, [pixelsq]                 ; 8 existing bytes, first row
    punpcklbw m2, m4                        ; bytes -> zero-extended words
    paddsw    m0, m2
    mova      m1, [blockq+%1+mmsize]        ; coefficients, second row
    movq      m3, [pixelsq+lsizeq]          ; 8 existing bytes, second row
    punpcklbw m3, m4
    paddsw    m1, m3
    packuswb  m0, m1                        ; both rows back to bytes
    movq      [pixelsq], m0                 ; low  8 bytes -> first row
    movhps    [pixelsq+lsizeq], m0          ; high 8 bytes -> second row
%endmacro

INIT_XMM sse2
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor      m4, m4                        ; zero used for byte widening
    ; Four row pairs at block offsets 0, 32, 64 and 96; pixelsq advances
    ; two rows between consecutive pairs (not before the first one).
%assign blk_ofs 0
%rep 4
%if blk_ofs
    lea       pixelsq, [pixelsq+lsizeq*2]
%endif
    ADD_PIXELS_CLAMPED blk_ofs
%assign blk_ofs blk_ofs+32
%endrep
%undef blk_ofs
    RET