tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

celt_pitch_xcorr_arm.s (20117B)


      1 ; Copyright (c) 2007-2008 CSIRO
      2 ; Copyright (c) 2007-2009 Xiph.Org Foundation
      3 ; Copyright (c) 2013      Parrot
      4 ; Written by Aurélien Zanelli
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions
      8 ; are met:
      9 ;
     10 ; - Redistributions of source code must retain the above copyright
     11 ; notice, this list of conditions and the following disclaimer.
     12 ;
     13 ; - Redistributions in binary form must reproduce the above copyright
     14 ; notice, this list of conditions and the following disclaimer in the
     15 ; documentation and/or other materials provided with the distribution.
     16 ;
     17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29  AREA  |.text|, CODE, READONLY   ; everything below goes into a read-only code area named .text
     30 
     31  GET    celt/arm/armopts.s   ; textual include; presumably defines the OPUS_ARM_MAY_HAVE_* switches tested below - confirm
     32 
     33 IF OPUS_ARM_MAY_HAVE_EDSP
     34  EXPORT celt_pitch_xcorr_edsp   ; entry point is exported only when the EDSP variant is assembled
     35 ENDIF
     36 
     37 IF OPUS_ARM_MAY_HAVE_NEON
     38  EXPORT celt_pitch_xcorr_neon   ; entry point is exported only when the NEON variant is assembled
     39 ENDIF
     40 
     41 IF OPUS_ARM_MAY_HAVE_NEON
     42 
     43 ; Compute sum[k]+=sum(x[j]*y[j+k],j=0...len-1), k=0...3 (added to the incoming q0)
     44 xcorr_kernel_neon PROC
     45 xcorr_kernel_neon_start
     46  ; input:
     47  ;   r3     = int         len
     48  ;   r4     = opus_val16 *x
     49  ;   r5     = opus_val16 *y
     50  ;   q0     = opus_val32  sum[4]
     51  ; output:
     52  ;   q0     = opus_val32  sum[4]
     53  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 (r4, r5, r12 and the flags are modified)
     54  ; internal usage:
     55  ;   r12 = int j
     56  ;   d3  = y_3|y_2|y_1|y_0
     57  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
     58  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
     59  ;   q8  = scratch
     60  ;
     61  ; Load y[0...3]
     62  ; This requires len>0 to always be valid (which we assert in the C code).
     63  VLD1.16      {d5}, [r5]!
     64  SUBS         r12, r3, #8        ; j = len-8
     65  BLE xcorr_kernel_neon_process4  ; 8 or fewer samples: skip the 8-at-a-time loop
     66 ; Process 8 samples at a time.
     67 ; This loop loads one y value more than we actually need. Therefore we have to
     68 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
     69 ; reading past the end of the array.
     70 xcorr_kernel_neon_process8
     71  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
     72  ; - 2 cycles of ARM instructions,
     73  ; - 10 cycles of load/store/byte permute instructions, and
     74  ; - 9 cycles of data processing instructions.
     75  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
     76  ; latter two categories, meaning the whole loop should run in 10 cycles per
     77  ; iteration, barring cache misses.
     78  ;
     79  ; Load x[0...7]
     80  VLD1.16      {d6, d7}, [r4]!
     81  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
     82  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
     83  VAND         d3, d5, d5         ; d3 = d5, i.e. the current y[0...3]
     84  SUBS         r12, r12, #8       ; j -= 8; these flags are consumed by the BGT closing the loop
     85  ; Load y[4...11]
     86  VLD1.16      {d4, d5}, [r5]!
     87  VMLAL.S16    q0, d3, d6[0]      ; sum[k] += x_0*y[k]
     88  VEXT.16      d16, d3, d4, #1    ; d16 = y[1...4]
     89  VMLAL.S16    q0, d4, d7[0]      ; sum[k] += x_4*y[4+k]
     90  VEXT.16      d17, d4, d5, #1    ; d17 = y[5...8]
     91  VMLAL.S16    q0, d16, d6[1]
     92  VEXT.16      d16, d3, d4, #2
     93  VMLAL.S16    q0, d17, d7[1]
     94  VEXT.16      d17, d4, d5, #2
     95  VMLAL.S16    q0, d16, d6[2]
     96  VEXT.16      d16, d3, d4, #3
     97  VMLAL.S16    q0, d17, d7[2]
     98  VEXT.16      d17, d4, d5, #3
     99  VMLAL.S16    q0, d16, d6[3]
    100  VMLAL.S16    q0, d17, d7[3]
    101  BGT xcorr_kernel_neon_process8  ; loop while j > 0; d5 already holds the next y[0...3]
    102 ; Process 4 samples here if we have > 4 left (still reading one extra y value).
    103 xcorr_kernel_neon_process4
    104  ADDS         r12, r12, #4       ; j = (samples left) - 4
    105  BLE xcorr_kernel_neon_process2
    106  ; Load x[0...3]
    107  VLD1.16      d6, [r4]!
    108  ; Use VAND since it's a data processing instruction again.
    109  VAND         d4, d5, d5         ; d4 = current y[0...3]
    110  SUB          r12, r12, #4       ; keep j = (samples left) - 4 after consuming 4
    111  ; Load y[4...7]
    112  VLD1.16      d5, [r5]!
    113  VMLAL.S16    q0, d4, d6[0]
    114  VEXT.16      d16, d4, d5, #1
    115  VMLAL.S16    q0, d16, d6[1]
    116  VEXT.16      d16, d4, d5, #2
    117  VMLAL.S16    q0, d16, d6[2]
    118  VEXT.16      d16, d4, d5, #3
    119  VMLAL.S16    q0, d16, d6[3]
    120 ; Process 2 samples here if we have > 2 left (still reading one extra y value).
    121 xcorr_kernel_neon_process2
    122  ADDS         r12, r12, #2       ; j = (samples left) - 2
    123  BLE xcorr_kernel_neon_process1
    124  ; Load x[0...1]
    125  VLD2.16      {d6[],d7[]}, [r4]! ; d6 = x_0 in every lane, d7 = x_1 in every lane
    126  ; Use VAND since it's a data processing instruction again.
    127  VAND         d4, d5, d5
    128  SUB          r12, r12, #2       ; keep j = (samples left) - 2 after consuming 2
    129  ; Load y[4...5]
    130  VLD1.32      {d5[]}, [r5]!      ; d5 = y_5|y_4|y_5|y_4
    131  VMLAL.S16    q0, d4, d6
    132  VEXT.16      d16, d4, d5, #1    ; d16 = y[1...4]
    133  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
    134  ; instead of VEXT, since it's a data-processing instruction.
    135  VSRI.64      d5, d4, #32        ; d5 = y_5|y_4|y_3|y_2, the window for the next sample
    136  VMLAL.S16    q0, d16, d7
    137 ; Process 1 sample using the extra y value we loaded above.
    138 xcorr_kernel_neon_process1
    139  ; Load next *x
    140  VLD1.16      {d6[]}, [r4]!
    141  ADDS         r12, r12, #1       ; j = (samples left) - 1; flags consumed by MOVLE below
    142  ; y[0...3] are left in d5 from prior iteration(s) (if any)
    143  VMLAL.S16    q0, d5, d6
    144  MOVLE        pc, lr             ; return if that was the last sample
    145 ; Now process 1 last sample, not reading ahead.
    146  ; Load last *y
    147  VLD1.16      {d4[]}, [r5]!
    148  VSRI.64      d4, d5, #16        ; d4 = (new y)|y_3|y_2|y_1: slide the y window by one sample
    149  ; Load last *x
    150  VLD1.16      {d6[]}, [r4]!
    151  VMLAL.S16    q0, d4, d6
    152  MOV          pc, lr             ; return
    153  ENDP
    154 
    155 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
    156 ;  opus_val32 *xcorr, int len, int max_pitch, int arch)
    157 celt_pitch_xcorr_neon PROC       ; xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1), i=0...max_pitch-1
    158  ; input:
    159  ;   r0  = opus_val16 *_x
    160  ;   r1  = opus_val16 *_y
    161  ;   r2  = opus_val32 *xcorr
    162  ;   r3  = int         len
    163  ; output:
    164  ;   r0  = int         maxcorr (largest xcorr[i] written, but never less than 1)
    165  ; internal usage:
    166  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
    167  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
    168  ;   r6  = int         max_pitch
    169  ;   r12 = int         j
    170  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
    171  ; ignored:
    172  ;         int         arch
    173  STMFD        sp!, {r4-r6, lr}   ; 4 registers = 16 bytes pushed
    174  LDR          r6, [sp, #16]      ; r6 = max_pitch (first stack argument, now 16 bytes above sp)
    175  VMOV.S32     q15, #1            ; maxcorr[0...3] = 1
    176  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
    177  SUBS         r6, r6, #4
    178  BLT celt_pitch_xcorr_neon_process4_done
    179 celt_pitch_xcorr_neon_process4
    180  ; xcorr_kernel_neon parameters:
    181  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
    182  MOV          r4, r0
    183  MOV          r5, r1
    184  VEOR         q0, q0, q0
    185  ; xcorr_kernel_neon only modifies r4, r5, r12, q0, d3, q2, q3, and q8.
    186  ; So we don't save/restore any other registers.
    187  BL xcorr_kernel_neon_start
    188  SUBS         r6, r6, #4         ; max_pitch -= 4; flags consumed by the BGE below
    189  VST1.32      {q0}, [r2]!        ; store the four sums; xcorr += 4
    190  ; _y += 4
    191  ADD          r1, r1, #8
    192  VMAX.S32     q15, q15, q0       ; maxcorr[k] = max(maxcorr[k], sum[k])
    193  ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
    194  BGE celt_pitch_xcorr_neon_process4
    195 ; We have less than 4 sums left to compute.
    196 celt_pitch_xcorr_neon_process4_done
    197  ADDS         r6, r6, #4         ; r6 = number of sums still to compute (0...3)
    198  ; Reduce maxcorr to a single value
    199  VMAX.S32     d30, d30, d31
    200  VPMAX.S32    d30, d30, d30      ; both lanes of d30 = max(maxcorr[0...3])
    201  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
    202  BLE celt_pitch_xcorr_neon_done
    203 ; Now compute each remaining sum one at a time.
    204 celt_pitch_xcorr_neon_process_remaining
    205  MOV          r4, r0
    206  MOV          r5, r1
    207  VMOV.I32     q0, #0
    208  SUBS         r12, r3, #8        ; j = len-8
    209  BLT celt_pitch_xcorr_neon_process_remaining4
    210 ; Sum terms 8 at a time.
    211 celt_pitch_xcorr_neon_process_remaining_loop8
    212  ; Load x[0...7]
    213  VLD1.16      {q1}, [r4]!
    214  ; Load y[0...7]
    215  VLD1.16      {q2}, [r5]!
    216  SUBS         r12, r12, #8
    217  VMLAL.S16    q0, d4, d2         ; four partial sums, one per lane
    218  VMLAL.S16    q0, d5, d3
    219  BGE celt_pitch_xcorr_neon_process_remaining_loop8
    220 ; Sum terms 4 at a time.
    221 celt_pitch_xcorr_neon_process_remaining4
    222  ADDS         r12, r12, #4       ; j = (terms left) - 4
    223  BLT celt_pitch_xcorr_neon_process_remaining4_done
    224  ; Load x[0...3]
    225  VLD1.16      {d2}, [r4]!
    226  ; Load y[0...3]
    227  VLD1.16      {d3}, [r5]!
    228  SUB          r12, r12, #4       ; keep j = (terms left) - 4 after consuming 4
    229  VMLAL.S16    q0, d3, d2
    230 celt_pitch_xcorr_neon_process_remaining4_done
    231  ; Reduce the sum to a single value.
    232  VADD.S32     d0, d0, d1
    233  VPADDL.S32   d0, d0             ; d0[0] = low 32 bits of the total; only lane 0 is used below
    234  ADDS         r12, r12, #4       ; j = terms left (0...3)
    235  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
    236 ; Sum terms 1 at a time.
    237 celt_pitch_xcorr_neon_process_remaining_loop1
    238  VLD1.16      {d2[]}, [r4]!
    239  VLD1.16      {d3[]}, [r5]!
    240  SUBS         r12, r12, #1
    241  VMLAL.S16    q0, d2, d3         ; every lane adds x*y; lane 0 carries the running sum
    242  BGT celt_pitch_xcorr_neon_process_remaining_loop1
    243 celt_pitch_xcorr_neon_process_remaining_loop_done
    244  VST1.32      {d0[0]}, [r2]!     ; *xcorr++ = sum
    245  VMAX.S32     d30, d30, d0       ; lane 0: maxcorr = max(maxcorr, sum)
    246  SUBS         r6, r6, #1
    247  ; _y++
    248  ADD          r1, r1, #2
    249  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
    250  BGT celt_pitch_xcorr_neon_process_remaining
    251 celt_pitch_xcorr_neon_done
    252  VMOV.32      r0, d30[0]         ; return value = maxcorr (lane 0)
    253  LDMFD        sp!, {r4-r6, pc}   ; restore r4-r6 and return
    254  ENDP
    255 
    256 ENDIF
    257 
    258 IF OPUS_ARM_MAY_HAVE_EDSP
    259 
    260 ; This will get used on ARMv7 devices without NEON, so it has been optimized
    261 ; to take advantage of dual-issuing where possible.
    262 xcorr_kernel_edsp PROC
    263 xcorr_kernel_edsp_start
    264  ; input:
    265  ;   r3      = int         len
    266  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
    267  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
    268  ;   r6...r9 = opus_val32  sum[4]
    269  ; output:
    270  ;   r6...r9 = opus_val32  sum[4] (sum[k] += sum(x[j]*y[j+k],j=0...len-1))
    271  ; preserved: r0-r5 (r10-r12, r14 and the flags are clobbered)
    272  ; internal usage
    273  ;   r2      = int         j (reused to hold y_5 in the tail)
    274  ;   r12,r14 = opus_val16  x[4]
    275  ;   r10,r11 = opus_val16  y[4]
    276  STMFD        sp!, {r2,r4,r5,lr} ; r2, r4, r5 are modified below and restored on exit
    277  LDR          r10, [r5], #4      ; Load y[0...1]
    278  SUBS         r2, r3, #4         ; j = len-4
    279  LDR          r11, [r5], #4      ; Load y[2...3]
    280  BLE xcorr_kernel_edsp_process4_done ; 4 or fewer samples: tail only
    281  LDR          r12, [r4], #4      ; Load x[0...1]
    282  ; Stall
    283 xcorr_kernel_edsp_process4
    284  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
    285  ; other. Every other instruction here dual-issues with a multiply, and is
    286  ; thus "free". There should be no stalls in the body of the loop.
    287  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
    288  LDR          r14, [r4], #4      ; Load x[2...3]
    289  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
    290  SUBS         r2, r2, #4         ; j-=4 (flags stay live until the LDRGT/BGT below)
    291  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
    292  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
    293  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
    294  LDR          r10, [r5], #4      ; Load y[4...5]
    295  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
    296  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
    297  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
    298  LDRGT        r12, [r4], #4      ; Load x[0...1] for the next iteration, only if there is one
    299  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
    300  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
    301  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
    302  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
    303  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
    304  LDR          r11, [r5], #4      ; Load y[6...7]
    305  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
    306  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
    307  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
    308  BGT xcorr_kernel_edsp_process4
    309 xcorr_kernel_edsp_process4_done
    310  ADDS         r2, r2, #4         ; j = samples left
    311  BLE xcorr_kernel_edsp_done
    312  LDRH         r12, [r4], #2      ; r12 = *x++
    313  SUBS         r2, r2, #1         ; j--
    314  ; Stall
    315  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
    316  LDRHGT       r14, [r4], #2      ; r14 = *x++
    317  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
    318  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
    319  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
    320  BLE xcorr_kernel_edsp_done
    321  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
    322  SUBS         r2, r2, #1         ; j--
    323  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
    324  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
    325  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
    326  LDRHGT       r12, [r4], #2      ; r12 = *x++
    327  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
    328  BLE xcorr_kernel_edsp_done
    329  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
    330  CMP          r2, #1             ; flags = j-1 without updating j (r2 is overwritten with y_5 next)
    331  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
    332  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
    333  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
    334  LDRHGT       r14, [r4]          ; r14 = *x
    335  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
    336  BLE xcorr_kernel_edsp_done
    337  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
    338  LDRH         r11, [r5]          ; r11 = y_6 = *y
    339  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
    340  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
    341  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
    342 xcorr_kernel_edsp_done
    343  LDMFD        sp!, {r2,r4,r5,pc} ; restore r2, r4, r5 and return
    344  ENDP
    345 
    346 celt_pitch_xcorr_edsp PROC       ; xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1), i=0...max_pitch-1
    347  ; input:
    348  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
    349  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
    350  ;   r2  = opus_val32 *xcorr
    351  ;   r3  = int         len
    352  ; output:
    353  ;   r0  = maxcorr (largest xcorr[i] written, but never less than 1)
    354  ; internal usage
    355  ;   r4  = opus_val16 *x
    356  ;   r5  = opus_val16 *y
    357  ;   r6  = opus_val32  sum0
    358  ;   r7  = opus_val32  sum1
    359  ;   r8  = opus_val32  sum2
    360  ;   r9  = opus_val32  sum3
    361  ;   r1  = int         max_pitch
    362  ;   r12 = int         j
    363  ; ignored:
    364  ;         int         arch
    365  STMFD        sp!, {r4-r11, lr}  ; 9 registers = 36 bytes pushed
    366  MOV          r5, r1
    367  LDR          r1, [sp, #36]      ; r1 = max_pitch (first stack argument, now 36 bytes above sp)
    368  MOV          r4, r0
    369  TST          r5, #3             ; is _y 32-bit aligned?
    370  ; maxcorr = 1
    371  MOV          r0, #1
    372  BEQ          celt_pitch_xcorr_edsp_process1u_done ; yes: skip the single unaligned sum
    373 ; Compute one sum at the start to make y 32-bit aligned.
    374  SUBS         r12, r3, #4        ; j = len-4
    375  ; r14 = sum = 0
    376  MOV          r14, #0
    377  LDRH         r8, [r5], #2       ; r8 = y_0 (halfword load; y is word aligned afterwards)
    378  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
    379  LDR          r6, [r4], #4
    380  MOV          r8, r8, LSL #16    ; put y_0 in the top halfword, where the loop expects it
    381 celt_pitch_xcorr_edsp_process1u_loop4
    382  LDR          r9, [r5], #4
    383  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
    384  LDR          r7, [r4], #4
    385  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
    386  LDR          r8, [r5], #4
    387  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
    388  SUBS         r12, r12, #4         ; j-=4
    389  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
    390  LDRGT        r6, [r4], #4
    391  BGT celt_pitch_xcorr_edsp_process1u_loop4
    392  MOV          r8, r8, LSR #16    ; bring the pending y sample down to the bottom halfword for loop1
    393 celt_pitch_xcorr_edsp_process1u_loop4_done
    394  ADDS         r12, r12, #4       ; j = samples left
    395 celt_pitch_xcorr_edsp_process1u_loop1
    396  LDRHGE       r6, [r4], #2
    397  ; Stall
    398  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
    399  SUBSGE       r12, r12, #1
    400  LDRHGT       r8, [r5], #2       ; fetch the next y only if another sample follows
    401  BGT celt_pitch_xcorr_edsp_process1u_loop1
    402  ; Restore _x
    403  SUB          r4, r4, r3, LSL #1
    404  ; Restore and advance _y
    405  SUB          r5, r5, r3, LSL #1
    406  ; maxcorr = max(maxcorr, sum)
    407  CMP          r0, r14
    408  ADD          r5, r5, #2
    409  MOVLT        r0, r14
    410  SUBS         r1, r1, #1         ; --max_pitch; flags consumed by the BLE below
    411  ; xcorr[i] = sum
    412  STR          r14, [r2], #4
    413  BLE celt_pitch_xcorr_edsp_done
    414 celt_pitch_xcorr_edsp_process1u_done
    415  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
    416  SUBS         r1, r1, #4
    417  BLT celt_pitch_xcorr_edsp_process2
    418 celt_pitch_xcorr_edsp_process4
    419  ; xcorr_kernel_edsp parameters:
    420  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
    421  MOV          r6, #0
    422  MOV          r7, #0
    423  MOV          r8, #0
    424  MOV          r9, #0
    425  BL xcorr_kernel_edsp_start  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
    426  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
    427  CMP          r0, r6
    428  ; _y+=4
    429  ADD          r5, r5, #8
    430  MOVLT        r0, r6
    431  CMP          r0, r7
    432  MOVLT        r0, r7
    433  CMP          r0, r8
    434  MOVLT        r0, r8
    435  CMP          r0, r9
    436  MOVLT        r0, r9
    437  STMIA        r2!, {r6-r9}       ; store the four sums; xcorr += 4
    438  SUBS         r1, r1, #4         ; max_pitch -= 4
    439  BGE celt_pitch_xcorr_edsp_process4
    440 celt_pitch_xcorr_edsp_process2
    441  ADDS         r1, r1, #2         ; r1 = (sums left) - 2
    442  BLT celt_pitch_xcorr_edsp_process1a ; fewer than 2 sums left
    443  SUBS         r12, r3, #4        ; j = len-4
    444  ; {r10, r11} = {sum0, sum1} = {0, 0}
    445  MOV          r10, #0
    446  MOV          r11, #0
    447  LDR          r8, [r5], #4
    448  BLE celt_pitch_xcorr_edsp_process2_loop_done
    449  LDR          r6, [r4], #4
    450  LDR          r9, [r5], #4
    451 celt_pitch_xcorr_edsp_process2_loop4
    452  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
    453  LDR          r7, [r4], #4
    454  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
    455  SUBS         r12, r12, #4         ; j-=4
    456  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
    457  LDR          r8, [r5], #4
    458  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
    459  LDRGT        r6, [r4], #4
    460  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
    461  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
    462  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
    463  LDRGT        r9, [r5], #4
    464  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
    465  BGT celt_pitch_xcorr_edsp_process2_loop4
    466 celt_pitch_xcorr_edsp_process2_loop_done
    467  ADDS         r12, r12, #2       ; j = (samples left) - 2
    468  BLE  celt_pitch_xcorr_edsp_process2_1
    469  LDR          r6, [r4], #4
    470  ; Stall
    471  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
    472  LDR          r9, [r5], #4
    473  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
    474  SUB          r12, r12, #2
    475  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
    476  MOV          r8, r9             ; the newly loaded y pair becomes the current pair
    477  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
    478 celt_pitch_xcorr_edsp_process2_1
    479  LDRH         r6, [r4], #2
    480  ADDS         r12, r12, #1       ; j = (samples left) - 1
    481  ; Stall
    482  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
    483  LDRHGT       r7, [r4], #2
    484  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
    485  BLE celt_pitch_xcorr_edsp_process2_done
    486  LDRH         r9, [r5], #2
    487  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
    488  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
    489 celt_pitch_xcorr_edsp_process2_done
    490  ; Restore _x
    491  SUB          r4, r4, r3, LSL #1
    492  ; Restore and advance _y
    493  SUB          r5, r5, r3, LSL #1
    494  ; maxcorr = max(maxcorr, sum0)
    495  CMP          r0, r10
    496  ADD          r5, r5, #2
    497  MOVLT        r0, r10
    498  SUB          r1, r1, #2         ; two sums done
    499  ; maxcorr = max(maxcorr, sum1)
    500  CMP          r0, r11
    501  ; xcorr[i] = sum
    502  STR          r10, [r2], #4
    503  MOVLT        r0, r11
    504  STR          r11, [r2], #4
    505 celt_pitch_xcorr_edsp_process1a
    506  ADDS         r1, r1, #1         ; r1 = (sums left) - 1
    507  BLT celt_pitch_xcorr_edsp_done  ; nothing left to compute
    508  SUBS         r12, r3, #4        ; j = len-4
    509  ; r14 = sum = 0
    510  MOV          r14, #0
    511  BLT celt_pitch_xcorr_edsp_process1a_loop_done
    512  LDR          r6, [r4], #4
    513  LDR          r8, [r5], #4
    514  LDR          r7, [r4], #4
    515  LDR          r9, [r5], #4
    516 celt_pitch_xcorr_edsp_process1a_loop4
    517  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
    518  SUBS         r12, r12, #4         ; j-=4
    519  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
    520  LDRGE        r6, [r4], #4
    521  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
    522  LDRGE        r8, [r5], #4
    523  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
    524  LDRGE        r7, [r4], #4
    525  LDRGE        r9, [r5], #4
    526  BGE celt_pitch_xcorr_edsp_process1a_loop4
    527 celt_pitch_xcorr_edsp_process1a_loop_done
    528  ADDS         r12, r12, #2       ; j = (samples left) - 2; GE means at least 2 remain
    529  LDRGE        r6, [r4], #4
    530  LDRGE        r8, [r5], #4
    531  ; Stall
    532  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
    533  SUBGE        r12, r12, #2
    534  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
    535  ADDS         r12, r12, #1       ; j = (samples left) - 1; GE means one more remains
    536  LDRHGE       r6, [r4], #2
    537  LDRHGE       r8, [r5], #2
    538  ; Stall
    539  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
    540  ; maxcorr = max(maxcorr, sum)
    541  CMP          r0, r14
    542  ; xcorr[i] = sum
    543  STR          r14, [r2], #4
    544  MOVLT        r0, r14
    545 celt_pitch_xcorr_edsp_done
    546  LDMFD        sp!, {r4-r11, pc}  ; restore r4-r11 and return maxcorr in r0
    547  ENDP
    548 
    549 ENDIF
    550 
    551 END