mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
Also avoid REX prefixes while just at it. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
442 lines
14 KiB
NASM
442 lines
14 KiB
NASM
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

; IDCT cosine constants, one 8-lane 16-bit vector per coefficient.
; Indexed via the C(x) macro below as C(1)..C(7); values are the
; fixed-point cosines used by the VP3 inverse DCT.
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

; Shared constants defined elsewhere in libavcodec/x86.
cextern pb_80

cextern pw_4
cextern pw_8

SECTION .text
|
|
|
|
; Core VP3 loop-filter arithmetic, shared by the vertical and horizontal
; filters below.
; in:  p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked to words;
;      m0 must be zeroed; r2 points to the bounding-values array
;      (so [r2+516] holds 2 * filter limit)
; out: filtered p1 in m4, p2 in m2 (both re-packed to bytes)
; clobbers: m0, m1, m3, m5
%macro VP3_LOOP_FILTER 0
    psubw     m5, m1           ; m5 = p0 - p3
    mova      m3, m2           ; m3 = p2
    paddw     m5, [pw_4]       ; m5 = p0 - p3 + 4 (rounding bias)
    psubw     m3, m4           ; m3 = p2 - p1
    mova      m1, m3           ; m1 = p2 - p1
    paddw     m1, m5           ; m1 = (p0 - p3) + (p2 - p1) + 4
    mova      m5, [r2+516]     ; 2 * filter limit
    paddw     m3, m3           ; m3 = 2 * (p2 - p1)
    paddw     m3, m1           ; m3 = (p0 - p3) + 3 * (p2 - p1) + 4
    psraw     m3, 3            ; m3 = raw filter value x

    ; We use that clamp(2*clamp(x, 2f), 2f) - clamp(x, 2f)
    ; (with f = filter limit and clamping to the interval [-f,f])
    ; gives the desired filter value
    psubw     m0, m5           ; m0 = -(2 * filter limit)  (m0 was zero)
    pminsw    m3, m5           ; clamp x to +2f ...
    pmaxsw    m3, m0           ; ... and to -2f
    mova      m1, m3           ; m1 = clamp(x, 2f)
    paddw     m1, m1           ; m1 = 2 * clamp(x, 2f)
    pminsw    m1, m5           ; clamp again to +2f ...
    pmaxsw    m1, m0           ; ... and -2f
    psubw     m1, m3           ; m1 = final filter value
    psubw     m2, m1           ; p2 -= filter value
    paddw     m4, m1           ; p1 += filter value

    packuswb  m4, m4           ; repack p1 to bytes with unsigned saturation
    packuswb  m2, m2           ; repack p2 to bytes with unsigned saturation
%endmacro
|
|
|
|
; Scatter the four low 16-bit words of %1 to four consecutive rows of the
; destination, at byte offset -1 (the two pixels straddling the filtered
; vertical edge): [r0-1], [r0+r1-1], [r0+r1*2-1], [r0+r3-1].
; r1 = stride, r3 = 3 * stride; clobbers r2 (and %1 on x86-32).
%macro STORE_4_WORDS 1
%if ARCH_X86_64
    ; A single 8-byte GPR move covers all four words.
    movq      r2, %1
    mov       [r0     -1], r2w
    shr       r2, 16
    mov       [r0+r1  -1], r2w
    shr       r2, 16
%else
    ; 32-bit GPRs hold only two words at a time, so fetch the low and
    ; high dword of %1 separately.
    movd      r2d, %1
    mov       [r0     -1], r2w
    psrlq     %1, 32           ; bring words 2-3 into the low dword
    shr       r2d, 16
    mov       [r0+r1  -1], r2w
    movd      r2d, %1
%endif
    mov       [r0+r1*2-1], r2w
    shr       r2d, 16
    mov       [r0+r3  -1], r2w
%endmacro
|
|
|
|
INIT_XMM sse2
; void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
;                                int *bounding_values)
; Filters a horizontal edge: src points at the row just below the edge,
; so the four rows involved are src-2*stride .. src+stride.
cglobal vp3_v_loop_filter, 3, 3, 6
    movq      m1, [r0+r1   ]   ; p3 = row below src
    neg       r1               ; walk upwards from src
    movq      m2, [r0      ]   ; p2 = src row
    movq      m4, [r0+r1   ]   ; p1 = one row above
    movq      m5, [r0+r1*2 ]   ; p0 = two rows above

    pxor      m0, m0           ; zero for unpacking (required by the macro)
    punpcklbw m1, m0           ; widen all four rows from bytes to words
    punpcklbw m2, m0
    punpcklbw m4, m0
    punpcklbw m5, m0

    VP3_LOOP_FILTER            ; p1 -> m4, p2 -> m2, already repacked

    movq      [r0+r1], m4      ; store filtered p1 (r1 is still negated)
    movq      [r0   ], m2      ; store filtered p2
    RET
|
|
|
|
; Load a 4x4 block of bytes starting 2 pixels left of the vertical edge
; (columns src-2..src+1 over four rows) and transpose it so that each
; former column becomes a run of bytes in %1.
; r1 = stride, r3 = 3 * stride; clobbers m2, m3, m4.
%macro TRANSPOSE4x4 1
    movd      %1, [r0     -2]  ; row 0
    movd      m2, [r0+r1  -2]  ; row 1
    movd      m3, [r0+r1*2-2]  ; row 2
    movd      m4, [r0+r3  -2]  ; row 3
    punpcklbw %1, m2           ; interleave rows 0/1 bytewise
    punpcklbw m3, m4           ; interleave rows 2/3 bytewise
    punpcklwd %1, m3           ; interleave the word pairs -> transposed 4x4
%endmacro
|
|
|
|
INIT_XMM sse2
; void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
;                                int *bounding_values)
; Filters a vertical edge across 8 rows: transpose two 4x4 blocks so the
; edge columns become register lanes, run the shared filter, then scatter
; the two changed columns back with STORE_4_WORDS.
cglobal vp3_h_loop_filter, 3, 4, 6
    lea       r3, [r1*3]       ; r3 = 3 * stride, used by the transpose/store

    TRANSPOSE4x4 m5            ; rows 0-3 -> m5 (columns as byte runs)
    lea       r0, [r0+r1*4]    ; advance to rows 4-7
    TRANSPOSE4x4 m0            ; rows 4-7 -> m0
    mova      m2, m5
    punpckldq m5, m0           ; m5 = columns p0/p1 for all 8 rows
    punpckhdq m2, m0           ; m2 = columns p2/p3 for all 8 rows
    pxor      m0, m0           ; zero for unpacking
    mova      m4, m5
    punpcklbw m5, m0           ; m5 = p0 as words
    punpckhbw m4, m0           ; m4 = p1 as words
    mova      m1, m2
    punpcklbw m2, m0           ; m2 = p2 as words
    punpckhbw m1, m0           ; m1 = p3 as words

    VP3_LOOP_FILTER            ; p1 -> m4, p2 -> m2 (packed bytes)

    punpcklbw m4, m2           ; re-interleave p1/p2 pairs per row
    mova      m2, m4
    punpckhqdq m2, m2          ; m2 = pairs for rows 4-7

    STORE_4_WORDS m2           ; write rows 4-7 (r0 still points at row 4)
    sub       r0, r3           ; step back to row 0 ...
    sub       r0, r1           ; ... (r3 + r1 = 4 * stride)
    STORE_4_WORDS m4           ; write rows 0-3
    RET
|
|
|
|
; Byte average of m0 and m1 without the +1 rounding of pavgb.
; Uses the identity  floor((a+b)/2) = ~avg(~a, ~b)  where avg rounds up;
; m4 must hold all-ones (the XOR mask). Result in m0; clobbers m1.
%macro PAVGB_NO_RND 0
    pxor      m0, m4           ; m0 = ~a
    pxor      m1, m4           ; m1 = ~b
    pavgb     m0, m1           ; rounded-up average of the complements
    pxor      m0, m4           ; complement back -> rounded-down average
%endmacro
|
|
|
|
INIT_XMM sse2
; void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a,
;                                        const uint8_t *b, ptrdiff_t stride,
;                                        int h)
; dst = truncating average of the two 8-byte-wide sources, processed
; two rows per XMM register, four rows per loop iteration.
; h is assumed to be a multiple of 4 (the loop counts down by 4).
cglobal vp3_put_no_rnd_pixels8_l2, 5, 6, 5, dst, src1, src2, stride, h, stride3
    lea  stride3q, [strideq+strideq*2]  ; 3 * stride for rows 2/3 below
    pcmpeqb   m4, m4           ; all-ones mask for PAVGB_NO_RND
.loop:
    ; rows 0 and 1 packed into one register each
    movq      m0, [src1q]
    movq      m1, [src2q]
    movhps    m0, [src1q+strideq]
    movhps    m1, [src2q+strideq]
    PAVGB_NO_RND
    movq      [dstq], m0
    movhps    [dstq+strideq], m0

    ; rows 2 and 3
    movq      m0, [src1q+strideq*2]
    movq      m1, [src2q+strideq*2]
    movhps    m0, [src1q+stride3q]
    movhps    m1, [src2q+stride3q]
    PAVGB_NO_RND
    movq      [dstq+strideq*2], m0
    movhps    [dstq+stride3q], m0

    ; advance all pointers by 4 rows
    lea       src1q, [src1q+strideq*4]
    lea       src2q, [src2q+strideq*4]
    lea       dstq, [dstq+strideq*4]
    sub       hd, 4
    jnz .loop
    RET
|
|
|
|
; One 8-point 1-D VP3 IDCT pass over 8 lanes of 16-bit coefficients.
; Reads rows via I(x), writes via O(x) (both defined by the caller, see
; VP3_IDCT), takes cosine constants from C(x). SHIFT(x)/ADD(x) are
; caller-defined hooks: empty on the first (column) pass, and
; "paddsw x, [pw_8]" / "psraw x, 4" on the final (row) pass.
; Note: pmulhw with the dw constants yields c*x - x for constants that
; exceed 0x8000, hence the paddw fix-ups after each multiply.
; Clobbers m0-m7; results left in m0-m7 (op0..op7) for PUT_BLOCK.
%macro VP3_1D_IDCT_SSE2 0
    movdqa    m2, I(3)         ; xmm2 = i3
    movdqa    m6, C(3)         ; xmm6 = c3
    movdqa    m4, m2           ; xmm4 = i3
    movdqa    m7, I(5)         ; xmm7 = i5
    pmulhw    m4, m6           ; xmm4 = c3 * i3 - i3
    movdqa    m1, C(5)         ; xmm1 = c5
    pmulhw    m6, m7           ; xmm6 = c3 * i5 - i5
    movdqa    m5, m1           ; xmm5 = c5
    pmulhw    m1, m2           ; xmm1 = c5 * i3 - i3
    movdqa    m3, I(1)         ; xmm3 = i1
    pmulhw    m5, m7           ; xmm5 = c5 * i5 - i5
    movdqa    m0, C(1)         ; xmm0 = c1
    paddw     m4, m2           ; xmm4 = c3 * i3
    paddw     m6, m7           ; xmm6 = c3 * i5
    paddw     m2, m1           ; xmm2 = c5 * i3
    movdqa    m1, I(7)         ; xmm1 = i7
    paddw     m7, m5           ; xmm7 = c5 * i5
    movdqa    m5, m0           ; xmm5 = c1
    pmulhw    m0, m3           ; xmm0 = c1 * i1 - i1
    paddsw    m4, m7           ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw    m5, m1           ; xmm5 = c1 * i7 - i7
    movdqa    m7, C(7)         ; xmm7 = c7
    psubsw    m6, m2           ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw     m0, m3           ; xmm0 = c1 * i1
    pmulhw    m3, m7           ; xmm3 = c7 * i1
    movdqa    m2, I(2)         ; xmm2 = i2
    pmulhw    m7, m1           ; xmm7 = c7 * i7
    paddw     m5, m1           ; xmm5 = c1 * i7
    movdqa    m1, m2           ; xmm1 = i2
    pmulhw    m2, C(2)         ; xmm2 = i2 * c2 -i2
    psubsw    m3, m5           ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa    m5, I(6)         ; xmm5 = i6
    paddsw    m0, m7           ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa    m7, m5           ; xmm7 = i6
    psubsw    m0, m4           ; xmm0 = A - C
    pmulhw    m5, C(2)         ; xmm5 = c2 * i6 - i6
    paddw     m2, m1           ; xmm2 = i2 * c2
    pmulhw    m1, C(6)         ; xmm1 = c6 * i2
    paddsw    m4, m4           ; xmm4 = C + C
    paddsw    m4, m0           ; xmm4 = A + C = C.
    psubsw    m3, m6           ; xmm3 = B - D
    paddw     m5, m7           ; xmm5 = c2 * i6
    paddsw    m6, m6           ; xmm6 = D + D
    pmulhw    m7, C(6)         ; xmm7 = c6 * i6
    paddsw    m6, m3           ; xmm6 = B + D = D.
    movdqa    I(1), m4         ; Save C. at I(1)
    psubsw    m1, m5           ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa    m4, C(4)         ; xmm4 = C4
    movdqa    m5, m3           ; xmm5 = B - D
    pmulhw    m3, m4           ; xmm3 = ( c4 -1 ) * ( B - D )
    paddsw    m7, m2           ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa    I(2), m6         ; save D. at I(2)
    movdqa    m2, m0           ; xmm2 = A - C
    movdqa    m6, I(0)         ; xmm6 = i0
    pmulhw    m0, m4           ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
    paddw     m5, m3           ; xmm5 = c4 * ( B - D ) = B.
    movdqa    m3, I(4)         ; xmm3 = i4
    psubsw    m5, m1           ; xmm5 = B. - H = B..
    paddw     m2, m0           ; xmm2 = c4 * ( A - C) = A.
    psubsw    m6, m3           ; xmm6 = i0 - i4
    movdqa    m0, m6           ; xmm0 = i0 - i4
    pmulhw    m6, m4           ; xmm6 = (c4 - 1) * (i0 - i4) = F
    paddsw    m3, m3           ; xmm3 = i4 + i4
    paddsw    m1, m1           ; xmm1 = H + H
    paddsw    m3, m0           ; xmm3 = i0 + i4
    paddsw    m1, m5           ; xmm1 = B. + H = H.
    pmulhw    m4, m3           ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw     m6, m0           ; xmm6 = c4 * ( i0 - i4 )
    psubsw    m6, m2           ; xmm6 = F - A. = F.
    paddsw    m2, m2           ; xmm2 = A. + A.
    movdqa    m0, I(1)         ; Load C. from I(1)
    paddsw    m2, m6           ; xmm2 = F + A. = A..
    paddw     m4, m3           ; xmm4 = c4 * ( i0 + i4 ) = 3
    psubsw    m2, m1           ; xmm2 = A.. - H. = R2
    ADD(m2)                    ; Adjust R2 and R1 before shifting
    paddsw    m1, m1           ; xmm1 = H. + H.
    paddsw    m1, m2           ; xmm1 = A.. + H. = R1
    SHIFT(m2)                  ; xmm2 = op2
    psubsw    m4, m7           ; xmm4 = E - G = E.
    SHIFT(m1)                  ; xmm1 = op1
    movdqa    m3, I(2)         ; Load D. from I(2)
    paddsw    m7, m7           ; xmm7 = G + G
    paddsw    m7, m4           ; xmm7 = E + G = G.
    psubsw    m4, m3           ; xmm4 = E. - D. = R4
    ADD(m4)                    ; Adjust R4 and R3 before shifting
    paddsw    m3, m3           ; xmm3 = D. + D.
    paddsw    m3, m4           ; xmm3 = E. + D. = R3
    SHIFT(m4)                  ; xmm4 = op4
    psubsw    m6, m5           ; xmm6 = F. - B..= R6
    SHIFT(m3)                  ; xmm3 = op3
    ADD(m6)                    ; Adjust R6 and R5 before shifting
    paddsw    m5, m5           ; xmm5 = B.. + B..
    paddsw    m5, m6           ; xmm5 = F. + B.. = R5
    SHIFT(m6)                  ; xmm6 = op6
    SHIFT(m5)                  ; xmm5 = op5
    psubsw    m7, m0           ; xmm7 = G. - C. = R7
    ADD(m7)                    ; Adjust R7 and R0 before shifting
    paddsw    m0, m0           ; xmm0 = C. + C.
    paddsw    m0, m7           ; xmm0 = G. + C.
    SHIFT(m7)                  ; xmm7 = op7
    SHIFT(m0)                  ; xmm0 = op0
%endmacro
|
|
|
|
; Store the eight IDCT result registers m%1..m%8 to rows 0..7 of the
; output, addressed through the caller-defined O(x) macro.
%macro PUT_BLOCK 8
    movdqa    O(0), m%1
    movdqa    O(1), m%2
    movdqa    O(2), m%3
    movdqa    O(3), m%4
    movdqa    O(4), m%5
    movdqa    O(5), m%6
    movdqa    O(6), m%7
    movdqa    O(7), m%8
%endmacro
|
|
|
|
; Full in-place 2-D VP3 IDCT on the 8x8 int16 block at blockq.
; Pass 1: 1-D IDCT with no rounding/shift, then an 8x8 word transpose
; (x86-64 has a spare xmm register; x86-32 spills through the block
; memory instead). Pass 2: 1-D IDCT with the final +8 rounding bias and
; >>4 scaling applied via the ADD/SHIFT hooks.
%macro VP3_IDCT 0
%define I(x) [blockq+16*x]
%define O(x) [blockq+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq], [blockq+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
|
|
|
|
INIT_XMM sse2
; void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; IDCT the block, write the result to dest as unsigned bytes, then zero
; the coefficient block (the decoder expects it cleared afterwards).
cglobal vp3_idct_put, 3, 4, 9, dst, s, block, s3
    VP3_IDCT

    mova      m4, [pb_80]      ; 0x80 bias: signed -> unsigned bytes
    lea       s3q, [sq*3]      ; 3 * stride for rows 3/7
    ; pack the eight int16 rows (r2 == blockq) into four byte registers
    mova      m0, [r2+mmsize*0]
    mova      m1, [r2+mmsize*2]
    mova      m2, [r2+mmsize*4]
    mova      m3, [r2+mmsize*6]
    packsswb  m0, [r2+mmsize*1] ; rows 0/1, signed-saturated to bytes
    packsswb  m1, [r2+mmsize*3] ; rows 2/3
    packsswb  m2, [r2+mmsize*5] ; rows 4/5
    packsswb  m3, [r2+mmsize*7] ; rows 6/7
    paddb     m0, m4           ; shift [-128,127] to [0,255]
    paddb     m1, m4
    paddb     m2, m4
    paddb     m3, m4
    ; store rows 0-3
    movq      [dstq     ], m0
    movhps    [dstq+sq  ], m0
    movq      [dstq+sq*2], m1
    movhps    [dstq+s3q ], m1
    lea       dstq, [dstq+sq*4]
    ; store rows 4-7
    movq      [dstq     ], m2
    movhps    [dstq+sq  ], m2
    movq      [dstq+sq*2], m3
    movhps    [dstq+s3q ], m3

    ; clear the 128-byte coefficient block
    pxor      m0, m0
%assign offset 0
%rep 128/mmsize
    mova      [blockq+offset], m0
%assign offset offset+mmsize
%endrep
    RET
|
|
|
|
; void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; IDCT the block and add the residual to the existing dest pixels with
; unsigned saturation, then zero the coefficient block.
cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
    VP3_IDCT

    lea       s3q, [sq*3]      ; 3 * stride for rows 3/7
    pxor      m4, m4           ; zero: unpack mask now, block-clear value later
%assign i 0
%rep 2
    ; load four dest rows and widen to words
    movq      m0, [dstq]
    movq      m1, [dstq+sq]
    movq      m2, [dstq+sq*2]
    movq      m3, [dstq+s3q]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    ; add the IDCT residual rows
    paddsw    m0, [blockq+ 0+i]
    paddsw    m1, [blockq+16+i]
    paddsw    m2, [blockq+32+i]
    paddsw    m3, [blockq+48+i]
    ; repack with unsigned saturation and store
    packuswb  m0, m1
    packuswb  m2, m3
    movq      [dstq     ], m0
    movhps    [dstq+sq  ], m0
    movq      [dstq+sq*2], m2
    movhps    [dstq+s3q ], m2
%if i == 0
    lea       dstq, [dstq+sq*4] ; advance to rows 4-7 between iterations
%endif
%assign i i+64
%endrep

    ; clear the 128-byte coefficient block
%assign i 0
%rep 128/mmsize
    mova      [blockq+i], m4
%assign i i+mmsize
%endrep
    RET
|
|
|
|
; Add a signed DC value to four 8-pixel rows at dstq.
; The signed add is built from unsigned saturating ops: m0 holds the DC
; clamped to [0,255] (zero when DC is negative) and m1 holds -DC clamped
; the same way, so paddusb m0 then psubusb m1 applies the signed DC with
; correct clamping. sq = stride, s3q = 3 * stride; clobbers m2-m5.
%macro DC_ADD 0
    movq      m2, [dstq     ]
    movq      m3, [dstq+sq  ]
    paddusb   m2, m0           ; + max(DC, 0), saturating
    movq      m4, [dstq+sq*2]
    paddusb   m3, m0
    movq      m5, [dstq+s3q ]
    paddusb   m4, m0
    paddusb   m5, m0
    psubusb   m2, m1           ; - max(-DC, 0), saturating
    psubusb   m3, m1
    movq      [dstq     ], m2
    psubusb   m4, m1
    movq      [dstq+sq  ], m3
    psubusb   m5, m1
    movq      [dstq+sq*2], m4
    movq      [dstq+s3q ], m5
%endmacro
|
|
|
|
INIT_MMX mmxext
; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; DC-only IDCT: dest += (block[0] + 15) >> 5 on all 64 pixels, with
; clamping; block[0] is cleared on the way.
cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
    movsx     dcd, word [blockq] ; fetch the DC coefficient
    mov       word [blockq], 0   ; and clear it for the decoder
%define s3q blockq               ; blockq is free now; reuse it as 3*stride
    lea       s3q, [sq*3]
    add       dcd, 15            ; rounded DC: (dc + 15) >> 5
    sar       dcd, 5
    movd      m0, dcd
    pshufw    m0, m0, 0x0        ; broadcast DC to all four words
    pxor      m1, m1
    psubw     m1, m0             ; m1 = -DC
    packuswb  m0, m0             ; m0 = max(DC, 0) as bytes (see DC_ADD)
    packuswb  m1, m1             ; m1 = max(-DC, 0) as bytes
    DC_ADD                       ; rows 0-3
    lea       dstq, [dstq+sq*4]
    DC_ADD                       ; rows 4-7
    RET
|