;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; MMX-optimized functions cribbed from the original VP3 source code.
SECTION_RODATA
vp3_idct_data: times 8 dw 64277
times 8 dw 60547
times 8 dw 54491
times 8 dw 46341
times 8 dw 36410
times 8 dw 25080
times 8 dw 12785
cextern pb_80
cextern pw_4
cextern pw_8
SECTION .text
; in: p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked;
; m0 must be zeroed
; out: p1 in m4, p2 in m2
%macro VP3_LOOP_FILTER 0
psubw m5, m1
mova m3, m2
paddw m5, [pw_4]
psubw m3, m4
mova m1, m3
paddw m1, m5
mova m5, [r2+516] ; 2 * filter limit
paddw m3, m3
paddw m3, m1
psraw m3, 3
; We use that clamp(2 * clamp(x, 2f), 2f) - clamp(x, 2f)
; (with f = filter limit and clamp(x, b) clamping x to the interval [-b, b])
; gives the desired filter value
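; Piecewise, with c(x, b) = clamp(x, b) and t = c(x, 2f):
;   |x| <= f:       t = x and |2x| <= 2f, so the value is 2x - x = x
;   f < |x| <= 2f:  t = x but 2x clamps to +/-2f, giving +/-2f - x
;   |x| > 2f:       t = +/-2f = c(2t, 2f), giving 0
; i.e. the filter value ramps from f down to 0 and vanishes for large
; differences, matching the decoder's bounding function.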
psubw m0, m5
pminsw m3, m5
pmaxsw m3, m0
mova m1, m3
paddw m1, m1
pminsw m1, m5
pmaxsw m1, m0
psubw m1, m3
psubw m2, m1
paddw m4, m1
packuswb m4, m4
packuswb m2, m2
%endmacro
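; For reference, a rough per-pixel C equivalent of the macro above
; (a sketch only; clamp_w, filter_value and lim2 are illustrative
; names, lim2 being the 2 * filter limit value loaded from [r2+516]):
;
;   static int clamp_w(int x, int b) { return x < -b ? -b : x > b ? b : x; }
;
;   static int filter_value(int p0, int p1, int p2, int p3, int lim2)
;   {
;       int x = (p0 - p3 + 3 * (p2 - p1) + 4) >> 3;
;       int t = clamp_w(x, lim2);
;       return clamp_w(2 * t, lim2) - t;
;   }
;   /* then p1 += value, p2 -= value, both saturated to [0, 255] */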
%macro STORE_4_WORDS 1
%if ARCH_X86_64
movq r2, %1
mov [r0 -1], r2w
shr r2, 16
mov [r0+r1 -1], r2w
shr r2, 16
%else
movd r2d, %1
mov [r0 -1], r2w
psrlq %1, 32
shr r2d, 16
mov [r0+r1 -1], r2w
movd r2d, %1
%endif
mov [r0+r1*2-1], r2w
shr r2d, 16
mov [r0+r3 -1], r2w
%endmacro
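; STORE_4_WORDS scatters the four words of %1 back into the picture:
; each word holds the filtered (p1, p2) byte pair of one row and is
; written at r0-1 across four consecutive rows (r3 being 3 * stride at
; the call sites), so the pair lands in the two columns at the edge.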
INIT_XMM sse2
cglobal vp3_v_loop_filter, 3, 3, 6
movq m1, [r0+r1 ]
neg r1
movq m2, [r0 ]
movq m4, [r0+r1 ]
movq m5, [r0+r1*2]
pxor m0, m0
punpcklbw m1, m0
punpcklbw m2, m0
punpcklbw m4, m0
punpcklbw m5, m0
VP3_LOOP_FILTER
movq [r0+r1], m4
movq [r0 ], m2
RET
%macro TRANSPOSE4x4 1
movd %1, [r0 -2]
movd m2, [r0+r1 -2]
movd m3, [r0+r1*2-2]
movd m4, [r0+r3 -2]
punpcklbw %1, m2
punpcklbw m3, m4
punpcklwd %1, m3
%endmacro
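; TRANSPOSE4x4 loads a 4x4 block of bytes starting two pixels left of
; the edge, one movd per row, and interleaves rows 0/1 and 2/3 so that
; each dword of %1 ends up holding one pixel column of the four rows,
; i.e. the 4x4 tile transposed into a single register.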
INIT_XMM sse2
cglobal vp3_h_loop_filter, 3, 4, 6
lea r3, [r1*3]
TRANSPOSE4x4 m5
lea r0, [r0+r1*4]
TRANSPOSE4x4 m0
mova m2, m5
punpckldq m5, m0
punpckhdq m2, m0
pxor m0, m0
mova m4, m5
punpcklbw m5, m0
punpckhbw m4, m0
mova m1, m2
punpcklbw m2, m0
punpckhbw m1, m0
VP3_LOOP_FILTER
punpcklbw m4, m2
mova m2, m4
punpckhqdq m2, m2
STORE_4_WORDS m2
sub r0, r3
sub r0, r1
STORE_4_WORDS m4
RET
%macro PAVGB_NO_RND 0
pxor m0, m4
pxor m1, m4
pavgb m0, m1
pxor m0, m4
%endmacro
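; pavgb computes the rounded-up average (a + b + 1) >> 1, but the
; no-rnd put needs the rounded-down (a + b) >> 1.  With m4 = all ones,
; pxor is a bitwise NOT, and ~avg_up(~a, ~b) == avg_down(a, b).
; A C sketch of the identity (avg_down is an illustrative name):
;
;   static unsigned avg_down(unsigned a, unsigned b)   /* a, b in 0..255 */
;   {
;       unsigned x = a ^ 0xff, y = b ^ 0xff;  /* pxor with all-ones  */
;       unsigned r = (x + y + 1) >> 1;        /* what pavgb computes */
;       return r ^ 0xff;                      /* == (a + b) >> 1     */
;   }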
INIT_XMM sse2
; void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a,
; const uint8_t *b, ptrdiff_t stride,
; int h)
cglobal vp3_put_no_rnd_pixels8_l2, 5, 6, 5, dst, src1, src2, stride, h, stride3
lea stride3q, [strideq+strideq*2]
pcmpeqb m4, m4
.loop:
movq m0, [src1q]
movq m1, [src2q]
movhps m0, [src1q+strideq]
movhps m1, [src2q+strideq]
PAVGB_NO_RND
movq [dstq], m0
movhps [dstq+strideq], m0
movq m0, [src1q+strideq*2]
movq m1, [src2q+strideq*2]
movhps m0, [src1q+stride3q]
movhps m1, [src2q+stride3q]
PAVGB_NO_RND
movq [dstq+strideq*2], m0
movhps [dstq+stride3q], m0
lea src1q, [src1q+strideq*4]
lea src2q, [src2q+strideq*4]
lea dstq, [dstq+strideq*4]
sub hd, 4
jnz .loop
RET
%macro VP3_1D_IDCT_SSE2 0
movdqa m2, I(3) ; xmm2 = i3
movdqa m6, C(3) ; xmm6 = c3
movdqa m4, m2 ; xmm4 = i3
movdqa m7, I(5) ; xmm7 = i5
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
movdqa m1, C(5) ; xmm1 = c5
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
movdqa m5, m1 ; xmm5 = c5
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
movdqa m3, I(1) ; xmm3 = i1
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
movdqa m0, C(1) ; xmm0 = c1
paddw m4, m2 ; xmm4 = c3 * i3
paddw m6, m7 ; xmm6 = c3 * i5
paddw m2, m1 ; xmm2 = c5 * i3
movdqa m1, I(7) ; xmm1 = i7
paddw m7, m5 ; xmm7 = c5 * i5
movdqa m5, m0 ; xmm5 = c1
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
movdqa m7, C(7) ; xmm7 = c7
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
paddw m0, m3 ; xmm0 = c1 * i1
pmulhw m3, m7 ; xmm3 = c7 * i1
movdqa m2, I(2) ; xmm2 = i2
pmulhw m7, m1 ; xmm7 = c7 * i7
paddw m5, m1 ; xmm5 = c1 * i7
movdqa m1, m2 ; xmm1 = i2
pmulhw m2, C(2) ; xmm2 = c2 * i2 - i2
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
movdqa m5, I(6) ; xmm5 = i6
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
movdqa m7, m5 ; xmm7 = i6
psubsw m0, m4 ; xmm0 = A - C
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
paddw m2, m1 ; xmm2 = i2 * c2
pmulhw m1, C(6) ; xmm1 = c6 * i2
paddsw m4, m4 ; xmm4 = C + C
paddsw m4, m0 ; xmm4 = A + C = C.
psubsw m3, m6 ; xmm3 = B - D
paddw m5, m7 ; xmm5 = c2 * i6
paddsw m6, m6 ; xmm6 = D + D
pmulhw m7, C(6) ; xmm7 = c6 * i6
paddsw m6, m3 ; xmm6 = B + D = D.
movdqa I(1), m4 ; Save C. at I(1)
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
movdqa m4, C(4) ; xmm4 = C4
movdqa m5, m3 ; xmm5 = B - D
pmulhw m3, m4 ; xmm3 = ( c4 - 1 ) * ( B - D )
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
movdqa I(2), m6 ; save D. at I(2)
movdqa m2, m0 ; xmm2 = A - C
movdqa m6, I(0) ; xmm6 = i0
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
movdqa m3, I(4) ; xmm3 = i4
psubsw m5, m1 ; xmm5 = B. - H = B..
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
psubsw m6, m3 ; xmm6 = i0 - i4
movdqa m0, m6 ; xmm0 = i0 - i4
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
paddsw m3, m3 ; xmm3 = i4 + i4
paddsw m1, m1 ; xmm1 = H + H
paddsw m3, m0 ; xmm3 = i0 + i4
paddsw m1, m5 ; xmm1 = B. + H = H.
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
psubsw m6, m2 ; xmm6 = F - A. = F.
paddsw m2, m2 ; xmm2 = A. + A.
movdqa m0, I(1) ; Load C. from I(1)
paddsw m2, m6 ; xmm2 = F + A. = A..
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
psubsw m2, m1 ; xmm2 = A.. - H. = R2
ADD(m2) ; Adjust R2 and R1 before shifting
paddsw m1, m1 ; xmm1 = H. + H.
paddsw m1, m2 ; xmm1 = A.. + H. = R1
SHIFT(m2) ; xmm2 = op2
psubsw m4, m7 ; xmm4 = E - G = E.
SHIFT(m1) ; xmm1 = op1
movdqa m3, I(2) ; Load D. from I(2)
paddsw m7, m7 ; xmm7 = G + G
paddsw m7, m4 ; xmm7 = E + G = G.
psubsw m4, m3 ; xmm4 = E. - D. = R4
ADD(m4) ; Adjust R4 and R3 before shifting
paddsw m3, m3 ; xmm3 = D. + D.
paddsw m3, m4 ; xmm3 = E. + D. = R3
SHIFT(m4) ; xmm4 = op4
psubsw m6, m5 ; xmm6 = F. - B.. = R6
SHIFT(m3) ; xmm3 = op3
ADD(m6) ; Adjust R6 and R5 before shifting
paddsw m5, m5 ; xmm5 = B.. + B..
paddsw m5, m6 ; xmm5 = F. + B.. = R5
SHIFT(m6) ; xmm6 = op6
SHIFT(m5) ; xmm5 = op5
psubsw m7, m0 ; xmm7 = G. - C. = R7
ADD(m7) ; Adjust R7 and R0 before shifting
paddsw m0, m0 ; xmm0 = C. + C.
paddsw m0, m7 ; xmm0 = G. + C.
SHIFT(m7) ; xmm7 = op7
SHIFT(m0) ; xmm0 = op0
%endmacro
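; Note on the "cN * x - x" comments above: the constants in
; vp3_idct_data are cos(N * pi / 16) scaled by 65536 (e.g. 64277 for
; C1), and those exceeding 0x7fff (C1..C5) are stored modulo 65536,
; i.e. as cN - 65536 when read as signed words.  pmulhw by such a word
; yields (cN * x >> 16) - x, which the later paddw of x corrects; C6
; and C7 fit in a signed word and need no correction.  A C sketch of
; the corrected multiply (mulhi_c is an illustrative name):
;
;   static int16_t mulhi_c(int16_t x, unsigned c)  /* 0x8000 <= c <= 0xffff */
;   {
;       int16_t k = (int16_t)c;  /* reinterpreted: equals c - 65536 */
;       return (int16_t)(((int32_t)x * k) >> 16) + x;  /* == c * x >> 16 */
;   }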
%macro PUT_BLOCK 8
movdqa O(0), m%1
movdqa O(1), m%2
movdqa O(2), m%3
movdqa O(3), m%4
movdqa O(4), m%5
movdqa O(5), m%6
movdqa O(6), m%7
movdqa O(7), m%8
%endmacro
%macro VP3_IDCT 0
%define I(x) [blockq+16*x]
%define O(x) [blockq+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
VP3_1D_IDCT_SSE2
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq], [blockq+16]
%endif
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%define SHIFT(x) psraw x, 4
%define ADD(x) paddsw x, [pw_8]
VP3_1D_IDCT_SSE2
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
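; VP3_IDCT runs the 1D transform above twice over the block in place:
; the first pass has ADD/SHIFT defined empty (no rounding), the 8x8
; result is transposed, and only the second pass rounds, adding pw_8
; and shifting each output right by 4.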
INIT_XMM sse2
; void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
cglobal vp3_idct_put, 3, 4, 9, dst, s, block, s3
VP3_IDCT
mova m4, [pb_80]
lea s3q, [sq*3]
mova m0, [blockq+mmsize*0]
mova m1, [blockq+mmsize*2]
mova m2, [blockq+mmsize*4]
mova m3, [blockq+mmsize*6]
packsswb m0, [blockq+mmsize*1]
packsswb m1, [blockq+mmsize*3]
packsswb m2, [blockq+mmsize*5]
packsswb m3, [blockq+mmsize*7]
paddb m0, m4
paddb m1, m4
paddb m2, m4
paddb m3, m4
movq [dstq ], m0
movhps [dstq+sq], m0
movq [dstq+sq*2], m1
movhps [dstq+s3q], m1
lea dstq, [dstq+sq*4]
movq [dstq ], m2
movhps [dstq+sq], m2
movq [dstq+sq*2], m3
movhps [dstq+s3q], m3
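; clear the coefficient block; callers expect it zeroed after idct_put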
pxor m0, m0
%assign offset 0
%rep 128/mmsize
mova [blockq+offset], m0
%assign offset offset+mmsize
%endrep
RET
; void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
VP3_IDCT
lea s3q, [sq*3]
pxor m4, m4
%assign i 0
%rep 2
movq m0, [dstq]
movq m1, [dstq+sq]
movq m2, [dstq+sq*2]
movq m3, [dstq+s3q]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
paddsw m0, [blockq+ 0+i]
paddsw m1, [blockq+16+i]
paddsw m2, [blockq+32+i]
paddsw m3, [blockq+48+i]
packuswb m0, m1
packuswb m2, m3
movq [dstq ], m0
movhps [dstq+sq], m0
movq [dstq+sq*2], m2
movhps [dstq+s3q], m2
%if i == 0
lea dstq, [dstq+sq*4]
%endif
%assign i i+64
%endrep
%assign i 0
%rep 128/mmsize
mova [blockq+i], m4
%assign i i+mmsize
%endrep
RET
%macro DC_ADD 0
movq m2, [dstq ]
movq m3, [dstq+sq ]
paddusb m2, m0
movq m4, [dstq+sq*2]
paddusb m3, m0
movq m5, [dstq+s3q ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
movq [dstq ], m2
psubusb m4, m1
movq [dstq+sq ], m3
psubusb m5, m1
movq [dstq+sq*2], m4
movq [dstq+s3q ], m5
%endmacro
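; DC_ADD applies a signed dc to unsigned pixels using only unsigned
; saturating ops: m0 holds the positive part of the dc and m1 the
; negative part (each packed to bytes), so paddusb/psubusb apply
; whichever part is nonzero with correct clamping to [0, 255].
; A per-pixel C sketch (add_dc is an illustrative name):
;
;   static uint8_t add_dc(uint8_t p, int dc)
;   {
;       unsigned pos = dc > 0 ?  dc : 0;        /* packuswb of  dc */
;       unsigned neg = dc < 0 ? -dc : 0;        /* packuswb of -dc */
;       unsigned v = p + pos > 255 ? 255 : p + pos;   /* paddusb */
;       return v > neg ? v - neg : 0;                 /* psubusb */
;   }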
INIT_MMX mmxext
; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
movsx dcd, word [blockq]
mov word [blockq], 0
%define s3q blockq ; blockq is free now that the dc is read and cleared
lea s3q, [sq*3]
add dcd, 15
sar dcd, 5 ; dc = (block[0] + 15) >> 5
movd m0, dcd
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
DC_ADD
lea dstq, [dstq+sq*4]
DC_ADD
RET