avcodec/x86/vp3dsp: Port ff_vp3_idct_dc_add_mmxext to SSE2

This change should improve performance on Skylake and later
Intel CPUs (which have only half the ports for saturated adds/subs
for mmx registers compared to xmm registers): llvm-mca predicts
a 25% performance improvement on Skylake.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-04-12 23:50:52 +02:00
parent e7a613b274
commit 415b466d41
3 changed files with 9 additions and 12 deletions

View File

@@ -1,5 +1,5 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
@@ -21,7 +21,7 @@
%include "libavutil/x86/x86util.asm"
; MMX-optimized functions cribbed from the original VP3 source code.
; ASM-optimized functions cribbed from the original VP3 source code.
SECTION_RODATA
@@ -421,9 +421,9 @@ cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
movq [dstq+s3q ], m5
%endmacro
INIT_MMX mmxext
; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
INIT_XMM sse2
; void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
cglobal vp3_idct_dc_add, 3, 4, 6, dst, s, block, dc
movsx dcd, word [blockq]
mov word [blockq], 0
%define s3q blockq
@@ -431,7 +431,7 @@ cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
add dcd, 15
sar dcd, 5
movd m0, dcd
pshufw m0, m0, 0x0
SPLATW m0, m0
pxor m1, m1
psubw m1, m0
packuswb m0, m0

View File

@@ -29,7 +29,7 @@
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
@@ -44,15 +44,12 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2;
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
c->idct_dc_add = ff_vp3_idct_dc_add_sse2;
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;

View File

@@ -92,7 +92,7 @@ static void vp3_check_idct(int nb_bits)
BUF_SIZE = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
};
declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
declare_func(void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
DECLARE_ALIGNED(16, int16_t, block_new)[64];
DECLARE_ALIGNED(16, int16_t, block_ref)[64];