From 415b466d41ac81856abc76d7a9341132b0f668b0 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sun, 12 Apr 2026 23:50:52 +0200 Subject: [PATCH] avcodec/x86/vp3dsp: Port ff_vp3_idct_dc_add_mmxext to SSE2 This change should improve performance on Skylake and later Intel CPUs (which have only half the ports for saturated adds/subs for mmx registers compared to xmm registers): llvm-mca predicts a 25% performance improvement on Skylake. Signed-off-by: Andreas Rheinhardt --- libavcodec/x86/vp3dsp.asm | 12 ++++++------ libavcodec/x86/vp3dsp_init.c | 7 ++----- tests/checkasm/vp3dsp.c | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 096c47057a..e9bc184800 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* MMX/SSE2-optimized functions for the VP3 decoder +;* SSE2-optimized functions for the VP3 decoder ;* Copyright (c) 2007 Aurelien Jacobs ;* ;* This file is part of FFmpeg. @@ -21,7 +21,7 @@ %include "libavutil/x86/x86util.asm" -; MMX-optimized functions cribbed from the original VP3 source code. +; ASM-optimized functions cribbed from the original VP3 source code. 
SECTION_RODATA @@ -421,9 +421,9 @@ cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3 movq [dstq+s3q ], m5 %endmacro -INIT_MMX mmxext -; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block) -cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc +INIT_XMM sse2 +; void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block) +cglobal vp3_idct_dc_add, 3, 4, 6, dst, s, block, dc movsx dcd, word [blockq] mov word [blockq], 0 %define s3q blockq @@ -431,7 +431,7 @@ cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc add dcd, 15 sar dcd, 5 movd m0, dcd - pshufw m0, m0, 0x0 + SPLATW m0, m0 pxor m1, m1 psubw m1, m0 packuswb m0, m0 diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index 4a416c6f9a..0ba4f00ed6 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -29,7 +29,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block); -void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride, int *bounding_values); @@ -44,15 +44,12 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(cpu_flags)) { - c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; - } - if (EXTERNAL_SSE2(cpu_flags)) { c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2; c->idct_put = ff_vp3_idct_put_sse2; c->idct_add = ff_vp3_idct_add_sse2; + c->idct_dc_add = ff_vp3_idct_dc_add_sse2; c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2; c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2; diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c index f75e4a0617..7890364ee5 100644 --- a/tests/checkasm/vp3dsp.c +++ b/tests/checkasm/vp3dsp.c @@ -92,7 +92,7 @@ 
static void vp3_check_idct(int nb_bits) BUF_SIZE = MAX_STRIDE * (NB_LINES - 1) + WIDTH, }; - declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block); + declare_func(void, uint8_t *dest, ptrdiff_t stride, int16_t *block); DECLARE_ALIGNED(16, int16_t, block_new)[64]; DECLARE_ALIGNED(16, int16_t, block_ref)[64];