avcodec/x86/vp3dsp: Port ff_vp3_idct_dc_add_mmxext to SSE2

This change should improve performance on Skylake and later Intel CPUs, which have only half as many execution ports for saturated adds/subs on MMX registers as on XMM registers: llvm-mca predicts a 25% performance improvement on Skylake.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2026-04-20 12:50:49 +08:00 · 2026-04-12 23:50:52 +02:00
parent e7a613b274
commit 415b466d41
3 changed files with 9 additions and 12 deletions
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* MMX/SSE2-optimized functions for the VP3 decoder
+;* SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
 ;* This file is part of FFmpeg.
@@ -21,7 +21,7 @@

 %include "libavutil/x86/x86util.asm"

-; MMX-optimized functions cribbed from the original VP3 source code.
+; ASM-optimized functions cribbed from the original VP3 source code.

 SECTION_RODATA

@@ -421,9 +421,9 @@ cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
    movq [dstq+s3q ], m5
 %endmacro

-INIT_MMX mmxext
-; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
-cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
+INIT_XMM sse2
+; void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
+cglobal vp3_idct_dc_add, 3, 4, 6, dst, s, block, dc
    movsx        dcd, word [blockq]
    mov word [blockq], 0
 %define s3q blockq
@@ -431,7 +431,7 @@ cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
    add          dcd, 15
    sar          dcd, 5
    movd          m0, dcd
-    pshufw        m0, m0, 0x0
+    SPLATW        m0, m0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -29,7 +29,7 @@
 void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);

-void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);

 void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
                               int *bounding_values);
@@ -44,15 +44,12 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
 {
    int cpu_flags = av_get_cpu_flags();

-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
-    }
-
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2;

        c->idct_put  = ff_vp3_idct_put_sse2;
        c->idct_add  = ff_vp3_idct_add_sse2;
+        c->idct_dc_add = ff_vp3_idct_dc_add_sse2;

        c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
        c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -92,7 +92,7 @@ static void vp3_check_idct(int nb_bits)
        BUF_SIZE     = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
    };

-    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    declare_func(void, uint8_t *dest, ptrdiff_t stride, int16_t *block);

    DECLARE_ALIGNED(16, int16_t, block_new)[64];
    DECLARE_ALIGNED(16, int16_t, block_ref)[64];