From 56b97c03d4e68aa7ed42b5bfd470166a7235e1aa Mon Sep 17 00:00:00 2001
From: zuxy
Date: Sun, 29 Mar 2026 19:31:29 -0700
Subject: [PATCH] avcodec/x86/h264_intrapred: Replace pred8x8_top_dc_8_mmxext with SSE2

More about deprecating MMX than any performance gain; nearly identical
performance numbers on my Zen 4 (1.36x vs c), but llvm-mca predicts >60%
perf gain on Intel CPUs newer than Skylake.

Signed-off-by: Zuxy Meng
---
 libavcodec/x86/h264_intrapred.asm    | 44 ++++++++++++----------------
 libavcodec/x86/h264_intrapred_init.c |  9 ++++--
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index a8a630dbe6..2e4a46ae76 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -606,37 +606,31 @@ INIT_MMX ssse3
 PRED8x8_H
 
 ;-----------------------------------------------------------------------------
-; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
+; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
-INIT_MMX mmxext
+INIT_XMM sse2
 cglobal pred8x8_top_dc_8, 2,5
     sub r0, r1
-    movq mm0, [r0]
-    pxor mm1, mm1
-    pxor mm2, mm2
+    movq xmm0, [r0]
+    pxor xmm1, xmm1
     lea r2, [r0+r1*2]
-    punpckhbw mm1, mm0
-    punpcklbw mm0, mm2
-    psadbw mm1, mm2 ; s1
+    punpcklbw xmm0, xmm1
+    psadbw xmm0, xmm1 ; s0,0,0,0,s1,0,0,0 (w)
     lea r3, [r2+r1*2]
-    psadbw mm0, mm2 ; s0
-    psrlw mm1, 1
-    psrlw mm0, 1
-    pavgw mm1, mm2
-    lea r4, [r3+r1*2]
-    pavgw mm0, mm2
-    pshufw mm1, mm1, 0
-    pshufw mm0, mm0, 0 ; dc0 (w)
-    packuswb mm0, mm1 ; dc0,dc1 (b)
-    movq [r0+r1*1], mm0
-    movq [r0+r1*2], mm0
+    psrlw xmm0, 1
+    pavgw xmm0, xmm1
+    pshuflw xmm0, xmm0, 0 ; dc0,dc0,dc0,dc0,dc1,0,0,0
+    pshufhw xmm0, xmm0, 0 ; dc0,dc1 (w)
+    packuswb xmm0, xmm1 ; dc0,dc1 (b)
+    movq [r0+r1*1], xmm0
+    movq [r0+r1*2], xmm0
     lea r0, [r3+r1*2]
-    movq [r2+r1*1], mm0
-    movq [r2+r1*2], mm0
-    movq [r3+r1*1], mm0
-    movq [r3+r1*2], mm0
-    movq [r0+r1*1], mm0
-    movq [r0+r1*2], mm0
+    movq [r2+r1*1], xmm0
+    movq [r2+r1*2], xmm0
+    movq [r3+r1*1], xmm0
+    movq [r3+r1*2], xmm0
+    movq [r0+r1*1], xmm0
+    movq [r0+r1*2], xmm0
     RET
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index aa9bc721f0..0476980a57 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -113,7 +113,7 @@ PRED16x16(plane_svq3, 8, ssse3)
 PRED16x16(tm_vp8, 8, sse2)
 PRED16x16(tm_vp8, 8, avx2)
 
-PRED8x8(top_dc, 8, mmxext)
+PRED8x8(top_dc, 8, sse2)
 PRED8x8(dc_rv40, 8, mmxext)
 PRED8x8(dc, 8, mmxext)
 PRED8x8(vertical, 8, sse2)
@@ -187,7 +187,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
         }
         if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
             if (chroma_format_idc <= 1) {
-                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
                 h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
             }
         }
@@ -210,8 +209,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
         h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
         h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
         h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
-        if (chroma_format_idc <= 1)
+        if (chroma_format_idc <= 1) {
             h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_sse2;
+            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+                h->pred8x8 [TOP_DC_PRED8x8] = ff_pred8x8_top_dc_8_sse2;
+            }
+        }
         if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
             h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
             h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;