avcodec/x86/h264_intrapred: Replace pred8x8_top_dc_8_mmxext with SSE2

This change is more about deprecating MMX than about any performance gain:
the performance numbers are nearly identical on my Zen 4 (1.36x vs. C), but
llvm-mca predicts a >60% perf gain on Intel CPUs newer than Skylake.

Signed-off-by: Zuxy Meng <zuxy.meng@gmail.com>
This commit is contained in:
zuxy
2026-03-29 19:31:29 -07:00
committed by Zuxy Meng
parent c29465bcb6
commit 56b97c03d4
2 changed files with 25 additions and 28 deletions

View File

@@ -606,37 +606,31 @@ INIT_MMX ssse3
PRED8x8_H
;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
INIT_XMM sse2
cglobal pred8x8_top_dc_8, 2,5
sub r0, r1
movq mm0, [r0]
pxor mm1, mm1
pxor mm2, mm2
movq xmm0, [r0]
pxor xmm1, xmm1
lea r2, [r0+r1*2]
punpckhbw mm1, mm0
punpcklbw mm0, mm2
psadbw mm1, mm2 ; s1
punpcklbw xmm0, xmm1
psadbw xmm0, xmm1 ; s0,0,0,0,s1,0,0,0 (w)
lea r3, [r2+r1*2]
psadbw mm0, mm2 ; s0
psrlw mm1, 1
psrlw mm0, 1
pavgw mm1, mm2
lea r4, [r3+r1*2]
pavgw mm0, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
packuswb mm0, mm1 ; dc0,dc1 (b)
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
psrlw xmm0, 1
pavgw xmm0, xmm1
pshuflw xmm0, xmm0, 0 ; dc0,dc0,dc0,dc0,dc1,0,0,0
pshufhw xmm0, xmm0, 0 ; dc0,dc1 (w)
packuswb xmm0, xmm1 ; dc0,dc1 (b)
movq [r0+r1*1], xmm0
movq [r0+r1*2], xmm0
lea r0, [r3+r1*2]
movq [r2+r1*1], mm0
movq [r2+r1*2], mm0
movq [r3+r1*1], mm0
movq [r3+r1*2], mm0
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
movq [r2+r1*1], xmm0
movq [r2+r1*2], xmm0
movq [r3+r1*1], xmm0
movq [r3+r1*2], xmm0
movq [r0+r1*1], xmm0
movq [r0+r1*2], xmm0
RET
;-----------------------------------------------------------------------------

View File

@@ -113,7 +113,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(top_dc, 8, sse2)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, sse2)
@@ -187,7 +187,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc <= 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
@@ -210,8 +209,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (chroma_format_idc <= 1)
if (chroma_format_idc <= 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred8x8 [TOP_DC_PRED8x8] = ff_pred8x8_top_dc_8_sse2;
}
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;