mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avcodec/x86/h264_intrapred: Replace pred8x8_top_dc_8_mmxext with SSE2
This change is more about deprecating MMX than about any performance gain: the performance numbers are nearly identical on my Zen 4 (1.36x vs. C), but llvm-mca predicts a >60% performance gain on Intel CPUs newer than Skylake. Signed-off-by: Zuxy Meng <zuxy.meng@gmail.com>
This commit is contained in:
@@ -606,37 +606,31 @@ INIT_MMX ssse3
|
||||
PRED8x8_H
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
|
||||
; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
INIT_MMX mmxext
|
||||
INIT_XMM sse2
|
||||
cglobal pred8x8_top_dc_8, 2,5
|
||||
sub r0, r1
|
||||
movq mm0, [r0]
|
||||
pxor mm1, mm1
|
||||
pxor mm2, mm2
|
||||
movq xmm0, [r0]
|
||||
pxor xmm1, xmm1
|
||||
lea r2, [r0+r1*2]
|
||||
punpckhbw mm1, mm0
|
||||
punpcklbw mm0, mm2
|
||||
psadbw mm1, mm2 ; s1
|
||||
punpcklbw xmm0, xmm1
|
||||
psadbw xmm0, xmm1 ; s0,0,0,0,s1,0,0,0 (w)
|
||||
lea r3, [r2+r1*2]
|
||||
psadbw mm0, mm2 ; s0
|
||||
psrlw mm1, 1
|
||||
psrlw mm0, 1
|
||||
pavgw mm1, mm2
|
||||
lea r4, [r3+r1*2]
|
||||
pavgw mm0, mm2
|
||||
pshufw mm1, mm1, 0
|
||||
pshufw mm0, mm0, 0 ; dc0 (w)
|
||||
packuswb mm0, mm1 ; dc0,dc1 (b)
|
||||
movq [r0+r1*1], mm0
|
||||
movq [r0+r1*2], mm0
|
||||
psrlw xmm0, 1
|
||||
pavgw xmm0, xmm1
|
||||
pshuflw xmm0, xmm0, 0 ; dc0,dc0,dc0,dc0,dc1,0,0,0
|
||||
pshufhw xmm0, xmm0, 0 ; dc0,dc1 (w)
|
||||
packuswb xmm0, xmm1 ; dc0,dc1 (b)
|
||||
movq [r0+r1*1], xmm0
|
||||
movq [r0+r1*2], xmm0
|
||||
lea r0, [r3+r1*2]
|
||||
movq [r2+r1*1], mm0
|
||||
movq [r2+r1*2], mm0
|
||||
movq [r3+r1*1], mm0
|
||||
movq [r3+r1*2], mm0
|
||||
movq [r0+r1*1], mm0
|
||||
movq [r0+r1*2], mm0
|
||||
movq [r2+r1*1], xmm0
|
||||
movq [r2+r1*2], xmm0
|
||||
movq [r3+r1*1], xmm0
|
||||
movq [r3+r1*2], xmm0
|
||||
movq [r0+r1*1], xmm0
|
||||
movq [r0+r1*2], xmm0
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
@@ -113,7 +113,7 @@ PRED16x16(plane_svq3, 8, ssse3)
|
||||
PRED16x16(tm_vp8, 8, sse2)
|
||||
PRED16x16(tm_vp8, 8, avx2)
|
||||
|
||||
PRED8x8(top_dc, 8, mmxext)
|
||||
PRED8x8(top_dc, 8, sse2)
|
||||
PRED8x8(dc_rv40, 8, mmxext)
|
||||
PRED8x8(dc, 8, mmxext)
|
||||
PRED8x8(vertical, 8, sse2)
|
||||
@@ -187,7 +187,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
|
||||
}
|
||||
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
|
||||
if (chroma_format_idc <= 1) {
|
||||
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
|
||||
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
|
||||
}
|
||||
}
|
||||
@@ -210,8 +209,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
|
||||
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
|
||||
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
|
||||
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
|
||||
if (chroma_format_idc <= 1)
|
||||
if (chroma_format_idc <= 1) {
|
||||
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_sse2;
|
||||
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
|
||||
h->pred8x8 [TOP_DC_PRED8x8] = ff_pred8x8_top_dc_8_sse2;
|
||||
}
|
||||
}
|
||||
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
|
||||
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
|
||||
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
|
||||
|
||||
Reference in New Issue
Block a user