mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-30 22:00:51 +08:00
avcodec/h264: add avx 8-bit h264_idct_dc_add
Haswell: 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext. Skylake-U: 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext.
This commit is contained in:
@@ -1158,7 +1158,27 @@ INIT_XMM avx
|
||||
movd [%7+%8], %4
|
||||
%endmacro
|
||||
|
||||
; DC_ADD_INIT reg
;   Prepares the constants for adding a single DC value to a block of pixels.
;   In:  %1 = general-purpose register holding the raw DC coefficient
;             (accessed through its 32-bit view, %1d).
;        stride_q must be a defined register name at the expansion site.
;   Out: m0 = max(dc, 0) saturated to unsigned bytes, repeated in the low bytes
;        m1 = max(-dc, 0) saturated to unsigned bytes, repeated in the low bytes
;        %1 = 3 * stride_q (the DC value is no longer available in %1)
;   where dc = (raw + 32) >> 6 (arithmetic shift).
;   NOTE(review): m0/m1 are presumably consumed by a saturating add/subtract
;   pair in the pixel-update macro; that macro is not fully visible here.
%macro DC_ADD_INIT 1
|
||||
; Round to nearest: add half of 64 before the arithmetic shift right by 6.
add %1d, 32
|
||||
sar %1d, 6
|
||||
movd m0, %1d
|
||||
; Replicate the low 16-bit word into the four low words of m0.
pshuflw m0, m0, 0
|
||||
; The GPR is free once the DC is in m0; reuse it to hold 3*stride.
lea %1, [3*stride_q]
|
||||
; m1 = 0 - m0, i.e. the negated DC in every word.
pxor m1, m1
|
||||
psubw m1, m0
|
||||
; Unsigned saturation clamps negative words to 0, so exactly one of m0/m1
; is non-zero for a non-zero DC: m0 carries the positive part, m1 the negative.
packuswb m0, m0
|
||||
packuswb m1, m1
|
||||
%endmacro
|
||||
|
||||
; h264_idct_add_8(dst, block, stride)
;   Entry point declared with x86inc's cglobal; the hunk header shows it is
;   assembled under INIT_XMM avx. The named arguments become dst_q, block_q
;   and stride_q / stride_d.
;   NOTE(review): the numeric fields "3, 3, 8" are presumably x86inc's
;   argument count, GPR count and XMM register count; confirm against x86inc.asm.
;   The transform and the add to dst are done entirely by IDCT4_ADD, which is
;   defined outside this view.
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
|
||||
; NOTE(review): x86inc helper; presumably sign-extends the 32-bit stride to
; 64 bits when the two operands are not the same register. Confirm.
movsxdifnidn stride_q, stride_d
|
||||
IDCT4_ADD dst_q, block_q, stride_q
|
||||
RET
|
||||
|
||||
; h264_idct_dc_add_8(dst, block, stride)
;   DC-only variant: reads one signed 16-bit coefficient from block, clears it,
;   and hands the value to DC_ADD_INIT / DC_ADD_MMXEXT_OP to update dst.
;   Uses one scratch GPR (r3) beyond the three named arguments.
;   NOTE(review): the numeric fields "3, 4, 6" are presumably x86inc's
;   argument count, GPR count and XMM register count; confirm against x86inc.asm.
cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
|
||||
; NOTE(review): x86inc helper; presumably sign-extends the 32-bit stride to
; 64 bits when the two operands are not the same register. Confirm.
movsxdifnidn stride_q, stride_d
|
||||
; Load the DC coefficient, sign-extended from 16 to 32 bits.
movsx r3d, word [block_q]
|
||||
; 32-bit store of zero: clears the DC word just read and the 2 bytes after it.
mov dword [block_q], 0
|
||||
; After this expansion m0/m1 hold the clamped +dc/-dc bytes and r3 = 3*stride.
DC_ADD_INIT r3
|
||||
; Pixel update; only the tail of this macro is visible in this chunk.
DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
|
||||
RET
|
||||
|
||||
@@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
|
||||
/* One IDCT_ADD_FUNC(name-infix, bit depth, cpu suffix) line per assembly routine.
 * NOTE(review): the macro is defined outside this view; judging by the
 * ff_h264_idct_add_8_avx / ff_h264_idct_dc_add_8_avx symbols assigned in the
 * init function, it presumably declares ff_h264_idct<infix>_add_<depth>_<cpu>.
 * Confirm against the macro definition. */
IDCT_ADD_FUNC(, 8, avx)
|
||||
IDCT_ADD_FUNC(, 10, sse2)
|
||||
IDCT_ADD_FUNC(_dc, 8, mmxext)
|
||||
IDCT_ADD_FUNC(_dc, 8, avx)
|
||||
IDCT_ADD_FUNC(_dc, 10, mmxext)
|
||||
IDCT_ADD_FUNC(8_dc, 8, mmxext)
|
||||
IDCT_ADD_FUNC(8_dc, 10, sse2)
|
||||
@@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
}
|
||||
|
||||
c->h264_idct_add = ff_h264_idct_add_8_avx;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
|
||||
}
|
||||
} else if (bit_depth == 10) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
|
||||
Reference in New Issue
Block a user