mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-21 13:21:55 +08:00
avcodec/x86/h264_deblock: Avoid MMX in deblock_h_luma_8
Old benchmarks: h264_h_loop_filter_luma_8bpp_c: 59.9 ( 1.00x) h264_h_loop_filter_luma_8bpp_sse2: 67.9 ( 0.88x) h264_h_loop_filter_luma_8bpp_avx: 67.4 ( 0.89x) New benchmarks: h264_h_loop_filter_luma_8bpp_c: 60.0 ( 1.00x) h264_h_loop_filter_luma_8bpp_sse2: 65.4 ( 0.92x) h264_h_loop_filter_luma_8bpp_avx: 65.3 ( 0.92x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -37,35 +37,25 @@ cextern pb_0
|
||||
cextern pb_1
|
||||
cextern pb_3
|
||||
|
||||
; in: 4 rows of 8 bytes in m0..m3
|
||||
; out: 8 rows of 4 bytes in %1..%8
|
||||
%macro TRANSPOSE8x4B_STORE 8
|
||||
punpckhdq m4, m0, m0
|
||||
punpckhdq m5, m1, m1
|
||||
punpckhdq m6, m2, m2
|
||||
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m2, m3
|
||||
punpcklwd m1, m0, m2
|
||||
punpckhwd m0, m2
|
||||
movh %1, m1
|
||||
punpckhdq m1, m1
|
||||
movh %2, m1
|
||||
movh %3, m0
|
||||
punpckhdq m0, m0
|
||||
movh %4, m0
|
||||
|
||||
punpckhdq m3, m3
|
||||
punpcklbw m4, m5
|
||||
punpcklbw m6, m3
|
||||
punpcklwd m5, m4, m6
|
||||
punpckhwd m4, m6
|
||||
movh %5, m5
|
||||
punpckhdq m5, m5
|
||||
movh %6, m5
|
||||
movh %7, m4
|
||||
punpckhdq m4, m4
|
||||
movh %8, m4
|
||||
; in: 2 rows of 8 words in %1, %2
|
||||
; out: 8 rows of 4 bytes in %3..%10
|
||||
%macro TRANSPOSE8x4B_STORE 10
|
||||
punpcklwd m6, %1, %2
|
||||
movd %3, m6
|
||||
pshufd m7, m6, 00110001b
|
||||
punpckhqdq m6, m6
|
||||
movd %4, m7
|
||||
punpckhqdq m7, m7
|
||||
punpckhwd %1, %2
|
||||
movd %5, m6
|
||||
movd %6, m7
|
||||
pshufd m6, %1, 00110001b
|
||||
movd %7, %1
|
||||
punpckhqdq %1, %1
|
||||
movd %8, m6
|
||||
punpckhqdq m6, m6
|
||||
movd %9, %1
|
||||
movd %10, m6
|
||||
%endmacro
|
||||
|
||||
%macro SBUTTERFLY3 4
|
||||
@@ -297,8 +287,8 @@ cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
|
||||
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
|
||||
; int8_t *tc0)
|
||||
;-----------------------------------------------------------------------------
|
||||
INIT_MMX cpuname
|
||||
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
|
||||
cglobal deblock_h_luma_8, 5,9,8,0x60+16*WIN64
|
||||
INIT_MMX cpuname
|
||||
movsxd r7, r1d
|
||||
lea r8, [r7+r7*2]
|
||||
lea r6, [r0-4]
|
||||
@@ -325,25 +315,30 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
|
||||
%endif
|
||||
call deblock_v_luma_8
|
||||
|
||||
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
|
||||
add r6, 2
|
||||
add r5, 2
|
||||
movq m0, [pix_tmp+0x18]
|
||||
movq m1, [pix_tmp+0x28]
|
||||
movq m2, [pix_tmp+0x38]
|
||||
movq m3, [pix_tmp+0x48]
|
||||
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
|
||||
|
||||
INIT_XMM cpuname
|
||||
|
||||
; transpose 16x4 (only the middle 4 rows were changed by the filter)
|
||||
mova m0, [pix_tmp+0x10]
|
||||
mova m1, [pix_tmp+0x20]
|
||||
mova m2, [pix_tmp+0x30]
|
||||
mova m3, [pix_tmp+0x40]
|
||||
|
||||
punpckhbw m4, m0, m1
|
||||
punpckhbw m5, m2, m3
|
||||
|
||||
TRANSPOSE8x4B_STORE m4, m5, PASS8ROWS(r6, r5, r7, r8)
|
||||
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m2, m3
|
||||
shl r7, 3
|
||||
sub r6, r7
|
||||
sub r5, r7
|
||||
shr r7, 3
|
||||
movq m0, [pix_tmp+0x10]
|
||||
movq m1, [pix_tmp+0x20]
|
||||
movq m2, [pix_tmp+0x30]
|
||||
movq m3, [pix_tmp+0x40]
|
||||
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
|
||||
|
||||
TRANSPOSE8x4B_STORE m0, m2, PASS8ROWS(r6, r5, r7, r8)
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@@ -499,8 +494,8 @@ cglobal deblock_v_luma_8, 5,5,8,2*%1
|
||||
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
|
||||
; int8_t *tc0)
|
||||
;-----------------------------------------------------------------------------
|
||||
INIT_MMX cpuname
|
||||
cglobal deblock_h_luma_8, 0,5,8,0x60+12
|
||||
INIT_MMX cpuname
|
||||
mov r0, r0mp
|
||||
mov r3, r1m
|
||||
lea r4, [r3*3]
|
||||
@@ -524,24 +519,28 @@ cglobal deblock_h_luma_8, 0,5,8,0x60+12
|
||||
call deblock_v_luma_8
|
||||
ADD esp, 20
|
||||
|
||||
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
|
||||
mov r0, r0mp
|
||||
sub r0, 2
|
||||
INIT_XMM cpuname
|
||||
|
||||
movq m0, [pix_tmp+0x10]
|
||||
movq m1, [pix_tmp+0x20]
|
||||
lea r1, [r0+r4]
|
||||
movq m2, [pix_tmp+0x30]
|
||||
movq m3, [pix_tmp+0x40]
|
||||
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
|
||||
; transpose 16x4 (only the middle 4 rows were changed by the filter)
|
||||
mova m0, [pix_tmp+0x10]
|
||||
mova m1, [pix_tmp+0x20]
|
||||
mova m2, [pix_tmp+0x30]
|
||||
mova m3, [pix_tmp+0x40]
|
||||
|
||||
mov r0, r0mp
|
||||
punpcklbw m4, m0, m1
|
||||
sub r0, 2
|
||||
punpcklbw m5, m2, m3
|
||||
lea r1, [r0+r4]
|
||||
|
||||
TRANSPOSE8x4B_STORE m4, m5, PASS8ROWS(r0, r1, r3, r4)
|
||||
|
||||
punpckhbw m0, m1
|
||||
lea r0, [r0+r3*8]
|
||||
punpckhbw m2, m3
|
||||
lea r1, [r1+r3*8]
|
||||
movq m0, [pix_tmp+0x18]
|
||||
movq m1, [pix_tmp+0x28]
|
||||
movq m2, [pix_tmp+0x38]
|
||||
movq m3, [pix_tmp+0x48]
|
||||
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
|
||||
|
||||
TRANSPOSE8x4B_STORE m0, m2, PASS8ROWS(r0, r1, r3, r4)
|
||||
|
||||
RET
|
||||
%endmacro ; DEBLOCK_LUMA
|
||||
|
||||
Reference in New Issue
Block a user