mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avcodec/x86/vvc/of: Break dependency chain
Don't extract and update one word of one and the same register at a time; use separate src and dst registers, so that pextrw and bsr can be done in parallel. Also use movd instead of pinsrw for the first word.

Old benchmarks:
apply_bdof_8_8x16_c: 3275.2 ( 1.00x)
apply_bdof_8_8x16_avx2: 487.6 ( 6.72x)
apply_bdof_8_16x8_c: 3243.1 ( 1.00x)
apply_bdof_8_16x8_avx2: 284.4 (11.40x)
apply_bdof_8_16x16_c: 6501.8 ( 1.00x)
apply_bdof_8_16x16_avx2: 570.0 (11.41x)
apply_bdof_10_8x16_c: 3286.5 ( 1.00x)
apply_bdof_10_8x16_avx2: 461.7 ( 7.12x)
apply_bdof_10_16x8_c: 3274.5 ( 1.00x)
apply_bdof_10_16x8_avx2: 271.4 (12.06x)
apply_bdof_10_16x16_c: 6590.0 ( 1.00x)
apply_bdof_10_16x16_avx2: 543.9 (12.12x)
apply_bdof_12_8x16_c: 3307.6 ( 1.00x)
apply_bdof_12_8x16_avx2: 462.2 ( 7.16x)
apply_bdof_12_16x8_c: 3287.4 ( 1.00x)
apply_bdof_12_16x8_avx2: 271.8 (12.10x)
apply_bdof_12_16x16_c: 6465.7 ( 1.00x)
apply_bdof_12_16x16_avx2: 543.8 (11.89x)

New benchmarks:
apply_bdof_8_8x16_c: 3255.7 ( 1.00x)
apply_bdof_8_8x16_avx2: 349.3 ( 9.32x)
apply_bdof_8_16x8_c: 3262.5 ( 1.00x)
apply_bdof_8_16x8_avx2: 214.8 (15.19x)
apply_bdof_8_16x16_c: 6471.6 ( 1.00x)
apply_bdof_8_16x16_avx2: 429.8 (15.06x)
apply_bdof_10_8x16_c: 3227.7 ( 1.00x)
apply_bdof_10_8x16_avx2: 321.6 (10.04x)
apply_bdof_10_16x8_c: 3250.2 ( 1.00x)
apply_bdof_10_16x8_avx2: 201.2 (16.16x)
apply_bdof_10_16x16_c: 6476.5 ( 1.00x)
apply_bdof_10_16x16_avx2: 400.9 (16.16x)
apply_bdof_12_8x16_c: 3230.7 ( 1.00x)
apply_bdof_12_8x16_avx2: 321.8 (10.04x)
apply_bdof_12_16x8_c: 3210.5 ( 1.00x)
apply_bdof_12_16x8_avx2: 200.9 (15.98x)
apply_bdof_12_16x16_c: 6474.5 ( 1.00x)
apply_bdof_12_16x16_avx2: 400.2 (16.18x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -251,21 +251,25 @@ INIT_YMM avx2
|
||||
psrlw %3, %4
|
||||
%endmacro
|
||||
|
||||
%macro LOG2 2 ; dst/src, offset
|
||||
pextrw tmp0d, xm%1, %2
|
||||
%macro LOG2 3 ; dst, src, offset
|
||||
pextrw tmp0d, xm%2, %3
|
||||
bsr tmp0d, tmp0d
|
||||
pinsrw xm%1, tmp0d, %2
|
||||
%if %3 != 0
|
||||
pinsrw xm%1, tmp0d, %3
|
||||
%else
|
||||
movd xm%1, tmp0d
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro LOG2 1 ; dst/src
|
||||
LOG2 %1, 0
|
||||
LOG2 %1, 1
|
||||
LOG2 %1, 2
|
||||
LOG2 %1, 3
|
||||
LOG2 %1, 4
|
||||
LOG2 %1, 5
|
||||
LOG2 %1, 6
|
||||
LOG2 %1, 7
|
||||
%macro LOG2 2 ; dst, src
|
||||
LOG2 %1, %2, 0
|
||||
LOG2 %1, %2, 1
|
||||
LOG2 %1, %2, 2
|
||||
LOG2 %1, %2, 3
|
||||
LOG2 %1, %2, 4
|
||||
LOG2 %1, %2, 5
|
||||
LOG2 %1, %2, 6
|
||||
LOG2 %1, %2, 7
|
||||
%endmacro
|
||||
|
||||
; %1: 4 (sgx2, sgy2, sgxdi, sgydi)
|
||||
@@ -277,8 +281,7 @@ INIT_YMM avx2
|
||||
|
||||
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
|
||||
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
|
||||
mova m10, m8
|
||||
LOG2 10 ; 4 (log2(sgx2), log2(sgy2))
|
||||
LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2))
|
||||
|
||||
; Promote to dword since vpsrlvw is AVX-512 only
|
||||
pmovsxwd m8, xm8
|
||||
|
||||
Reference in New Issue
Block a user