mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-05-01 06:13:08 +08:00
avcodec/x86/vvc/of: Avoid scalar log2
Instead convert the integers to floats and inspect the exponent.

Old benchmarks:
apply_bdof_8_8x16_c: 3295.2 ( 1.00x)
apply_bdof_8_8x16_avx2: 312.7 (10.54x)
apply_bdof_8_16x8_c: 3269.1 ( 1.00x)
apply_bdof_8_16x8_avx2: 203.6 (16.05x)
apply_bdof_8_16x16_c: 6584.8 ( 1.00x)
apply_bdof_8_16x16_avx2: 413.6 (15.92x)
apply_bdof_10_8x16_c: 3313.9 ( 1.00x)
apply_bdof_10_8x16_avx2: 321.5 (10.31x)
apply_bdof_10_16x8_c: 3306.5 ( 1.00x)
apply_bdof_10_16x8_avx2: 200.4 (16.50x)
apply_bdof_10_16x16_c: 6659.7 ( 1.00x)
apply_bdof_10_16x16_avx2: 402.4 (16.55x)
apply_bdof_12_8x16_c: 3305.7 ( 1.00x)
apply_bdof_12_8x16_avx2: 321.8 (10.27x)
apply_bdof_12_16x8_c: 3258.1 ( 1.00x)
apply_bdof_12_16x8_avx2: 198.6 (16.41x)
apply_bdof_12_16x16_c: 6600.2 ( 1.00x)
apply_bdof_12_16x16_avx2: 392.6 (16.81x)

New benchmarks:
apply_bdof_8_8x16_c: 3269.9 ( 1.00x)
apply_bdof_8_8x16_avx2: 266.5 (12.27x)
apply_bdof_8_16x8_c: 3252.9 ( 1.00x)
apply_bdof_8_16x8_avx2: 182.6 (17.81x)
apply_bdof_8_16x16_c: 6596.7 ( 1.00x)
apply_bdof_8_16x16_avx2: 362.7 (18.19x)
apply_bdof_10_8x16_c: 3351.3 ( 1.00x)
apply_bdof_10_8x16_avx2: 269.0 (12.46x)
apply_bdof_10_16x8_c: 3329.1 ( 1.00x)
apply_bdof_10_16x8_avx2: 174.5 (19.08x)
apply_bdof_10_16x16_c: 6654.3 ( 1.00x)
apply_bdof_10_16x16_avx2: 357.8 (18.60x)
apply_bdof_12_8x16_c: 3274.1 ( 1.00x)
apply_bdof_12_8x16_avx2: 276.0 (11.86x)
apply_bdof_12_16x8_c: 3263.5 ( 1.00x)
apply_bdof_12_16x8_avx2: 176.8 (18.46x)
apply_bdof_12_16x16_c: 6576.4 ( 1.00x)
apply_bdof_12_16x16_avx2: 357.8 (18.38x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -246,35 +246,15 @@ INIT_YMM avx2
|
||||
%endmacro
|
||||
|
||||
|
||||
; LOG2, 5-argument form (one of the old variants shown in this diff).
;   %1 log_sum : accumulator; the masked shift amount is added to it
;   %2 src     : value being reduced; shifted right in place
;   %3 cmp     : comparison threshold; shifted right in place
;   %4 shift   : current shift amount; halved in place
;   %5 tmp     : scratch, overwritten
; NOTE(review): this reads like one step of a binary-search log2 on 16-bit
; lanes, meant to be invoked repeatedly with a shrinking shift - confirm
; against the callers, none of which are visible here.
%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
|
||||
; tmp = per-word mask: all ones where src > cmp (pcmpgtw is a signed compare)
pcmpgtw %5, %2, %3
|
||||
; tmp &= shift: keep the shift amount only in lanes where the compare held
pandd %5, %4
|
||||
; log_sum += tmp (16-bit lanes)
paddw %1, %5
|
||||
|
||||
; Shift src right by the count held in tmp.
; NOTE(review): psrlw with a register count applies a single count (taken
; from the low 64 bits of the count register) to every word, not a
; per-lane count - confirm this is the intended behaviour.
psrlw %2, %5
|
||||
; shift >>= 1
psrlw %4, 1
|
||||
; Shift cmp right by the count held in the updated shift register
; (same single-count caveat as above).
psrlw %3, %4
|
||||
%endmacro
|
||||
|
||||
; LOG2, old 3-argument scalar form (one of the old variants shown in this diff).
;   %1 dst    : register number; the result word is written into xm%1
;   %2 src    : register number; the input word is read from xm%2
;   %3 offset : word index (immediate) selecting which 16-bit lane to process
; Clobbers tmp0d. Only the xmm view of the registers is accessed, and each
; invocation handles exactly one 16-bit lane through a general-purpose register.
%macro LOG2 3 ; dst, src, offset
|
||||
; tmp0d = word number `offset` of src
pextrw tmp0d, xm%2, %3
|
||||
; tmp0d = index of the highest set bit.
; NOTE(review): bsr leaves its destination undefined for a zero input -
; confirm callers tolerate that.
bsr tmp0d, tmp0d
|
||||
; Lane 0 is written with movd, every other lane is inserted with pinsrw.
%if %3 != 0
|
||||
pinsrw xm%1, tmp0d, %3
|
||||
%else
|
||||
; NOTE(review): movd replaces the whole xmm register, so lane 0 presumably
; has to be processed before the other lanes - confirm call order.
movd xm%1, tmp0d
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro LOG2 2 ; dst, src
|
||||
LOG2 %1, %2, 0
|
||||
LOG2 %1, %2, 1
|
||||
LOG2 %1, %2, 2
|
||||
LOG2 %1, %2, 3
|
||||
LOG2 %1, %2, 4
|
||||
LOG2 %1, %2, 5
|
||||
LOG2 %1, %2, 6
|
||||
LOG2 %1, %2, 7
|
||||
; LOG2, vectorised form: per-dword integer log2 via the float exponent.
;   %1 dst : receives (biased float exponent of src) - 127 in every dword
;   %2 src : packed signed dwords, read only
;   %3 tmp : scratch; holds 127 in every dword afterwards
; A zero input yields 0 - 127 = -127 rather than a valid log2 (see the
; comment in the body for why that is accepted).
; NOTE(review): src is presumably non-negative - psrld 23 would keep the
; float sign bit above the exponent. Also, cvtdq2ps has to round integers
; that need more than 24 significant bits, which could bump the exponent by
; one for such inputs - confirm the input range at the call sites.
%macro LOG2 3 ; dst, src, tmp
|
||||
; dst = (float)src for each dword lane
cvtdq2ps %1, %2
|
||||
; The exponent contains log2 biased by 127 unless the value is zero.
|
||||
; dst is only used as shift count where the value to be shifted is
|
||||
; always zero if src is zero, so avoid using saturated subtraction.
|
||||
; Build the constant 127 without a memory load: all ones, then >> 25.
pcmpeqd %3, %3
|
||||
psrld %3, 25 ; pd_127
|
||||
psrld %1, 23 ; floating point exponent
|
||||
; Remove the exponent bias.
psubd %1, %3
|
||||
%endmacro
|
||||
|
||||
; %1: 4 (sgx2, sgy2, sgxdi, gydi)
|
||||
@@ -286,11 +266,11 @@ INIT_YMM avx2
|
||||
|
||||
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
|
||||
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
|
||||
LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2))
|
||||
|
||||
; Promote to dword since vpsrlvw is AVX-512 only
|
||||
pmovzxwd m8, xm8
|
||||
pmovsxwd m9, xm9
|
||||
pmovsxwd m10, xm10
|
||||
LOG2 m10, m8, m7 ; 4 (log2(sgx2), log2(sgy2))
|
||||
|
||||
pslld m9, 2 ; 4 (log2(sgx2) << 2, log2(sgy2) << 2)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user