avcodec/x86/vvc/of: Avoid scalar log2

Instead, convert the integers to floats (cvtdq2ps) and read log2 from the
biased floating-point exponent, which handles all lanes at once.

Old benchmarks:
apply_bdof_8_8x16_c:                                  3295.2 ( 1.00x)
apply_bdof_8_8x16_avx2:                                312.7 (10.54x)
apply_bdof_8_16x8_c:                                  3269.1 ( 1.00x)
apply_bdof_8_16x8_avx2:                                203.6 (16.05x)
apply_bdof_8_16x16_c:                                 6584.8 ( 1.00x)
apply_bdof_8_16x16_avx2:                               413.6 (15.92x)
apply_bdof_10_8x16_c:                                 3313.9 ( 1.00x)
apply_bdof_10_8x16_avx2:                               321.5 (10.31x)
apply_bdof_10_16x8_c:                                 3306.5 ( 1.00x)
apply_bdof_10_16x8_avx2:                               200.4 (16.50x)
apply_bdof_10_16x16_c:                                6659.7 ( 1.00x)
apply_bdof_10_16x16_avx2:                              402.4 (16.55x)
apply_bdof_12_8x16_c:                                 3305.7 ( 1.00x)
apply_bdof_12_8x16_avx2:                               321.8 (10.27x)
apply_bdof_12_16x8_c:                                 3258.1 ( 1.00x)
apply_bdof_12_16x8_avx2:                               198.6 (16.41x)
apply_bdof_12_16x16_c:                                6600.2 ( 1.00x)
apply_bdof_12_16x16_avx2:                              392.6 (16.81x)

New benchmarks:
apply_bdof_8_8x16_c:                                  3269.9 ( 1.00x)
apply_bdof_8_8x16_avx2:                                266.5 (12.27x)
apply_bdof_8_16x8_c:                                  3252.9 ( 1.00x)
apply_bdof_8_16x8_avx2:                                182.6 (17.81x)
apply_bdof_8_16x16_c:                                 6596.7 ( 1.00x)
apply_bdof_8_16x16_avx2:                               362.7 (18.19x)
apply_bdof_10_8x16_c:                                 3351.3 ( 1.00x)
apply_bdof_10_8x16_avx2:                               269.0 (12.46x)
apply_bdof_10_16x8_c:                                 3329.1 ( 1.00x)
apply_bdof_10_16x8_avx2:                               174.5 (19.08x)
apply_bdof_10_16x16_c:                                6654.3 ( 1.00x)
apply_bdof_10_16x16_avx2:                              357.8 (18.60x)
apply_bdof_12_8x16_c:                                 3274.1 ( 1.00x)
apply_bdof_12_8x16_avx2:                               276.0 (11.86x)
apply_bdof_12_16x8_c:                                 3263.5 ( 1.00x)
apply_bdof_12_16x8_avx2:                               176.8 (18.46x)
apply_bdof_12_16x16_c:                                6576.4 ( 1.00x)
apply_bdof_12_16x16_avx2:                              357.8 (18.38x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-03-19 05:30:37 +01:00
parent 03b83f8feb
commit 2570f5d307

View File

@@ -246,35 +246,15 @@ INIT_YMM avx2
%endmacro
; LOG2 (5-operand form): one refinement step of a word-wise log2 estimate.
; %1 log_sum: running result, incremented by the step size where src > cmp
; %2 src:     value being reduced; shifted right by the step mask
; %3 cmp:     comparison threshold; shifted right at the end of the step
; %4 shift:   current step size; halved by this macro
; %5 tmp:     scratch, clobbered
; NOTE(review): the three-operand pcmpgtw assumes an x86inc-style AVX macro
; layer - confirm.
; NOTE(review): `pandd` is not a legacy SSE mnemonic (the ISA has pand/vpandd);
; confirm the macro layer in use actually defines it.
; NOTE(review): psrlw with a register count applies one count, taken from the
; low 64 bits of the count operand, to every word - it is not a per-lane
; variable shift. Confirm this is what the step intends.
%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
; tmp = all-ones in every word where src > cmp (signed compare), else zero
pcmpgtw %5, %2, %3
; tmp = shift where src > cmp, else zero
pandd %5, %4
; log_sum += tmp
paddw %1, %5
; src >>= tmp (see the psrlw note above)
psrlw %2, %5
; halve the step size for the next invocation
psrlw %4, 1
; cmp >>= shift, using the already halved step
psrlw %3, %4
%endmacro
; LOG2 (dst, src, offset): scalar log2 of a single word lane.
; %1 dst:    register number; word lane %3 of xm%1 receives the result
; %2 src:    register number; word lane %3 of xm%2 is the input
; %3 offset: word lane index
; Clobbers tmp0d (presumably a named GPR declared by the enclosing function -
; confirm) and the flags (bsr).
; NOTE(review): bsr leaves its destination undefined for a zero input.
%macro LOG2 3 ; dst, src, offset
; tmp0d = zero-extended word %3 of src
pextrw tmp0d, xm%2, %3
; tmp0d = index of the highest set bit, i.e. floor(log2) for a non-zero input
bsr tmp0d, tmp0d
%if %3 != 0
; insert the result into lane %3, leaving the other lanes of dst untouched
pinsrw xm%1, tmp0d, %3
%else
; lane 0: movd overwrites the whole register (the remaining lanes become
; zero), so offset 0 has to be processed before any other lane
movd xm%1, tmp0d
%endif
%endmacro
; LOG2 (dst, src): expands the lane form "LOG2 dst, src, N" once for each of
; the word lanes 0..7, starting with lane 0.
%macro LOG2 2 ; dst, src
LOG2 %1, %2, 0
LOG2 %1, %2, 1
LOG2 %1, %2, 2
LOG2 %1, %2, 3
LOG2 %1, %2, 4
LOG2 %1, %2, 5
LOG2 %1, %2, 6
LOG2 %1, %2, 7
; LOG2 (dst, src, tmp): vectorized floor(log2) of packed dwords, obtained from
; the exponent field of the single-precision conversion of src.
; %1 dst: receives exponent - 127 per dword; -127 where src is zero
; %2 src: packed dwords; presumably always non-negative and below 2^24 so
;         that the conversion is exact and cannot round up to the next power
;         of two - confirm against the callers
; %3 tmp: scratch, left holding 127 in every dword
%macro LOG2 3 ; dst, src, tmp
; dst = (float)src
cvtdq2ps %1, %2
; The exponent contains log2 biased by 127 unless the value is zero.
; dst is only used as shift count where the value to be shifted is
; always zero if src is zero, so avoid using saturated subtraction.
; tmp = all-ones, then 0xFFFFFFFF >> 25 = 0x7F = 127 in every dword
pcmpeqd %3, %3
psrld %3, 25 ; pd_127
; drop the 23 mantissa bits; for non-negative inputs only the biased
; exponent remains
psrld %1, 23 ; floating point exponent
; remove the bias
psubd %1, %3
%endmacro
; %1: 4 (sgx2, sgy2, sgxdi, sgydi)
@@ -286,11 +266,11 @@ INIT_YMM avx2
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2))
; Promote to dword since vpsrlvw is AVX-512 only
pmovzxwd m8, xm8
pmovsxwd m9, xm9
pmovsxwd m10, xm10
LOG2 m10, m8, m7 ; 4 (log2(sgx2), log2(sgy2))
pslld m9, 2 ; 4 (sgxdi << 2, sgydi << 2)