avcodec/x86/vvc/of: Break dependency chain

Don't extract and update one word of one and the same register
at a time; use separate src and dst registers, so that pextrw
and bsr can be done in parallel. Also use movd instead of pinsrw
for the first word.

Old benchmarks:
apply_bdof_8_8x16_c:                                  3275.2 ( 1.00x)
apply_bdof_8_8x16_avx2:                                487.6 ( 6.72x)
apply_bdof_8_16x8_c:                                  3243.1 ( 1.00x)
apply_bdof_8_16x8_avx2:                                284.4 (11.40x)
apply_bdof_8_16x16_c:                                 6501.8 ( 1.00x)
apply_bdof_8_16x16_avx2:                               570.0 (11.41x)
apply_bdof_10_8x16_c:                                 3286.5 ( 1.00x)
apply_bdof_10_8x16_avx2:                               461.7 ( 7.12x)
apply_bdof_10_16x8_c:                                 3274.5 ( 1.00x)
apply_bdof_10_16x8_avx2:                               271.4 (12.06x)
apply_bdof_10_16x16_c:                                6590.0 ( 1.00x)
apply_bdof_10_16x16_avx2:                              543.9 (12.12x)
apply_bdof_12_8x16_c:                                 3307.6 ( 1.00x)
apply_bdof_12_8x16_avx2:                               462.2 ( 7.16x)
apply_bdof_12_16x8_c:                                 3287.4 ( 1.00x)
apply_bdof_12_16x8_avx2:                               271.8 (12.10x)
apply_bdof_12_16x16_c:                                6465.7 ( 1.00x)
apply_bdof_12_16x16_avx2:                              543.8 (11.89x)

New benchmarks:
apply_bdof_8_8x16_c:                                  3255.7 ( 1.00x)
apply_bdof_8_8x16_avx2:                                349.3 ( 9.32x)
apply_bdof_8_16x8_c:                                  3262.5 ( 1.00x)
apply_bdof_8_16x8_avx2:                                214.8 (15.19x)
apply_bdof_8_16x16_c:                                 6471.6 ( 1.00x)
apply_bdof_8_16x16_avx2:                               429.8 (15.06x)
apply_bdof_10_8x16_c:                                 3227.7 ( 1.00x)
apply_bdof_10_8x16_avx2:                               321.6 (10.04x)
apply_bdof_10_16x8_c:                                 3250.2 ( 1.00x)
apply_bdof_10_16x8_avx2:                               201.2 (16.16x)
apply_bdof_10_16x16_c:                                6476.5 ( 1.00x)
apply_bdof_10_16x16_avx2:                              400.9 (16.16x)
apply_bdof_12_8x16_c:                                 3230.7 ( 1.00x)
apply_bdof_12_8x16_avx2:                               321.8 (10.04x)
apply_bdof_12_16x8_c:                                 3210.5 ( 1.00x)
apply_bdof_12_16x8_avx2:                               200.9 (15.98x)
apply_bdof_12_16x16_c:                                6474.5 ( 1.00x)
apply_bdof_12_16x16_avx2:                              400.2 (16.18x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-02-19 02:08:32 +01:00
parent 19dc7b79a4
commit af3f8f5bd2

View File

@@ -251,21 +251,25 @@ INIT_YMM avx2
psrlw %3, %4
%endmacro
; LOG2, single-word step: extract one 16-bit word, run bsr on it and insert the
; result into the same word position of the destination.
; This is a diff rendering whose +/- markers were lost, so the old and the new
; version of the macro are interleaved below. The "dst/src, offset" header, the
; pextrw reading xm%1 and the unconditional pinsrw belong to the old in-place
; version; the "dst, src, offset" header, the pextrw reading xm%2 and the
; %if/%else/%endif block belong to the new version.
%macro LOG2 2 ; old version - %1: dst/src register number, %2: word offset
pextrw tmp0d, xm%1, %2
%macro LOG2 3 ; new version - %1: dst register number, %2: src register number, %3: word offset
pextrw tmp0d, xm%2, %3 ; tmp0d = word %3 of the source register
; NOTE(review): bsr does not define its destination for a zero input - confirm
; that the callers either guarantee non-zero words or discard those lanes.
bsr tmp0d, tmp0d ; tmp0d = index of the highest set bit
pinsrw xm%1, tmp0d, %2
; New version: words 1..7 are merged into the destination with pinsrw, while
; word 0 is written with movd, which replaces the register contents instead of
; merging into them. The offset-0 invocation therefore has to run before the
; invocations for the other words.
%if %3 != 0
pinsrw xm%1, tmp0d, %3
%else
movd xm%1, tmp0d
%endif
%endmacro
; LOG2, whole-register wrapper: invokes the single-word LOG2 macro once for
; each of the eight words (offsets 0..7) of an xmm register.
; This is a diff rendering whose +/- markers were lost: the "dst/src" header
; and the eight two-argument invocations are the old in-place version, the
; "dst, src" header and the eight three-argument invocations are the new
; version with distinct destination and source registers.
; The inner invocations do not recurse: NASM picks the macro by its number of
; arguments, so they expand the single-word macro.
%macro LOG2 1 ; old version - %1: dst/src register number
LOG2 %1, 0
LOG2 %1, 1
LOG2 %1, 2
LOG2 %1, 3
LOG2 %1, 4
LOG2 %1, 5
LOG2 %1, 6
LOG2 %1, 7
%macro LOG2 2 ; new version - %1: dst register number, %2: src register number
LOG2 %1, %2, 0 ; offset 0 comes first: it is the one written with movd
LOG2 %1, %2, 1
LOG2 %1, %2, 2
LOG2 %1, %2, 3
LOG2 %1, %2, 4
LOG2 %1, %2, 5
LOG2 %1, %2, 6
LOG2 %1, %2, 7
%endmacro
; %1: 4 (sgx2, sgy2, sgxdi, sgydi)
@@ -277,8 +281,7 @@ INIT_YMM avx2
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
mova m10, m8
LOG2 10 ; 4 (log2(sgx2), log2(sgy2))
LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2))
; Promote to dword since vpsrlvw is AVX-512 only
pmovsxwd m8, xm8