avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg

Up until now, there were two averaging assembly functions,
one for eight-bit content and one for <=16-bit content;
there are also three C-wrappers around these functions,
for 8, 10 and 12 bpp. These wrappers simply forward the
maximum permissible value (i.e. (1<<bpp)-1) and promote
some integer values to ptrdiff_t.

Yet these wrappers are absolutely useless: The assembly functions
rederive the bpp from the maximum and only the integer part
of the promoted ptrdiff_t values is ever used. Of course,
these wrappers also entail an additional call (not a tail call,
because the additional maximum parameter is passed on the stack).

Remove the wrappers and add per-bpp assembly functions instead.
Given that the only differences between 10 and 12 bits are some
constants in registers, the main part of these functions can be
shared (given that this code uses a jumptable, it can even
be done without adding any additional jump).

Old benchmarks:
avg_8_2x2_c:                                            11.4 ( 1.00x)
avg_8_2x2_avx2:                                          7.9 ( 1.44x)
avg_8_4x4_c:                                            30.7 ( 1.00x)
avg_8_4x4_avx2:                                         10.4 ( 2.95x)
avg_8_8x8_c:                                           134.5 ( 1.00x)
avg_8_8x8_avx2:                                         16.6 ( 8.12x)
avg_8_16x16_c:                                         255.6 ( 1.00x)
avg_8_16x16_avx2:                                       28.2 ( 9.07x)
avg_8_32x32_c:                                         897.7 ( 1.00x)
avg_8_32x32_avx2:                                       83.9 (10.70x)
avg_8_64x64_c:                                        3320.0 ( 1.00x)
avg_8_64x64_avx2:                                      321.1 (10.34x)
avg_8_128x128_c:                                     12981.8 ( 1.00x)
avg_8_128x128_avx2:                                   1480.1 ( 8.77x)
avg_10_2x2_c:                                           12.0 ( 1.00x)
avg_10_2x2_avx2:                                         8.4 ( 1.43x)
avg_10_4x4_c:                                           34.9 ( 1.00x)
avg_10_4x4_avx2:                                         9.8 ( 3.56x)
avg_10_8x8_c:                                           76.8 ( 1.00x)
avg_10_8x8_avx2:                                        15.1 ( 5.08x)
avg_10_16x16_c:                                        256.6 ( 1.00x)
avg_10_16x16_avx2:                                      25.1 (10.20x)
avg_10_32x32_c:                                        932.9 ( 1.00x)
avg_10_32x32_avx2:                                      73.4 (12.72x)
avg_10_64x64_c:                                       3517.9 ( 1.00x)
avg_10_64x64_avx2:                                     414.8 ( 8.48x)
avg_10_128x128_c:                                    13695.3 ( 1.00x)
avg_10_128x128_avx2:                                  1648.1 ( 8.31x)
avg_12_2x2_c:                                           13.1 ( 1.00x)
avg_12_2x2_avx2:                                         8.6 ( 1.53x)
avg_12_4x4_c:                                           35.4 ( 1.00x)
avg_12_4x4_avx2:                                        10.1 ( 3.49x)
avg_12_8x8_c:                                           76.6 ( 1.00x)
avg_12_8x8_avx2:                                        16.7 ( 4.60x)
avg_12_16x16_c:                                        256.6 ( 1.00x)
avg_12_16x16_avx2:                                      25.5 (10.07x)
avg_12_32x32_c:                                        933.2 ( 1.00x)
avg_12_32x32_avx2:                                      75.7 (12.34x)
avg_12_64x64_c:                                       3519.1 ( 1.00x)
avg_12_64x64_avx2:                                     416.8 ( 8.44x)
avg_12_128x128_c:                                    13695.1 ( 1.00x)
avg_12_128x128_avx2:                                  1651.6 ( 8.29x)

New benchmarks:
avg_8_2x2_c:                                            11.5 ( 1.00x)
avg_8_2x2_avx2:                                          6.0 ( 1.91x)
avg_8_4x4_c:                                            29.7 ( 1.00x)
avg_8_4x4_avx2:                                          8.0 ( 3.72x)
avg_8_8x8_c:                                           131.4 ( 1.00x)
avg_8_8x8_avx2:                                         12.2 (10.74x)
avg_8_16x16_c:                                         254.3 ( 1.00x)
avg_8_16x16_avx2:                                       24.8 (10.25x)
avg_8_32x32_c:                                         897.7 ( 1.00x)
avg_8_32x32_avx2:                                       77.8 (11.54x)
avg_8_64x64_c:                                        3321.3 ( 1.00x)
avg_8_64x64_avx2:                                      318.7 (10.42x)
avg_8_128x128_c:                                     12988.4 ( 1.00x)
avg_8_128x128_avx2:                                   1430.1 ( 9.08x)
avg_10_2x2_c:                                           12.1 ( 1.00x)
avg_10_2x2_avx2:                                         5.7 ( 2.13x)
avg_10_4x4_c:                                           35.0 ( 1.00x)
avg_10_4x4_avx2:                                         9.0 ( 3.88x)
avg_10_8x8_c:                                           77.2 ( 1.00x)
avg_10_8x8_avx2:                                        12.4 ( 6.24x)
avg_10_16x16_c:                                        256.2 ( 1.00x)
avg_10_16x16_avx2:                                      24.3 (10.56x)
avg_10_32x32_c:                                        932.9 ( 1.00x)
avg_10_32x32_avx2:                                      71.9 (12.97x)
avg_10_64x64_c:                                       3516.8 ( 1.00x)
avg_10_64x64_avx2:                                     414.7 ( 8.48x)
avg_10_128x128_c:                                    13693.7 ( 1.00x)
avg_10_128x128_avx2:                                  1609.3 ( 8.51x)
avg_12_2x2_c:                                           14.1 ( 1.00x)
avg_12_2x2_avx2:                                         5.7 ( 2.48x)
avg_12_4x4_c:                                           35.8 ( 1.00x)
avg_12_4x4_avx2:                                         9.0 ( 3.96x)
avg_12_8x8_c:                                           76.9 ( 1.00x)
avg_12_8x8_avx2:                                        12.4 ( 6.22x)
avg_12_16x16_c:                                        256.5 ( 1.00x)
avg_12_16x16_avx2:                                      24.4 (10.50x)
avg_12_32x32_c:                                        934.1 ( 1.00x)
avg_12_32x32_avx2:                                      72.0 (12.97x)
avg_12_64x64_c:                                       3518.2 ( 1.00x)
avg_12_64x64_avx2:                                     414.8 ( 8.48x)
avg_12_128x128_c:                                    13689.5 ( 1.00x)
avg_12_128x128_avx2:                                  1611.1 ( 8.50x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-02-17 23:00:30 +01:00
parent 5a60b3f1a6
commit ea78402e9c
2 changed files with 28 additions and 38 deletions

View File

@@ -36,8 +36,6 @@
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
#define AVG_BPC_PROTOTYPES(bpc, opt) \
void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
@@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10)
FW_PUT_16BPC_AVX2(12)
#define AVG_FUNCS(bpc, bd, opt) \
static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height) \
{ \
BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
} \
static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height, \
int denom, int w0, int w1, int o0, int o1) \
@@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2)
} while (0)
#define AVG_INIT(bd, opt) do { \
c->inter.avg = bf(vvc_avg, bd, opt); \
void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height);\
c->inter.avg = bf(ff_vvc_avg, bd, opt); \
c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
} while (0)

View File

@@ -35,23 +35,21 @@ SECTION_RODATA
%if HAVE_AVX2_EXTERNAL
pw_256 times 2 dw 256
%macro AVG_JMP_TABLE 3-*
%xdefine %1_%2_%3_table (%%table - 2*%4)
%xdefine %%base %1_%2_%3_table
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
%macro AVG_JMP_TABLE 4-*
%xdefine %1_%2_%4_table (%%table - 2*%5)
%xdefine %%base %1_%2_%4_table
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
%%table:
%rep %0 - 3
dd %%prefix %+ .w%4 - %%base
%rep %0 - 4
dd %%prefix %+ .w%5 - %%base
%rotate 1
%endrep
%endmacro
AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128
SECTION .text
@@ -72,9 +70,10 @@ SECTION .text
%endrep
%endmacro
%macro AVG_FN 2 ; bpc, op
%macro AVG_FN 2-3 1; bpc, op, instantiate implementation
jmp wq
%if %3
INIT_XMM cpuname
.w2:
movd xm0, [src0q]
@@ -128,6 +127,7 @@ INIT_YMM cpuname
.ret:
RET
%endif
%endmacro
%macro AVG 2 ; bpc, width
@@ -222,31 +222,24 @@ INIT_YMM cpuname
%define AVG_SRC_STRIDE MAX_PB_SIZE*2
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0,
; const int16_t *src1, int width, int height);
%macro VVC_AVG_AVX2 3
cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
movifnidn hd, hm
pcmpeqw m2, m2
%if %1 != 8
pxor m3, m3 ; pixel min
vpbroadcastw m4, bdm ; pixel max
%endif
movifnidn bdd, bdm
inc bdd
tzcnt bdd, bdd ; bit depth
sub bdd, 8
movd xm0, bdd
vpbroadcastd m2, [pw_256]
psllw m2, xm0 ; shift
lea r6, [avg_%1 %+ SUFFIX %+ _table]
tzcnt wd, wm
movsxd wq, dword [r6+wq*4]
psrlw m4, m2, 16-%2 ; pixel max
psubw m2, m4, m2 ; 1 << bpp
add wq, r6
AVG_FN %1, AVG
AVG_FN %1, AVG, %3
%endmacro
;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
@@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0,
INIT_YMM avx2
VVC_AVG_AVX2 16
VVC_AVG_AVX2 16, 12, 0
VVC_AVG_AVX2 8
VVC_AVG_AVX2 16, 10, 1
VVC_AVG_AVX2 8, 8, 1
VVC_W_AVG_AVX2 16