avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg

Up until now, there were two averaging assembly functions,
one for eight-bit content and one for <=16-bit content;
there are also three C-wrappers around these functions,
for 8, 10 and 12 bpp. These wrappers simply forward the
maximum permissible value (i.e. (1<<bpp)-1) and promote
some integer values to ptrdiff_t.

Yet these wrappers are absolutely useless: The assembly functions
rederive the bpp from the maximum and only the integer part
of the promoted ptrdiff_t values is ever used. Of course,
these wrappers also entail an additional call (not a tail call,
because the additional maximum parameter is passed on the stack).

Remove the wrappers and add per-bpp assembly functions instead.
Given that the only differences between 10 and 12 bits are some
constants in registers, the main part of these functions can be
shared (given that this code uses a jumptable, it can even
be done without adding any additional jump).

Old benchmarks:
avg_8_2x2_c:                                            11.4 ( 1.00x)
avg_8_2x2_avx2:                                          7.9 ( 1.44x)
avg_8_4x4_c:                                            30.7 ( 1.00x)
avg_8_4x4_avx2:                                         10.4 ( 2.95x)
avg_8_8x8_c:                                           134.5 ( 1.00x)
avg_8_8x8_avx2:                                         16.6 ( 8.12x)
avg_8_16x16_c:                                         255.6 ( 1.00x)
avg_8_16x16_avx2:                                       28.2 ( 9.07x)
avg_8_32x32_c:                                         897.7 ( 1.00x)
avg_8_32x32_avx2:                                       83.9 (10.70x)
avg_8_64x64_c:                                        3320.0 ( 1.00x)
avg_8_64x64_avx2:                                      321.1 (10.34x)
avg_8_128x128_c:                                     12981.8 ( 1.00x)
avg_8_128x128_avx2:                                   1480.1 ( 8.77x)
avg_10_2x2_c:                                           12.0 ( 1.00x)
avg_10_2x2_avx2:                                         8.4 ( 1.43x)
avg_10_4x4_c:                                           34.9 ( 1.00x)
avg_10_4x4_avx2:                                         9.8 ( 3.56x)
avg_10_8x8_c:                                           76.8 ( 1.00x)
avg_10_8x8_avx2:                                        15.1 ( 5.08x)
avg_10_16x16_c:                                        256.6 ( 1.00x)
avg_10_16x16_avx2:                                      25.1 (10.20x)
avg_10_32x32_c:                                        932.9 ( 1.00x)
avg_10_32x32_avx2:                                      73.4 (12.72x)
avg_10_64x64_c:                                       3517.9 ( 1.00x)
avg_10_64x64_avx2:                                     414.8 ( 8.48x)
avg_10_128x128_c:                                    13695.3 ( 1.00x)
avg_10_128x128_avx2:                                  1648.1 ( 8.31x)
avg_12_2x2_c:                                           13.1 ( 1.00x)
avg_12_2x2_avx2:                                         8.6 ( 1.53x)
avg_12_4x4_c:                                           35.4 ( 1.00x)
avg_12_4x4_avx2:                                        10.1 ( 3.49x)
avg_12_8x8_c:                                           76.6 ( 1.00x)
avg_12_8x8_avx2:                                        16.7 ( 4.60x)
avg_12_16x16_c:                                        256.6 ( 1.00x)
avg_12_16x16_avx2:                                      25.5 (10.07x)
avg_12_32x32_c:                                        933.2 ( 1.00x)
avg_12_32x32_avx2:                                      75.7 (12.34x)
avg_12_64x64_c:                                       3519.1 ( 1.00x)
avg_12_64x64_avx2:                                     416.8 ( 8.44x)
avg_12_128x128_c:                                    13695.1 ( 1.00x)
avg_12_128x128_avx2:                                  1651.6 ( 8.29x)

New benchmarks:
avg_8_2x2_c:                                            11.5 ( 1.00x)
avg_8_2x2_avx2:                                          6.0 ( 1.91x)
avg_8_4x4_c:                                            29.7 ( 1.00x)
avg_8_4x4_avx2:                                          8.0 ( 3.72x)
avg_8_8x8_c:                                           131.4 ( 1.00x)
avg_8_8x8_avx2:                                         12.2 (10.74x)
avg_8_16x16_c:                                         254.3 ( 1.00x)
avg_8_16x16_avx2:                                       24.8 (10.25x)
avg_8_32x32_c:                                         897.7 ( 1.00x)
avg_8_32x32_avx2:                                       77.8 (11.54x)
avg_8_64x64_c:                                        3321.3 ( 1.00x)
avg_8_64x64_avx2:                                      318.7 (10.42x)
avg_8_128x128_c:                                     12988.4 ( 1.00x)
avg_8_128x128_avx2:                                   1430.1 ( 9.08x)
avg_10_2x2_c:                                           12.1 ( 1.00x)
avg_10_2x2_avx2:                                         5.7 ( 2.13x)
avg_10_4x4_c:                                           35.0 ( 1.00x)
avg_10_4x4_avx2:                                         9.0 ( 3.88x)
avg_10_8x8_c:                                           77.2 ( 1.00x)
avg_10_8x8_avx2:                                        12.4 ( 6.24x)
avg_10_16x16_c:                                        256.2 ( 1.00x)
avg_10_16x16_avx2:                                      24.3 (10.56x)
avg_10_32x32_c:                                        932.9 ( 1.00x)
avg_10_32x32_avx2:                                      71.9 (12.97x)
avg_10_64x64_c:                                       3516.8 ( 1.00x)
avg_10_64x64_avx2:                                     414.7 ( 8.48x)
avg_10_128x128_c:                                    13693.7 ( 1.00x)
avg_10_128x128_avx2:                                  1609.3 ( 8.51x)
avg_12_2x2_c:                                           14.1 ( 1.00x)
avg_12_2x2_avx2:                                         5.7 ( 2.48x)
avg_12_4x4_c:                                           35.8 ( 1.00x)
avg_12_4x4_avx2:                                         9.0 ( 3.96x)
avg_12_8x8_c:                                           76.9 ( 1.00x)
avg_12_8x8_avx2:                                        12.4 ( 6.22x)
avg_12_16x16_c:                                        256.5 ( 1.00x)
avg_12_16x16_avx2:                                      24.4 (10.50x)
avg_12_32x32_c:                                        934.1 ( 1.00x)
avg_12_32x32_avx2:                                      72.0 (12.97x)
avg_12_64x64_c:                                       3518.2 ( 1.00x)
avg_12_64x64_avx2:                                     414.8 ( 8.48x)
avg_12_128x128_c:                                    13689.5 ( 1.00x)
avg_12_128x128_avx2:                                  1611.1 ( 8.50x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-02-17 23:00:30 +01:00
parent 5a60b3f1a6
commit ea78402e9c
2 changed files with 28 additions and 38 deletions

View File

@@ -36,8 +36,6 @@
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
#define AVG_BPC_PROTOTYPES(bpc, opt) \
void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
@@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10)
FW_PUT_16BPC_AVX2(12)
#define AVG_FUNCS(bpc, bd, opt) \
static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height) \
{ \
BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
} \
static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height, \
int denom, int w0, int w1, int o0, int o1) \
@@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2)
} while (0)
#define AVG_INIT(bd, opt) do { \
c->inter.avg = bf(vvc_avg, bd, opt); \
void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height);\
c->inter.avg = bf(ff_vvc_avg, bd, opt); \
c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
} while (0)

View File

@@ -35,23 +35,21 @@ SECTION_RODATA
%if HAVE_AVX2_EXTERNAL
pw_256 times 2 dw 256
%macro AVG_JMP_TABLE 3-*
%xdefine %1_%2_%3_table (%%table - 2*%4)
%xdefine %%base %1_%2_%3_table
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
%macro AVG_JMP_TABLE 4-*
%xdefine %1_%2_%4_table (%%table - 2*%5)
%xdefine %%base %1_%2_%4_table
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
%%table:
%rep %0 - 3
dd %%prefix %+ .w%4 - %%base
%rep %0 - 4
dd %%prefix %+ .w%5 - %%base
%rotate 1
%endrep
%endmacro
AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128
SECTION .text
@@ -72,9 +70,10 @@ SECTION .text
%endrep
%endmacro
%macro AVG_FN 2 ; bpc, op
%macro AVG_FN 2-3 1; bpc, op, instantiate implementation
jmp wq
%if %3
INIT_XMM cpuname
.w2:
movd xm0, [src0q]
@@ -128,6 +127,7 @@ INIT_YMM cpuname
.ret:
RET
%endif
%endmacro
%macro AVG 2 ; bpc, width
@@ -222,31 +222,24 @@ INIT_YMM cpuname
%define AVG_SRC_STRIDE MAX_PB_SIZE*2
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0,
; const int16_t *src1, int width, int height);
%macro VVC_AVG_AVX2 3
cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
movifnidn hd, hm
pcmpeqw m2, m2
%if %1 != 8
pxor m3, m3 ; pixel min
vpbroadcastw m4, bdm ; pixel max
%endif
movifnidn bdd, bdm
inc bdd
tzcnt bdd, bdd ; bit depth
sub bdd, 8
movd xm0, bdd
vpbroadcastd m2, [pw_256]
psllw m2, xm0 ; shift
lea r6, [avg_%1 %+ SUFFIX %+ _table]
tzcnt wd, wm
movsxd wq, dword [r6+wq*4]
psrlw m4, m2, 16-%2 ; pixel max
psubw m2, m4, m2 ; 1 << bpp
add wq, r6
AVG_FN %1, AVG
AVG_FN %1, AVG, %3
%endmacro
;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
@@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0,
INIT_YMM avx2
VVC_AVG_AVX2 16
VVC_AVG_AVX2 16, 12, 0
VVC_AVG_AVX2 8
VVC_AVG_AVX2 16, 10, 1
VVC_AVG_AVX2 8, 8, 1
VVC_W_AVG_AVX2 16