mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg
Up until now, there were two averaging assembly functions, one for eight-bit content and one for <=16 bit content; there were also three C-wrappers around these functions, for 8, 10 and 12 bpp. These wrappers simply forward the maximum permissible value (i.e. (1<<bpp)-1) and promote some integer values to ptrdiff_t. Yet these wrappers are absolutely useless: The assembly functions re-derive the bpp from the maximum and only the integer part of the promoted ptrdiff_t values is ever used. Of course, these wrappers also entail an additional call (not a tail call, because the additional maximum parameter is passed on the stack). Remove the wrappers and add per-bpp assembly functions instead. Given that the only difference between 10 and 12 bits is some constants in registers, the main part of these functions can be shared (given that this code uses a jumptable, it can even be done without adding any additional jump). Old benchmarks: avg_8_2x2_c: 11.4 ( 1.00x) avg_8_2x2_avx2: 7.9 ( 1.44x) avg_8_4x4_c: 30.7 ( 1.00x) avg_8_4x4_avx2: 10.4 ( 2.95x) avg_8_8x8_c: 134.5 ( 1.00x) avg_8_8x8_avx2: 16.6 ( 8.12x) avg_8_16x16_c: 255.6 ( 1.00x) avg_8_16x16_avx2: 28.2 ( 9.07x) avg_8_32x32_c: 897.7 ( 1.00x) avg_8_32x32_avx2: 83.9 (10.70x) avg_8_64x64_c: 3320.0 ( 1.00x) avg_8_64x64_avx2: 321.1 (10.34x) avg_8_128x128_c: 12981.8 ( 1.00x) avg_8_128x128_avx2: 1480.1 ( 8.77x) avg_10_2x2_c: 12.0 ( 1.00x) avg_10_2x2_avx2: 8.4 ( 1.43x) avg_10_4x4_c: 34.9 ( 1.00x) avg_10_4x4_avx2: 9.8 ( 3.56x) avg_10_8x8_c: 76.8 ( 1.00x) avg_10_8x8_avx2: 15.1 ( 5.08x) avg_10_16x16_c: 256.6 ( 1.00x) avg_10_16x16_avx2: 25.1 (10.20x) avg_10_32x32_c: 932.9 ( 1.00x) avg_10_32x32_avx2: 73.4 (12.72x) avg_10_64x64_c: 3517.9 ( 1.00x) avg_10_64x64_avx2: 414.8 ( 8.48x) avg_10_128x128_c: 13695.3 ( 1.00x) avg_10_128x128_avx2: 1648.1 ( 8.31x) avg_12_2x2_c: 13.1 ( 1.00x) avg_12_2x2_avx2: 8.6 ( 1.53x) avg_12_4x4_c: 35.4 ( 1.00x) avg_12_4x4_avx2: 10.1 ( 3.49x) avg_12_8x8_c: 76.6 ( 1.00x) avg_12_8x8_avx2: 16.7 ( 4.60x) 
avg_12_16x16_c: 256.6 ( 1.00x) avg_12_16x16_avx2: 25.5 (10.07x) avg_12_32x32_c: 933.2 ( 1.00x) avg_12_32x32_avx2: 75.7 (12.34x) avg_12_64x64_c: 3519.1 ( 1.00x) avg_12_64x64_avx2: 416.8 ( 8.44x) avg_12_128x128_c: 13695.1 ( 1.00x) avg_12_128x128_avx2: 1651.6 ( 8.29x) New benchmarks: avg_8_2x2_c: 11.5 ( 1.00x) avg_8_2x2_avx2: 6.0 ( 1.91x) avg_8_4x4_c: 29.7 ( 1.00x) avg_8_4x4_avx2: 8.0 ( 3.72x) avg_8_8x8_c: 131.4 ( 1.00x) avg_8_8x8_avx2: 12.2 (10.74x) avg_8_16x16_c: 254.3 ( 1.00x) avg_8_16x16_avx2: 24.8 (10.25x) avg_8_32x32_c: 897.7 ( 1.00x) avg_8_32x32_avx2: 77.8 (11.54x) avg_8_64x64_c: 3321.3 ( 1.00x) avg_8_64x64_avx2: 318.7 (10.42x) avg_8_128x128_c: 12988.4 ( 1.00x) avg_8_128x128_avx2: 1430.1 ( 9.08x) avg_10_2x2_c: 12.1 ( 1.00x) avg_10_2x2_avx2: 5.7 ( 2.13x) avg_10_4x4_c: 35.0 ( 1.00x) avg_10_4x4_avx2: 9.0 ( 3.88x) avg_10_8x8_c: 77.2 ( 1.00x) avg_10_8x8_avx2: 12.4 ( 6.24x) avg_10_16x16_c: 256.2 ( 1.00x) avg_10_16x16_avx2: 24.3 (10.56x) avg_10_32x32_c: 932.9 ( 1.00x) avg_10_32x32_avx2: 71.9 (12.97x) avg_10_64x64_c: 3516.8 ( 1.00x) avg_10_64x64_avx2: 414.7 ( 8.48x) avg_10_128x128_c: 13693.7 ( 1.00x) avg_10_128x128_avx2: 1609.3 ( 8.51x) avg_12_2x2_c: 14.1 ( 1.00x) avg_12_2x2_avx2: 5.7 ( 2.48x) avg_12_4x4_c: 35.8 ( 1.00x) avg_12_4x4_avx2: 9.0 ( 3.96x) avg_12_8x8_c: 76.9 ( 1.00x) avg_12_8x8_avx2: 12.4 ( 6.22x) avg_12_16x16_c: 256.5 ( 1.00x) avg_12_16x16_avx2: 24.4 (10.50x) avg_12_32x32_c: 934.1 ( 1.00x) avg_12_32x32_avx2: 72.0 (12.97x) avg_12_64x64_c: 3518.2 ( 1.00x) avg_12_64x64_avx2: 414.8 ( 8.48x) avg_12_128x128_c: 13689.5 ( 1.00x) avg_12_128x128_avx2: 1611.1 ( 8.50x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -36,8 +36,6 @@
|
||||
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
|
||||
|
||||
#define AVG_BPC_PROTOTYPES(bpc, opt) \
|
||||
void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
|
||||
void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
|
||||
intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
|
||||
@@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10)
|
||||
FW_PUT_16BPC_AVX2(12)
|
||||
|
||||
#define AVG_FUNCS(bpc, bd, opt) \
|
||||
static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const int16_t *src0, const int16_t *src1, int width, int height) \
|
||||
{ \
|
||||
BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
|
||||
} \
|
||||
static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const int16_t *src0, const int16_t *src1, int width, int height, \
|
||||
int denom, int w0, int w1, int o0, int o1) \
|
||||
@@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2)
|
||||
} while (0)
|
||||
|
||||
#define AVG_INIT(bd, opt) do { \
|
||||
c->inter.avg = bf(vvc_avg, bd, opt); \
|
||||
void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const int16_t *src0, const int16_t *src1, int width, int height);\
|
||||
c->inter.avg = bf(ff_vvc_avg, bd, opt); \
|
||||
c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -35,23 +35,21 @@ SECTION_RODATA
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
|
||||
pw_256 times 2 dw 256
|
||||
|
||||
%macro AVG_JMP_TABLE 3-*
|
||||
%xdefine %1_%2_%3_table (%%table - 2*%4)
|
||||
%xdefine %%base %1_%2_%3_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
|
||||
%macro AVG_JMP_TABLE 4-*
|
||||
%xdefine %1_%2_%4_table (%%table - 2*%5)
|
||||
%xdefine %%base %1_%2_%4_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
|
||||
%%table:
|
||||
%rep %0 - 3
|
||||
dd %%prefix %+ .w%4 - %%base
|
||||
%rep %0 - 4
|
||||
dd %%prefix %+ .w%5 - %%base
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128
|
||||
|
||||
SECTION .text
|
||||
|
||||
@@ -72,9 +70,10 @@ SECTION .text
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
%macro AVG_FN 2 ; bpc, op
|
||||
%macro AVG_FN 2-3 1; bpc, op, instantiate implementation
|
||||
jmp wq
|
||||
|
||||
%if %3
|
||||
INIT_XMM cpuname
|
||||
.w2:
|
||||
movd xm0, [src0q]
|
||||
@@ -128,6 +127,7 @@ INIT_YMM cpuname
|
||||
|
||||
.ret:
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro AVG 2 ; bpc, width
|
||||
@@ -222,31 +222,24 @@ INIT_YMM cpuname
|
||||
|
||||
%define AVG_SRC_STRIDE MAX_PB_SIZE*2
|
||||
|
||||
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
|
||||
%macro VVC_AVG_AVX2 1
|
||||
cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
|
||||
;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0,
|
||||
; const int16_t *src1, int width, int height);
|
||||
%macro VVC_AVG_AVX2 3
|
||||
cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
|
||||
movifnidn hd, hm
|
||||
|
||||
pcmpeqw m2, m2
|
||||
%if %1 != 8
|
||||
pxor m3, m3 ; pixel min
|
||||
vpbroadcastw m4, bdm ; pixel max
|
||||
%endif
|
||||
|
||||
movifnidn bdd, bdm
|
||||
inc bdd
|
||||
tzcnt bdd, bdd ; bit depth
|
||||
|
||||
sub bdd, 8
|
||||
movd xm0, bdd
|
||||
vpbroadcastd m2, [pw_256]
|
||||
psllw m2, xm0 ; shift
|
||||
|
||||
lea r6, [avg_%1 %+ SUFFIX %+ _table]
|
||||
tzcnt wd, wm
|
||||
movsxd wq, dword [r6+wq*4]
|
||||
psrlw m4, m2, 16-%2 ; pixel max
|
||||
psubw m2, m4, m2 ; 1 << bpp
|
||||
add wq, r6
|
||||
AVG_FN %1, AVG
|
||||
AVG_FN %1, AVG, %3
|
||||
%endmacro
|
||||
|
||||
;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
@@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0,
|
||||
|
||||
INIT_YMM avx2
|
||||
|
||||
VVC_AVG_AVX2 16
|
||||
VVC_AVG_AVX2 16, 12, 0
|
||||
|
||||
VVC_AVG_AVX2 8
|
||||
VVC_AVG_AVX2 16, 10, 1
|
||||
|
||||
VVC_AVG_AVX2 8, 8, 1
|
||||
|
||||
VVC_W_AVG_AVX2 16
|
||||
|
||||
|
||||
Reference in New Issue
Block a user