mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-30 13:50:50 +08:00
avcodec/x86/vvc/mc: Avoid redundant clipping for 8bit
It is already done by packuswb.

Old benchmarks:
avg_8_2x2_c: 11.1 ( 1.00x)
avg_8_2x2_avx2: 8.6 ( 1.28x)
avg_8_4x4_c: 30.0 ( 1.00x)
avg_8_4x4_avx2: 10.8 ( 2.78x)
avg_8_8x8_c: 132.0 ( 1.00x)
avg_8_8x8_avx2: 25.7 ( 5.14x)
avg_8_16x16_c: 254.6 ( 1.00x)
avg_8_16x16_avx2: 33.2 ( 7.67x)
avg_8_32x32_c: 897.5 ( 1.00x)
avg_8_32x32_avx2: 115.6 ( 7.76x)
avg_8_64x64_c: 3316.9 ( 1.00x)
avg_8_64x64_avx2: 626.5 ( 5.29x)
avg_8_128x128_c: 12973.6 ( 1.00x)
avg_8_128x128_avx2: 1914.0 ( 6.78x)
w_avg_8_2x2_c: 16.7 ( 1.00x)
w_avg_8_2x2_avx2: 14.4 ( 1.16x)
w_avg_8_4x4_c: 48.2 ( 1.00x)
w_avg_8_4x4_avx2: 16.5 ( 2.92x)
w_avg_8_8x8_c: 168.1 ( 1.00x)
w_avg_8_8x8_avx2: 49.7 ( 3.38x)
w_avg_8_16x16_c: 392.4 ( 1.00x)
w_avg_8_16x16_avx2: 61.1 ( 6.43x)
w_avg_8_32x32_c: 1455.3 ( 1.00x)
w_avg_8_32x32_avx2: 224.6 ( 6.48x)
w_avg_8_64x64_c: 5632.1 ( 1.00x)
w_avg_8_64x64_avx2: 896.9 ( 6.28x)
w_avg_8_128x128_c: 22136.3 ( 1.00x)
w_avg_8_128x128_avx2: 3626.7 ( 6.10x)

New benchmarks:
avg_8_2x2_c: 12.3 ( 1.00x)
avg_8_2x2_avx2: 8.1 ( 1.52x)
avg_8_4x4_c: 30.3 ( 1.00x)
avg_8_4x4_avx2: 11.3 ( 2.67x)
avg_8_8x8_c: 131.8 ( 1.00x)
avg_8_8x8_avx2: 21.3 ( 6.20x)
avg_8_16x16_c: 255.0 ( 1.00x)
avg_8_16x16_avx2: 30.6 ( 8.33x)
avg_8_32x32_c: 898.5 ( 1.00x)
avg_8_32x32_avx2: 104.9 ( 8.57x)
avg_8_64x64_c: 3317.7 ( 1.00x)
avg_8_64x64_avx2: 540.9 ( 6.13x)
avg_8_128x128_c: 12986.5 ( 1.00x)
avg_8_128x128_avx2: 1663.4 ( 7.81x)
w_avg_8_2x2_c: 16.8 ( 1.00x)
w_avg_8_2x2_avx2: 13.9 ( 1.21x)
w_avg_8_4x4_c: 48.2 ( 1.00x)
w_avg_8_4x4_avx2: 16.2 ( 2.98x)
w_avg_8_8x8_c: 168.6 ( 1.00x)
w_avg_8_8x8_avx2: 46.3 ( 3.64x)
w_avg_8_16x16_c: 392.4 ( 1.00x)
w_avg_8_16x16_avx2: 57.7 ( 6.80x)
w_avg_8_32x32_c: 1454.6 ( 1.00x)
w_avg_8_32x32_avx2: 214.6 ( 6.78x)
w_avg_8_64x64_c: 5638.4 ( 1.00x)
w_avg_8_64x64_avx2: 875.6 ( 6.44x)
w_avg_8_128x128_c: 22133.5 ( 1.00x)
w_avg_8_128x128_avx2: 3334.3 ( 6.64x)

Also saves 550B of .text here.
The improvements will likely be even better on Win64, because it avoids using two nonvolatile registers in the weighted average case.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -64,12 +64,12 @@ SECTION .text
|
||||
%rep %3
|
||||
%define off %%i
|
||||
AVG_LOAD_W16 0, off
|
||||
%2
|
||||
%2 %1
|
||||
AVG_SAVE_W16 %1, 0, off
|
||||
|
||||
|
||||
AVG_LOAD_W16 1, off
|
||||
%2
|
||||
%2 %1
|
||||
AVG_SAVE_W16 %1, 1, off
|
||||
|
||||
%assign %%i %%i+1
|
||||
@@ -84,7 +84,7 @@ SECTION .text
|
||||
pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
|
||||
movd xm1, [src1q]
|
||||
pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
|
||||
%2
|
||||
%2 %1
|
||||
AVG_SAVE_W2 %1
|
||||
AVG_LOOP_END .w2
|
||||
|
||||
@@ -93,7 +93,7 @@ SECTION .text
|
||||
pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
|
||||
movq xm1, [src1q]
|
||||
pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
|
||||
%2
|
||||
%2 %1
|
||||
AVG_SAVE_W4 %1
|
||||
|
||||
AVG_LOOP_END .w4
|
||||
@@ -103,7 +103,7 @@ SECTION .text
|
||||
vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
|
||||
vinserti128 m1, m1, [src1q], 0
|
||||
vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
|
||||
%2
|
||||
%2 %1
|
||||
AVG_SAVE_W8 %1
|
||||
|
||||
AVG_LOOP_END .w8
|
||||
@@ -132,13 +132,15 @@ SECTION .text
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro AVG 0
|
||||
%macro AVG 1
|
||||
paddsw m0, m1
|
||||
pmulhrsw m0, m2
|
||||
%if %1 != 8
|
||||
CLIPW m0, m3, m4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro W_AVG 0
|
||||
%macro W_AVG 1
|
||||
punpckhwd m5, m0, m1
|
||||
pmaddwd m5, m3
|
||||
paddd m5, m4
|
||||
@@ -150,7 +152,9 @@ SECTION .text
|
||||
psrad m0, xm2
|
||||
|
||||
packssdw m0, m5
|
||||
%if %1 != 8
|
||||
CLIPW m0, m6, m7
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro AVG_LOAD_W16 2 ; line, offset
|
||||
@@ -217,11 +221,13 @@ SECTION .text
|
||||
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
|
||||
%macro VVC_AVG_AVX2 1
|
||||
cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
|
||||
cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
|
||||
movifnidn hd, hm
|
||||
|
||||
%if %1 != 8
|
||||
pxor m3, m3 ; pixel min
|
||||
vpbroadcastw m4, bdm ; pixel max
|
||||
%endif
|
||||
|
||||
movifnidn bdd, bdm
|
||||
inc bdd
|
||||
@@ -245,7 +251,7 @@ cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
|
||||
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
|
||||
; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
|
||||
%macro VVC_W_AVG_AVX2 1
|
||||
cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
|
||||
cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1
|
||||
|
||||
movifnidn hd, hm
|
||||
|
||||
@@ -255,8 +261,10 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
|
||||
movd xm3, t0d
|
||||
vpbroadcastd m3, xm3 ; w0, w1
|
||||
|
||||
%if %1 != 8
|
||||
pxor m6, m6 ;pixel min
|
||||
vpbroadcastw m7, r11m ;pixel max
|
||||
%endif
|
||||
|
||||
mov t1q, rcx ; save ecx
|
||||
mov ecx, r11m
|
||||
|
||||
Reference in New Issue
Block a user