avcodec/x86/vvc/mc: Avoid redundant clipping for 8bit

The clipping is already performed by packuswb when packing to 8bit.

Old benchmarks:
avg_8_2x2_c:                                            11.1 ( 1.00x)
avg_8_2x2_avx2:                                          8.6 ( 1.28x)
avg_8_4x4_c:                                            30.0 ( 1.00x)
avg_8_4x4_avx2:                                         10.8 ( 2.78x)
avg_8_8x8_c:                                           132.0 ( 1.00x)
avg_8_8x8_avx2:                                         25.7 ( 5.14x)
avg_8_16x16_c:                                         254.6 ( 1.00x)
avg_8_16x16_avx2:                                       33.2 ( 7.67x)
avg_8_32x32_c:                                         897.5 ( 1.00x)
avg_8_32x32_avx2:                                      115.6 ( 7.76x)
avg_8_64x64_c:                                        3316.9 ( 1.00x)
avg_8_64x64_avx2:                                      626.5 ( 5.29x)
avg_8_128x128_c:                                     12973.6 ( 1.00x)
avg_8_128x128_avx2:                                   1914.0 ( 6.78x)
w_avg_8_2x2_c:                                          16.7 ( 1.00x)
w_avg_8_2x2_avx2:                                       14.4 ( 1.16x)
w_avg_8_4x4_c:                                          48.2 ( 1.00x)
w_avg_8_4x4_avx2:                                       16.5 ( 2.92x)
w_avg_8_8x8_c:                                         168.1 ( 1.00x)
w_avg_8_8x8_avx2:                                       49.7 ( 3.38x)
w_avg_8_16x16_c:                                       392.4 ( 1.00x)
w_avg_8_16x16_avx2:                                     61.1 ( 6.43x)
w_avg_8_32x32_c:                                      1455.3 ( 1.00x)
w_avg_8_32x32_avx2:                                    224.6 ( 6.48x)
w_avg_8_64x64_c:                                      5632.1 ( 1.00x)
w_avg_8_64x64_avx2:                                    896.9 ( 6.28x)
w_avg_8_128x128_c:                                   22136.3 ( 1.00x)
w_avg_8_128x128_avx2:                                 3626.7 ( 6.10x)

New benchmarks:
avg_8_2x2_c:                                            12.3 ( 1.00x)
avg_8_2x2_avx2:                                          8.1 ( 1.52x)
avg_8_4x4_c:                                            30.3 ( 1.00x)
avg_8_4x4_avx2:                                         11.3 ( 2.67x)
avg_8_8x8_c:                                           131.8 ( 1.00x)
avg_8_8x8_avx2:                                         21.3 ( 6.20x)
avg_8_16x16_c:                                         255.0 ( 1.00x)
avg_8_16x16_avx2:                                       30.6 ( 8.33x)
avg_8_32x32_c:                                         898.5 ( 1.00x)
avg_8_32x32_avx2:                                      104.9 ( 8.57x)
avg_8_64x64_c:                                        3317.7 ( 1.00x)
avg_8_64x64_avx2:                                      540.9 ( 6.13x)
avg_8_128x128_c:                                     12986.5 ( 1.00x)
avg_8_128x128_avx2:                                   1663.4 ( 7.81x)
w_avg_8_2x2_c:                                          16.8 ( 1.00x)
w_avg_8_2x2_avx2:                                       13.9 ( 1.21x)
w_avg_8_4x4_c:                                          48.2 ( 1.00x)
w_avg_8_4x4_avx2:                                       16.2 ( 2.98x)
w_avg_8_8x8_c:                                         168.6 ( 1.00x)
w_avg_8_8x8_avx2:                                       46.3 ( 3.64x)
w_avg_8_16x16_c:                                       392.4 ( 1.00x)
w_avg_8_16x16_avx2:                                     57.7 ( 6.80x)
w_avg_8_32x32_c:                                      1454.6 ( 1.00x)
w_avg_8_32x32_avx2:                                    214.6 ( 6.78x)
w_avg_8_64x64_c:                                      5638.4 ( 1.00x)
w_avg_8_64x64_avx2:                                    875.6 ( 6.44x)
w_avg_8_128x128_c:                                   22133.5 ( 1.00x)
w_avg_8_128x128_avx2:                                 3334.3 ( 6.64x)

This also saves 550 bytes of .text. The improvement will likely
be even larger on Win64, because the change avoids using two
nonvolatile registers in the weighted-average case.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-02-17 15:14:04 +01:00
parent b22b65f2f8
commit 7bf9c1e3f6

View File

@@ -64,12 +64,12 @@ SECTION .text
%rep %3
%define off %%i
AVG_LOAD_W16 0, off
%2
%2 %1
AVG_SAVE_W16 %1, 0, off
AVG_LOAD_W16 1, off
%2
%2 %1
AVG_SAVE_W16 %1, 1, off
%assign %%i %%i+1
@@ -84,7 +84,7 @@ SECTION .text
pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
movd xm1, [src1q]
pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
%2
%2 %1
AVG_SAVE_W2 %1
AVG_LOOP_END .w2
@@ -93,7 +93,7 @@ SECTION .text
pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
movq xm1, [src1q]
pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
%2
%2 %1
AVG_SAVE_W4 %1
AVG_LOOP_END .w4
@@ -103,7 +103,7 @@ SECTION .text
vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
vinserti128 m1, m1, [src1q], 0
vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
%2
%2 %1
AVG_SAVE_W8 %1
AVG_LOOP_END .w8
@@ -132,13 +132,15 @@ SECTION .text
RET
%endmacro
%macro AVG 0
%macro AVG 1
paddsw m0, m1
pmulhrsw m0, m2
%if %1 != 8
CLIPW m0, m3, m4
%endif
%endmacro
%macro W_AVG 0
%macro W_AVG 1
punpckhwd m5, m0, m1
pmaddwd m5, m3
paddd m5, m4
@@ -150,7 +152,9 @@ SECTION .text
psrad m0, xm2
packssdw m0, m5
%if %1 != 8
CLIPW m0, m6, m7
%endif
%endmacro
%macro AVG_LOAD_W16 2 ; line, offset
@@ -217,11 +221,13 @@ SECTION .text
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
movifnidn hd, hm
%if %1 != 8
pxor m3, m3 ; pixel min
vpbroadcastw m4, bdm ; pixel max
%endif
movifnidn bdd, bdm
inc bdd
@@ -245,7 +251,7 @@ cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
%macro VVC_W_AVG_AVX2 1
cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1
movifnidn hd, hm
@@ -255,8 +261,10 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
movd xm3, t0d
vpbroadcastd m3, xm3 ; w0, w1
%if %1 != 8
pxor m6, m6 ;pixel min
vpbroadcastw m7, r11m ;pixel max
%endif
mov t1q, rcx ; save ecx
mov ecx, r11m