avcodec/x86/vvc/of: Avoid jump

At the end of the height==8 codepath, a jump to the RET at the end
of the height==16 codepath is performed. Yet the epilogue
is so cheap on Unix64 that this jump is not worthwhile.
On Win64, meanwhile, one can still avoid jumps: for width 16
content with >8bpp and for width 8 content with 8bpp, a jump is
performed to the end of the height==8 codepath, where it is immediately
followed by a jump to RET. These two jumps can be combined into one.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-02-28 16:25:32 +01:00
parent 2a93d09968
commit 2159e40ab3

View File

@@ -71,7 +71,7 @@ INIT_YMM avx2
pmulhrsw %1, m11
%endmacro
%macro SAVE 2 ; dst, src
%macro SAVE 2-3 ""; dst, src, jump target
cmp pixel_maxd, (1 << 8) - 1
jne %%save_16bpc
@@ -80,14 +80,22 @@ INIT_YMM avx2
cmp wd, 16
je %%w16_8
movq %1, xm%2
%ifnidn %3, ""
jmp %3
%else
jmp %%end
%endif
%%save_16bpc:
CLIPW m%2, m9, m10
cmp wd, 16
jne %%w8_16
movu %1, m%2
%ifnidn %3, ""
jmp %3
%else
jmp %%end
%endif
%%w16_8:
vpermq m%2, m%2, q0020
@@ -98,7 +106,7 @@ INIT_YMM avx2
; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1]
; [rsp + odd * mmsize] are gradient_v[0] - gradient_v[1]
%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd
%macro APPLY_BDOF_MIN_BLOCK 3-4 ""; block_num, vx, vy, jump target
pxor m9, m9
movd xm10, pixel_maxd
@@ -118,7 +126,7 @@ INIT_YMM avx2
SAVE [dstq + 2 * dsq], 6
APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 3
SAVE [dstq + ds3q], 6
SAVE [dstq + ds3q], 6, %4
%endmacro
%macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp
@@ -327,7 +335,12 @@ INIT_YMM avx2
%if (%2)
BDOF_PROF_GRAD %1 * 4 + 3, %2
BDOF_VX_VY 12, 13
APPLY_BDOF_MIN_BLOCK %1, m12, m13, bd
%if UNIX64
APPLY_BDOF_MIN_BLOCK %1, m12, m13
%else
APPLY_BDOF_MIN_BLOCK %1, m12, m13, .end
%endif
%else
mova m14, m12
mova m15, m13
@@ -340,7 +353,7 @@ INIT_YMM avx2
paddw m15, m13
BDOF_VX_VY 14, 15
APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd
APPLY_BDOF_MIN_BLOCK %1, m14, m15
lea dstq, [dstq + 4 * dsq]
%endif
%endmacro
@@ -375,7 +388,11 @@ PROLOGUE 6, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3
cmp hd, 16
je .h16
BDOF_MINI_BLOCKS 1, 1
%if UNIX64
RET
%else
jmp .end
%endif
.h16:
BDOF_MINI_BLOCKS 1, 0