avcodec/x86/h264_deblock_10bit: Remove custom stack allocation code

Allocate it via cglobal as usual. This makes the SSE2/AVX functions
available when HAVE_ALIGNED_STACK is false; it also avoids
modifying rsp unnecessarily in the deblock_h_luma_intra_10 functions
on Win64.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-01-07 12:46:22 +01:00
parent b1140d3c98
commit dbdf514c17
2 changed files with 5 additions and 22 deletions

View File

@@ -153,14 +153,12 @@ cextern pw_1023
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
; int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
cglobal deblock_v_luma_10, 5,5,8,-5*mmsize
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am [rsp+mmsize*3]
%define bm [rsp+mmsize*4]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
@@ -205,11 +203,9 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
add r4, mmsize/8
dec r3
jg .loop
ADD rsp, pad
RET
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
cglobal deblock_h_luma_10, 5,6,8,-7*mmsize
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
@@ -217,7 +213,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%define p2m [rsp+mmsize*4]
%define am [rsp+mmsize*5]
%define bm [rsp+mmsize*6]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
@@ -295,7 +290,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
lea r2, [r2+r1*(mmsize/2)]
dec r5
jg .loop
ADD rsp, pad
RET
%endmacro
@@ -482,7 +476,6 @@ DEBLOCK_LUMA_64
%endmacro
%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
@@ -492,7 +485,6 @@ DEBLOCK_LUMA_64
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
SUB rsp, pad
%endmacro
; in: %1-%3=tmp, %4=p2, %5=q2
@@ -654,7 +646,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
; int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
cglobal deblock_h_luma_intra_10, 4,7,16,mmsize
%define t0 m15
%define t1 m14
%define t2 m2
@@ -667,8 +659,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
SUB rsp, pad
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
@@ -709,7 +699,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
lea r4, [r4+r1*8]
dec r6
jg .loop
ADD rsp, pad
RET
%endmacro
@@ -727,7 +716,7 @@ DEBLOCK_LUMA_INTRA_64
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
; int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
cglobal deblock_v_luma_intra_10, 4,7,8,-3*mmsize
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
@@ -749,14 +738,13 @@ cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
add r4, mmsize
dec r6
jg .loop
ADD rsp, pad
RET
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
; int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
cglobal deblock_h_luma_intra_10, 4,7,8,-8*mmsize
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
@@ -793,7 +781,6 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
dec r6
%endif
jg .loop
ADD rsp, pad
RET
%endmacro

View File

@@ -314,12 +314,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
@@ -354,12 +352,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
}