avcodec/x86/h264_deblock_10bit: Remove custom stack allocation code

Allocate the stack space via cglobal as usual. This makes the SSE2/AVX functions available when HAVE_ALIGNED_STACK is false; it also avoids modifying rsp unnecessarily in the deblock_h_luma_intra_10 functions on Win64.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2026-04-23 02:11:14 +08:00 · 2026-01-07 12:46:22 +01:00
parent b1140d3c98
commit dbdf514c17
2 changed files with 5 additions and 22 deletions
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -153,14 +153,12 @@ cextern pw_1023
 ; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
 ;                           int8_t *tc0)
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
-    %assign pad 5*mmsize+12-(stack_offset&15)
+cglobal deblock_v_luma_10, 5,5,8,-5*mmsize
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
-    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
@@ -205,11 +203,9 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    add         r4, mmsize/8
    dec         r3
    jg .loop
-    ADD         rsp, pad
    RET

-cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
-    %assign pad 7*mmsize+12-(stack_offset&15)
+cglobal deblock_h_luma_10, 5,6,8,-7*mmsize
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
@@ -217,7 +213,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
-    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
@@ -295,7 +290,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
-    ADD        rsp, pad
    RET
 %endmacro

@@ -482,7 +476,6 @@ DEBLOCK_LUMA_64
 %endmacro

 %macro LUMA_INTRA_INIT 1
-    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
@@ -492,7 +485,6 @@ DEBLOCK_LUMA_64
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
 %endrep
-    SUB    rsp, pad
 %endmacro

 ; in: %1-%3=tmp, %4=p2, %5=q2
@@ -654,7 +646,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
 ; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
 ;                                 int beta)
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10, 4,7,16
+cglobal deblock_h_luma_intra_10, 4,7,16,mmsize
    %define t0 m15
    %define t1 m14
    %define t2 m2
@@ -667,8 +659,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
-    %assign pad 24-(stack_offset&15)
-    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
@@ -709,7 +699,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
-    ADD    rsp, pad
    RET
 %endmacro

@@ -727,7 +716,7 @@ DEBLOCK_LUMA_INTRA_64
 ; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
 ;                                 int beta)
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
+cglobal deblock_v_luma_intra_10, 4,7,8,-3*mmsize
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
@@ -749,14 +738,13 @@ cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    add     r4, mmsize
    dec     r6
    jg .loop
-    ADD    rsp, pad
    RET

 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
 ;                                 int beta)
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
+cglobal deblock_h_luma_intra_10, 4,7,8,-8*mmsize
    LUMA_INTRA_INIT 8
 %if mmsize == 8
    lea     r4, [r1*3]
@@ -793,7 +781,6 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    dec     r6
 %endif
    jg .loop
-    ADD    rsp, pad
    RET
 %endmacro

--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -314,12 +314,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
            }
-#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
-#endif /* HAVE_ALIGNED_STACK */
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
@@ -354,12 +352,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
            }
-#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_avx;
            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_avx;
-#endif /* HAVE_ALIGNED_STACK */
        }
    }
 }