mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-23 02:11:14 +08:00
avcodec/x86/h264_deblock_10bit: Remove custom stack allocation code
Allocate the stack space via cglobal as usual. This makes the SSE2/AVX functions available when HAVE_ALIGNED_STACK is false; it also avoids modifying rsp unnecessarily in the deblock_h_luma_intra_10 functions on Win64.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -153,14 +153,12 @@ cextern pw_1023
|
||||
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
|
||||
; int8_t *tc0)
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
|
||||
%assign pad 5*mmsize+12-(stack_offset&15)
|
||||
cglobal deblock_v_luma_10, 5,5,8,-5*mmsize
|
||||
%define tcm [rsp]
|
||||
%define ms1 [rsp+mmsize]
|
||||
%define ms2 [rsp+mmsize*2]
|
||||
%define am [rsp+mmsize*3]
|
||||
%define bm [rsp+mmsize*4]
|
||||
SUB rsp, pad
|
||||
shl r2d, 2
|
||||
shl r3d, 2
|
||||
LOAD_AB m4, m5, r2d, r3d
|
||||
@@ -205,11 +203,9 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
|
||||
add r4, mmsize/8
|
||||
dec r3
|
||||
jg .loop
|
||||
ADD rsp, pad
|
||||
RET
|
||||
|
||||
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
|
||||
%assign pad 7*mmsize+12-(stack_offset&15)
|
||||
cglobal deblock_h_luma_10, 5,6,8,-7*mmsize
|
||||
%define tcm [rsp]
|
||||
%define ms1 [rsp+mmsize]
|
||||
%define ms2 [rsp+mmsize*2]
|
||||
@@ -217,7 +213,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
|
||||
%define p2m [rsp+mmsize*4]
|
||||
%define am [rsp+mmsize*5]
|
||||
%define bm [rsp+mmsize*6]
|
||||
SUB rsp, pad
|
||||
shl r2d, 2
|
||||
shl r3d, 2
|
||||
LOAD_AB m4, m5, r2d, r3d
|
||||
@@ -295,7 +290,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
|
||||
lea r2, [r2+r1*(mmsize/2)]
|
||||
dec r5
|
||||
jg .loop
|
||||
ADD rsp, pad
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@@ -482,7 +476,6 @@ DEBLOCK_LUMA_64
|
||||
%endmacro
|
||||
|
||||
%macro LUMA_INTRA_INIT 1
|
||||
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
|
||||
%define t0 m4
|
||||
%define t1 m5
|
||||
%define t2 m6
|
||||
@@ -492,7 +485,6 @@ DEBLOCK_LUMA_64
|
||||
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
|
||||
%assign i i+1
|
||||
%endrep
|
||||
SUB rsp, pad
|
||||
%endmacro
|
||||
|
||||
; in: %1-%3=tmp, %4=p2, %5=q2
|
||||
@@ -654,7 +646,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
|
||||
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
|
||||
; int beta)
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_h_luma_intra_10, 4,7,16
|
||||
cglobal deblock_h_luma_intra_10, 4,7,16,mmsize
|
||||
%define t0 m15
|
||||
%define t1 m14
|
||||
%define t2 m2
|
||||
@@ -667,8 +659,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
|
||||
%define p2 m13
|
||||
%define p3 m4
|
||||
%define spill [rsp]
|
||||
%assign pad 24-(stack_offset&15)
|
||||
SUB rsp, pad
|
||||
lea r4, [r1*4]
|
||||
lea r5, [r1*3] ; 3*stride
|
||||
add r4, r0 ; pix+4*stride
|
||||
@@ -709,7 +699,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
|
||||
lea r4, [r4+r1*8]
|
||||
dec r6
|
||||
jg .loop
|
||||
ADD rsp, pad
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@@ -727,7 +716,7 @@ DEBLOCK_LUMA_INTRA_64
|
||||
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
|
||||
; int beta)
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
|
||||
cglobal deblock_v_luma_intra_10, 4,7,8,-3*mmsize
|
||||
LUMA_INTRA_INIT 3
|
||||
lea r4, [r1*4]
|
||||
lea r5, [r1*3]
|
||||
@@ -749,14 +738,13 @@ cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
|
||||
add r4, mmsize
|
||||
dec r6
|
||||
jg .loop
|
||||
ADD rsp, pad
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
|
||||
; int beta)
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
|
||||
cglobal deblock_h_luma_intra_10, 4,7,8,-8*mmsize
|
||||
LUMA_INTRA_INIT 8
|
||||
%if mmsize == 8
|
||||
lea r4, [r1*3]
|
||||
@@ -793,7 +781,6 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
|
||||
dec r6
|
||||
%endif
|
||||
jg .loop
|
||||
ADD rsp, pad
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
|
||||
@@ -314,12 +314,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
} else {
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
|
||||
}
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
|
||||
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
|
||||
#endif /* HAVE_ALIGNED_STACK */
|
||||
}
|
||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
|
||||
@@ -354,12 +352,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
} else {
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
|
||||
}
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
|
||||
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
|
||||
#endif /* HAVE_ALIGNED_STACK */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user