From 52ba2ac7bd48d09d1f8527376970e2b0e8ee5068 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sun, 9 Nov 2025 19:10:30 +0100 Subject: [PATCH] avfilter/x86/vf_fspp: Port mul_thrmat to SSE2 This fixes an ABI violation, as mul_thrmat did not issue emms. It seems that this ABI violation could reach the user, namely if ff_get_video_buffer() fails. Notice that ff_get_video_buffer() itself could fail because of this, namely if the allocator uses floating point registers. On x64 (where GCC already used SSE2 in the C version) mul_thrmat_c: 4.4 ( 1.00x) mul_thrmat_mmx: 8.6 ( 0.52x) mul_thrmat_sse2: 4.4 ( 1.00x) On 32bit (where SSE2 is not known to be available): mul_thrmat_c: 56.0 ( 1.00x) mul_thrmat_sse2: 6.0 ( 9.40x) Signed-off-by: Andreas Rheinhardt --- libavfilter/vf_fspp.c | 5 +- libavfilter/vf_fsppdsp.h | 3 +- libavfilter/x86/vf_fspp.asm | 84 +++++++++++++--------------------- libavfilter/x86/vf_fspp_init.c | 6 ++- tests/checkasm/vf_fspp.c | 8 ++-- 5 files changed, 45 insertions(+), 61 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 9371c63e77..fa562cbd45 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -54,8 +54,6 @@ typedef struct FSPPContext { const struct AVClass *class; - uint64_t threshold_mtx_noq[8 * 2]; - uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions int log2_count; int strength; @@ -72,6 +70,9 @@ typedef struct FSPPContext { int use_bframe_qp; FSPPDSPContext dsp; + + DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2]; + DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2]; } FSPPContext; diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index 0dbd628abf..e87fa6861c 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -39,7 +39,8 @@ typedef struct FSPPDSPContext { ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q); + void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */, + int16_t *thr_adr /* align 16 */, int q); void (*column_fidct)(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index 0ea6216193..c9408978d8 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 jl .loop_height RET -;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -cglobal mul_thrmat, 3, 3, 0, thrn, thr, q - movd m7, qd - movq m0, [thrnq] - punpcklwd m7, m7 - movq m1, [thrnq+8] - punpckldq m7, m7 - pmullw m0, m7 - movq m2, [thrnq+8*2] - pmullw m1, m7 - movq m3, [thrnq+8*3] - pmullw m2, m7 - movq [thrq], m0 - movq m4, [thrnq+8*4] - pmullw m3, m7 - movq [thrq+8], m1 - movq m5, [thrnq+8*5] - pmullw m4, m7 - movq [thrq+8*2], m2 - movq m6, [thrnq+8*6] - pmullw m5, m7 - movq [thrq+8*3], m3 - movq m0, [thrnq+8*7] - pmullw m6, m7 - movq [thrq+8*4], m4 - movq m1, [thrnq+8*7+8] - pmullw m0, m7 - movq [thrq+8*5], m5 - movq m2, [thrnq+8*7+8*2] - pmullw m1, m7 - movq [thrq+8*6], m6 - movq m3, [thrnq+8*7+8*3] - pmullw m2, m7 - movq [thrq+8*7], m0 - movq m4, [thrnq+8*7+8*4] - pmullw m3, m7 - movq [thrq+8*7+8], m1 - movq m5, [thrnq+8*7+8*5] - pmullw m4, m7 - movq [thrq+8*7+8*2], m2 - movq m6, [thrnq+8*7+8*6] - pmullw m5, m7 - movq [thrq+8*7+8*3], m3 - movq m0, [thrnq+14*8] - pmullw m6, m7 - movq [thrq+8*7+8*4], m4 - movq m1, [thrnq+14*8+8] - pmullw m0, m7 - movq [thrq+8*7+8*5], m5 - pmullw m1, m7 - movq [thrq+8*7+8*6], m6 - movq [thrq+14*8], m0 - movq [thrq+14*8+8], m1 +;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +INIT_XMM sse2 +cglobal mul_thrmat, 3, 3, 5, thrn, thr, q + movd m4, qd + mova m0, [thrnq] + punpcklwd m4, m4 + mova m1, [thrnq+16] + pshufd m4, m4, 0 + pmullw m0, m4 + mova m2, [thrnq+16*2] + pmullw m1, m4 + mova m3, [thrnq+16*3] + pmullw m2, m4 + mova [thrq], m0 + mova m0, [thrnq+16*4] + pmullw m3, m4 + mova [thrq+16], m1 + mova m1, [thrnq+16*5] + pmullw m0, m4 + mova [thrq+16*2], m2 + mova m2, [thrnq+16*6] + pmullw m1, m4 + mova [thrq+16*3], m3 + mova m3, [thrnq+16*7] + pmullw m2, m4 + mova [thrq+16*4], m0 + pmullw m3, m4 + mova [thrq+16*5], m1 + mova [thrq+16*6], m2 + mova [thrq+16*7], m3 RET %macro COLUMN_FDCT 1-3 0, 0 @@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q add outq, 8+%1 %endmacro +INIT_MMX mmx ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp .fdct1: diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index 2aadb50967..9f6095ce24 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src, void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); @@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) if (EXTERNAL_MMX(cpu_flags)) { s->store_slice = ff_store_slice_mmx; s->store_slice2 = ff_store_slice2_mmx; - s->mul_thrmat = ff_mul_thrmat_mmx; s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } + if (EXTERNAL_SSE2(cpu_flags)) { + s->mul_thrmat = ff_mul_thrmat_sse2; + } } diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index a84ae8d5af..117e1c670e 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -18,6 +18,7 @@ #include "checkasm.h" #include "libavfilter/vf_fsppdsp.h" +#include "libavutil/mem_internal.h" #define randomize_buffers(buf) \ do { \ @@ -29,10 +30,11 @@ static void check_mul_thrmat(void) { FSPPDSPContext fspp; - int16_t src[64]; - int16_t dst_ref[64], dst_new[64]; + DECLARE_ALIGNED(16, int16_t, src)[64]; + DECLARE_ALIGNED(16, int16_t, dst_ref)[64]; + DECLARE_ALIGNED(16, int16_t, dst_new)[64]; const int q = (uint8_t)rnd(); - declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); + declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); ff_fsppdsp_init(&fspp);