From 52ba2ac7bd48d09d1f8527376970e2b0e8ee5068 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 19:10:30 +0100
Subject: [PATCH] avfilter/x86/vf_fspp: Port mul_thrmat to SSE2

This fixes an ABI violation: mul_thrmat used MMX registers without
issuing emms. It seems that this violation could reach the user,
namely if ff_get_video_buffer() fails. Note that ff_get_video_buffer()
itself could fail because of this, namely if the allocator uses
floating point registers.

On x64 (where GCC already used SSE2 in the C version):
mul_thrmat_c:                                            4.4 ( 1.00x)
mul_thrmat_mmx:                                          8.6 ( 0.52x)
mul_thrmat_sse2:                                         4.4 ( 1.00x)

On 32bit (where the compiler cannot assume SSE2 to be available):
mul_thrmat_c:                                           56.0 ( 1.00x)
mul_thrmat_sse2:                                         6.0 ( 9.40x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_fspp.c          |  5 +-
 libavfilter/vf_fsppdsp.h       |  3 +-
 libavfilter/x86/vf_fspp.asm    | 84 +++++++++++++---------------------
 libavfilter/x86/vf_fspp_init.c |  6 ++-
 tests/checkasm/vf_fspp.c       |  8 ++--
 5 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9371c63e77..fa562cbd45 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -54,8 +54,6 @@
 
 typedef struct FSPPContext {
     const struct AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
 
     int log2_count;
     int strength;
@@ -72,6 +70,9 @@ typedef struct FSPPContext {
     int use_bframe_qp;
 
     FSPPDSPContext dsp;
+
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
 } FSPPContext;
 
 
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 0dbd628abf..e87fa6861c 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,7 +39,8 @@ typedef struct FSPPDSPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
+                       int16_t *thr_adr /* align 16 */, int q);
 
     void (*column_fidct)(int16_t *thr_adr, int16_t *data,
                          int16_t *output, int cnt);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 0ea6216193..c9408978d8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     jl .loop_height
     RET
 
-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
-    movd      m7, qd
-    movq      m0, [thrnq]
-    punpcklwd m7, m7
-    movq      m1, [thrnq+8]
-    punpckldq m7, m7
-    pmullw    m0, m7
-    movq      m2, [thrnq+8*2]
-    pmullw    m1, m7
-    movq      m3, [thrnq+8*3]
-    pmullw    m2, m7
-    movq      [thrq], m0
-    movq      m4, [thrnq+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8], m1
-    movq      m5, [thrnq+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*2], m2
-    movq      m6, [thrnq+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*3], m3
-    movq      m0, [thrnq+8*7]
-    pmullw    m6, m7
-    movq      [thrq+8*4], m4
-    movq      m1, [thrnq+8*7+8]
-    pmullw    m0, m7
-    movq      [thrq+8*5], m5
-    movq      m2, [thrnq+8*7+8*2]
-    pmullw    m1, m7
-    movq      [thrq+8*6], m6
-    movq      m3, [thrnq+8*7+8*3]
-    pmullw    m2, m7
-    movq      [thrq+8*7], m0
-    movq      m4, [thrnq+8*7+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8*7+8], m1
-    movq      m5, [thrnq+8*7+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*7+8*2], m2
-    movq      m6, [thrnq+8*7+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*7+8*3], m3
-    movq      m0, [thrnq+14*8]
-    pmullw    m6, m7
-    movq      [thrq+8*7+8*4], m4
-    movq      m1, [thrnq+14*8+8]
-    pmullw    m0, m7
-    movq      [thrq+8*7+8*5], m5
-    pmullw    m1, m7
-    movq      [thrq+8*7+8*6], m6
-    movq      [thrq+14*8], m0
-    movq      [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+INIT_XMM sse2
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+    movd      m4, qd
+    mova      m0, [thrnq]
+    punpcklwd m4, m4
+    mova      m1, [thrnq+16]
+    pshufd    m4, m4, 0
+    pmullw    m0, m4
+    mova      m2, [thrnq+16*2]
+    pmullw    m1, m4
+    mova      m3, [thrnq+16*3]
+    pmullw    m2, m4
+    mova      [thrq], m0
+    mova      m0, [thrnq+16*4]
+    pmullw    m3, m4
+    mova      [thrq+16], m1
+    mova      m1, [thrnq+16*5]
+    pmullw    m0, m4
+    mova      [thrq+16*2], m2
+    mova      m2, [thrnq+16*6]
+    pmullw    m1, m4
+    mova      [thrq+16*3], m3
+    mova      m3, [thrnq+16*7]
+    pmullw    m2, m4
+    mova      [thrq+16*4], m0
+    pmullw    m3, m4
+    mova      [thrq+16*5], m1
+    mova      [thrq+16*6], m2
+    mova      [thrq+16*7], m3
     RET
 
 %macro COLUMN_FDCT 1-3 0, 0
@@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
     add       outq, 8+%1
 %endmacro
 
+INIT_MMX mmx
 ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
 .fdct1:
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 2aadb50967..9f6095ce24 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
 void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     if (EXTERNAL_MMX(cpu_flags)) {
         s->store_slice  = ff_store_slice_mmx;
         s->store_slice2 = ff_store_slice2_mmx;
-        s->mul_thrmat   = ff_mul_thrmat_mmx;
         s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->mul_thrmat   = ff_mul_thrmat_sse2;
+    }
 }
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index a84ae8d5af..117e1c670e 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -18,6 +18,7 @@
 
 #include "checkasm.h"
 #include "libavfilter/vf_fsppdsp.h"
+#include "libavutil/mem_internal.h"
 
 #define randomize_buffers(buf)                           \
     do {                                                 \
@@ -29,10 +30,11 @@
 static void check_mul_thrmat(void)
 {
     FSPPDSPContext fspp;
-    int16_t src[64];
-    int16_t dst_ref[64], dst_new[64];
+    DECLARE_ALIGNED(16, int16_t, src)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_new)[64];
     const int q = (uint8_t)rnd();
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     ff_fsppdsp_init(&fspp);