From 88879f2effb241fdabe1c9b87f6e50b11f9443a2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 12 Apr 2026 23:26:56 +0200
Subject: [PATCH] tests/checkasm/vp3dsp: Add test for idct_add, idct_put,
 idct_dc_add

Due to a discrepancy between the SSE2 and the C version, the
coefficients for idct_put and idct_add are restricted to a range
that does not cause overflows.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 tests/checkasm/vp3dsp.c | 64 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index b48a7514de..f75e4a0617 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -16,8 +16,8 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#include <assert.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "checkasm.h"
 #include "libavutil/intreadwrite.h"
@@ -27,15 +27,14 @@
 
 #define randomize_buffers(buf0, buf1, size)                \
     do {                                                   \
-        static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
-                      "Pointer arithmetic needs to be adapted"); \
+        char *b0 = (char*)(buf0), *b1 = (char*)(buf1); /* byte views of the whole argument */ \
         for (size_t k = 0; k < (size & ~3); k += 4) {      \
             uint32_t r = rnd();                            \
-            AV_WN32A(buf0 + k, r);                         \
-            AV_WN32A(buf1 + k, r);                         \
+            AV_WN32A(b0 + k, r);                           \
+            AV_WN32A(b1 + k, r);                           \
         }                                                  \
         for (size_t k = size & ~3; k < size; ++k)          \
-            buf0[k] = buf1[k] = rnd();                     \
+            b0[k] = b1[k] = rnd();                         \
     } while (0)
 
 static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
@@ -83,6 +82,46 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
     bench_new(dst_new, src1, src1, stride, h);
 }
 
+static void vp3_check_idct(int nb_bits) // nb_bits: bits kept per random coefficient
+{ // Runs call_ref()/call_new() on identical random input and compares every output.
+    enum {
+        MAX_STRIDE   = 64,
+        MIN_STRIDE   = 16,
+        NB_LINES     = 8,
+        WIDTH        = 8,
+        BUF_SIZE     = MAX_STRIDE * (NB_LINES - 1) + WIDTH, // the last row needs only WIDTH bytes
+    };
+
+    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    // One coefficient block and one destination buffer per implementation.
+    DECLARE_ALIGNED(16, int16_t, block_new)[64];
+    DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_new)[BUF_SIZE];
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_ref)[BUF_SIZE];
+    // Random multiple of MIN_STRIDE between MIN_STRIDE and MAX_STRIDE.
+    ptrdiff_t stride = (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1) * MIN_STRIDE;
+    uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+
+    if (rnd() & 1) {
+        // Flip stride: point at row NB_LINES - 1 and negate the stride.
+        dst_new += (NB_LINES - 1) * stride;
+        dst_ref += (NB_LINES - 1) * stride;
+        stride   = -stride;
+    }
+    // Same random bytes in both destination buffers, same coefficients in both blocks.
+    randomize_buffers(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref));
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(block_new); ++k) {
+        int32_t r = (int32_t)rnd() >> (32 - nb_bits); // keeps the top nb_bits bits (sign-extended if >> is arithmetic)
+        block_new[k] = block_ref[k] = r;
+    }
+    // The complete buffers and blocks are compared, not only the rows addressed via dst.
+    call_ref(dst_ref, stride, block_ref);
+    call_new(dst_new, stride, block_new);
+    if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new)) ||
+        memcmp(block_new, block_ref, sizeof(block_new)))
+        fail();
+    bench_new(dst_new, stride, block_new); // NOTE(review): block_new has already been through call_new(); confirm it is still a representative input
+}
 
 static void vp3_check_loop_filter(const VP3DSPContext *const vp3dsp)
 {
@@ -160,6 +199,19 @@ void checkasm_check_vp3dsp(void)
     vp3_check_put_no_rnd_pixels_l2(&vp3dsp);
     report("put_no_rnd_pixels_l2");
 
+#define IDCT_TEST(func, nb_bits)        \
+    if (check_func(vp3dsp.func, #func)) \
+        vp3_check_idct(nb_bits);        \
+    report(#func) /* nb_bits is forwarded to vp3_check_idct() as the coefficient width */
+    IDCT_TEST(idct_dc_add, 16);
+    // FIXME: The Theora specification actually requires using unsaturated
+    // 16-bit arithmetic for its IDCT. Yet the SSE2 version uses saturated
+    // arithmetic and even the C version seems to forget to truncate
+    // intermediate values to 16 bits. For the time being, use a range
+    // that does not trigger overflow.
+    IDCT_TEST(idct_put, 8);
+    IDCT_TEST(idct_add, 8);
+
     vp3_check_loop_filter(&vp3dsp);
     report("loop_filter");
 }