tests/checkasm/vp3dsp: Add test for idct_add, idct_put, idct_dc_add

Due to a discrepancy between the SSE2 and C versions, the coefficients for idct_put and idct_add are restricted to a range that does not cause overflows.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2026-04-20 12:50:49 +08:00 · 2026-04-12 23:26:56 +02:00
parent 84b9de0633
commit 88879f2eff
1 changed files with 58 additions and 6 deletions
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -16,8 +16,8 @@
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

-#include <assert.h>
 #include <stddef.h>
+#include <string.h>

 #include "checkasm.h"
 #include "libavutil/intreadwrite.h"
@@ -27,15 +27,14 @@

 #define randomize_buffers(buf0, buf1, size)                \
    do {                                                   \
-        static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
-                      "Pointer arithmetic needs to be adapted"); \
+        char *b0 = (char*)buf0, *b1 = (char*)buf1; /* byte views: k is a byte offset for any element type */ \
        for (size_t k = 0; k < (size & ~3); k += 4) {      \
            uint32_t r = rnd();                            \
-            AV_WN32A(buf0 + k, r);                         \
-            AV_WN32A(buf1 + k, r);                         \
+            AV_WN32A(b0 + k, r);                           \
+            AV_WN32A(b1 + k, r);                           \
        }                                                  \
        for (size_t k = size & ~3; k < size; ++k)          \
-            buf0[k] = buf1[k] = rnd();                     \
+            b0[k] = b1[k] = rnd();                         \
    } while (0)

 static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
@@ -83,6 +82,46 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
    bench_new(dst_new, src1, src1, stride, h);
 }

+static void vp3_check_idct(int nb_bits) // runs ref and new on identical random input and compares; nb_bits = signed width of the coefficients
+{
+    enum {
+        MAX_STRIDE   = 64, // largest |stride| that can be picked below
+        MIN_STRIDE   = 16, // stride is always a multiple of this
+        NB_LINES     = 8,  // rows addressed through dest (presumably an 8x8 block; block[] holds 64 coefficients)
+        WIDTH        = 8,  // bytes per row that the last row still needs
+        BUF_SIZE     = MAX_STRIDE * (NB_LINES - 1) + WIDTH, // NB_LINES rows at MAX_STRIDE, without padding after the last row
+    };
+
+    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block); // prototype of the function under test
+
+    DECLARE_ALIGNED(16, int16_t, block_new)[64]; // coefficients handed to the new implementation
+    DECLARE_ALIGNED(16, int16_t, block_ref)[64]; // same coefficients for the reference
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_new)[BUF_SIZE]; // destination pixels for the new implementation
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_ref)[BUF_SIZE]; // destination pixels for the reference
+
+    ptrdiff_t stride = (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1) * MIN_STRIDE; // one of 16, 32, 48, 64
+    uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+
+    if (rnd() & 1) { // randomly also exercise negative strides
+        // Flip stride.
+        dst_new += (NB_LINES - 1) * stride; // point at the last row ...
+        dst_ref += (NB_LINES - 1) * stride;
+        stride   = -stride;                 // ... so that following rows lie at lower addresses
+    }
+
+    randomize_buffers(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)); // both destinations start with the same random bytes
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(block_new); ++k) {
+        int32_t r = (int32_t)rnd() >> (32 - nb_bits); // keeps the top nb_bits as a signed value; relies on arithmetic >> (presumably rnd() yields 32 random bits)
+        block_new[k] = block_ref[k] = r;
+    }
+
+    call_ref(dst_ref, stride, block_ref);
+    call_new(dst_new, stride, block_new);
+    if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new)) || // whole buffers are compared, so bytes between rows must match too
+        memcmp(block_new, block_ref, sizeof(block_new)))      // the coefficient blocks must also be identical after the calls
+        fail();
+    bench_new(dst_new, stride, block_new);
+}

 static void vp3_check_loop_filter(const VP3DSPContext *const vp3dsp)
 {
@@ -160,6 +199,19 @@ void checkasm_check_vp3dsp(void)
    vp3_check_put_no_rnd_pixels_l2(&vp3dsp);
    report("put_no_rnd_pixels_l2");

+#define IDCT_TEST(func, nb_bits) /* check vp3dsp.func using coefficients nb_bits wide, then report under its name */ \
+    if (check_func(vp3dsp.func, #func)) \
+        vp3_check_idct(nb_bits);        \
+    report(#func)
+    IDCT_TEST(idct_dc_add, 16);
+    // FIXME: The Theora specification actually requires using unsaturated
+    // 16-bit arithmetic for its idct. Yet the SSE2 version uses saturated
+    // arithmetic and even the C version seems to forget truncating
+    // intermediate values to 16 bit. For the time being, use a range
+    // that does not trigger overflow.
+    IDCT_TEST(idct_put, 8);
+    IDCT_TEST(idct_add, 8);
+
    vp3_check_loop_filter(&vp3dsp);
    report("loop_filter");
 }