From 88879f2effb241fdabe1c9b87f6e50b11f9443a2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt
Date: Sun, 12 Apr 2026 23:26:56 +0200
Subject: [PATCH] tests/checkasm/vp3dsp: Add test for idct_add, idct_put, idct_dc_add

Due to a discrepancy between SSE2 and the C version
coefficients for idct_put and idct_add are restricted
to a range not causing overflows.

Signed-off-by: Andreas Rheinhardt
---
Notes (review commentary, not part of the commit message):

What the patch does
  * randomize_buffers(): the static_assert on byte-sized elements is
    dropped; both buffers are now addressed through char pointers, so the
    offsets k are byte offsets whatever the element type of buf0/buf1 is.
  * vp3_check_idct(nb_bits): new checker used for idct_dc_add, idct_put and
    idct_add.
      - stride is a random multiple of MIN_STRIDE between MIN_STRIDE and
        MAX_STRIDE; when (rnd() & 1) is set it is negated and dst_new /
        dst_ref are moved to the last of the NB_LINES rows.
      - both destination buffers receive identical random bytes; both
        coefficient blocks receive identical values obtained by shifting a
        random 32-bit value right by (32 - nb_bits).
      - after call_ref()/call_new() the complete destination buffers and
        the coefficient blocks are compared; any difference calls fail().
  * checkasm_check_vp3dsp(): runs the checker with nb_bits = 16 for
    idct_dc_add and nb_bits = 8 for idct_put / idct_add; the FIXME comment
    in the last hunk gives the reason for the smaller range.

Review remarks
  * IDCT_TEST(func, mask): the second parameter is forwarded unchanged as
    the nb_bits argument of vp3_check_idct(), so "mask" is a misnomer.
    The macro expands to an unbraced if followed by a separate report()
    statement, so it is only safe as a statement of its own (which is how
    it is used here).
  * (int32_t)rnd() >> (32 - nb_bits) right-shifts a value that can be
    negative; ISO C leaves that result implementation-defined. Presumably
    an arithmetic shift (sign-extended nb_bits-wide values) is what is
    wanted -- to be confirmed.
  * The second sentence of the commit message would read more easily as
    "Due to a discrepancy between the SSE2 and the C version, coefficients
    for idct_put and idct_add are restricted to a range that does not
    cause overflows."

About this copy of the patch
  * The header names of the three #include lines in the first hunk and the
    e-mail addresses are missing (it looks as if everything in angle
    brackets was stripped), and the original whitespace was lost.
    Indentation, backslash alignment and the position of blank context
    lines were laid out again to agree with the hunk line counts; compare
    with the original mail before applying.

 tests/checkasm/vp3dsp.c | 64 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index b48a7514de..f75e4a0617 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -16,8 +16,8 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#include
 #include
+#include
 
 #include "checkasm.h"
 #include "libavutil/intreadwrite.h"
@@ -27,15 +27,14 @@
 
 #define randomize_buffers(buf0, buf1, size)                           \
     do {                                                              \
-        static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1,   \
-                      "Pointer arithmetic needs to be adapted");      \
+        char *b0 = (char*)buf0, *b1 = (char*)buf1;                    \
         for (size_t k = 0; k < (size & ~3); k += 4) {                 \
             uint32_t r = rnd();                                       \
-            AV_WN32A(buf0 + k, r);                                    \
-            AV_WN32A(buf1 + k, r);                                    \
+            AV_WN32A(b0 + k, r);                                      \
+            AV_WN32A(b1 + k, r);                                      \
         }                                                             \
         for (size_t k = size & ~3; k < size; ++k)                     \
-            buf0[k] = buf1[k] = rnd();                                \
+            b0[k] = b1[k] = rnd();                                    \
     } while (0)
 
 static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
@@ -83,6 +82,46 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
     bench_new(dst_new, src1, src1, stride, h);
 }
 
+static void vp3_check_idct(int nb_bits)
+{
+    enum {
+        MAX_STRIDE = 64,
+        MIN_STRIDE = 16,
+        NB_LINES = 8,
+        WIDTH = 8,
+        BUF_SIZE = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
+    };
+
+    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+    DECLARE_ALIGNED(16, int16_t, block_new)[64];
+    DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_new)[BUF_SIZE];
+    DECLARE_ALIGNED(8, uint8_t, dstbuf_ref)[BUF_SIZE];
+
+    ptrdiff_t stride = (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1) * MIN_STRIDE;
+    uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+
+    if (rnd() & 1) {
+        // Flip stride.
+        dst_new += (NB_LINES - 1) * stride;
+        dst_ref += (NB_LINES - 1) * stride;
+        stride = -stride;
+    }
+
+    randomize_buffers(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref));
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(block_new); ++k) {
+        int32_t r = (int32_t)rnd() >> (32 - nb_bits);
+        block_new[k] = block_ref[k] = r;
+    }
+
+    call_ref(dst_ref, stride, block_ref);
+    call_new(dst_new, stride, block_new);
+    if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new)) ||
+        memcmp(block_new, block_ref, sizeof(block_new)))
+        fail();
+    bench_new(dst_new, stride, block_new);
+}
 
 static void vp3_check_loop_filter(const VP3DSPContext *const vp3dsp)
 {
@@ -160,6 +199,19 @@ void checkasm_check_vp3dsp(void)
     vp3_check_put_no_rnd_pixels_l2(&vp3dsp);
     report("put_no_rnd_pixels_l2");
 
+#define IDCT_TEST(func, mask)                   \
+    if (check_func(vp3dsp.func, #func))         \
+        vp3_check_idct(mask);                   \
+    report(#func)
+    IDCT_TEST(idct_dc_add, 16);
+    // FIXME: The Theora specification actually requires using unsaturated
+    // 16-bit arithmetic for its idct. Yet the SSE2 version uses saturated
+    // arithmetic and even the C version seems to forget truncating
+    // intermediate values to 16 bit. For the time being, use a range
+    // that does not trigger overflow.
+    IDCT_TEST(idct_put, 8);
+    IDCT_TEST(idct_add, 8);
+
     vp3_check_loop_filter(&vp3dsp);
     report("loop_filter");
 }