tests/checkasm/vp3dsp: Add test for idct_add, idct_put, idct_dc_add

Due to a discrepancy between the SSE2 and the C versions, the
coefficients for idct_put and idct_add are restricted to a range
that does not cause overflows.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-04-12 23:26:56 +02:00
parent 84b9de0633
commit 88879f2eff

View File

@@ -16,8 +16,8 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "checkasm.h"
#include "libavutil/intreadwrite.h"
@@ -27,15 +27,14 @@
/*
 * Fill two buffers of single-byte elements with the same random contents.
 *
 * buf0, buf1: buffers whose elements have size 1 (enforced at compile time);
 *             both receive identical data.
 * size:       number of bytes to fill; evaluated more than once, so it must
 *             be free of side effects.
 *
 * The leading (size & ~3) bytes are written as 32-bit words through
 * AV_WN32A(); the remaining 0-3 bytes are written one at a time. All
 * accesses go through the char aliases b0/b1, so the pointer arithmetic is
 * in bytes no matter which single-byte element type the caller uses.
 */
#define randomize_buffers(buf0, buf1, size)                                  \
    do {                                                                     \
        static_assert(sizeof((buf0)[0]) == 1 && sizeof((buf1)[0]) == 1,      \
                      "Pointer arithmetic needs to be adapted");             \
        char *b0 = (char*)(buf0), *b1 = (char*)(buf1);                       \
        for (size_t k = 0; k < ((size) & ~3); k += 4) {                      \
            uint32_t r = rnd();                                              \
            AV_WN32A(b0 + k, r);                                             \
            AV_WN32A(b1 + k, r);                                             \
        }                                                                    \
        for (size_t k = (size) & ~3; k < (size); ++k)                        \
            b0[k] = b1[k] = rnd();                                           \
    } while (0)
static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
@@ -83,6 +82,46 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
bench_new(dst_new, src1, src1, stride, h);
}
/**
 * Run one reference-vs-new comparison for an IDCT-style function with the
 * signature (uint8_t *dest, ptrdiff_t stride, int16_t *block).
 *
 * The destination buffers are filled with identical random bytes and both
 * coefficient blocks with identical random values. After call_ref() and
 * call_new(), the complete destination buffers and the complete coefficient
 * blocks must match, otherwise fail() is invoked. The new function is then
 * benchmarked.
 *
 * @param nb_bits each coefficient is produced as (int32_t)rnd() >> (32 - nb_bits),
 *                i.e. presumably a signed value of nb_bits bits
 *                (relies on rnd() yielding 32 random bits -- confirm in checkasm.h)
 */
static void vp3_check_idct(int nb_bits)
{
    enum {
        WIDTH      = 8,
        NB_LINES   = 8,
        MIN_STRIDE = 16,
        MAX_STRIDE = 64,
        /* Enough for NB_LINES rows of WIDTH bytes at the largest stride. */
        BUF_SIZE   = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
    };
    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
    DECLARE_ALIGNED(16, int16_t, block_new)[64];
    DECLARE_ALIGNED(16, int16_t, block_ref)[64];
    DECLARE_ALIGNED(8, uint8_t, dstbuf_new)[BUF_SIZE];
    DECLARE_ALIGNED(8, uint8_t, dstbuf_ref)[BUF_SIZE];
    uint8_t *dst_ref = dstbuf_ref;
    uint8_t *dst_new = dstbuf_new;
    /* Random multiple of MIN_STRIDE in [MIN_STRIDE, MAX_STRIDE]. */
    ptrdiff_t stride = MIN_STRIDE * (rnd() % (MAX_STRIDE / MIN_STRIDE) + 1);
    const int shift  = 32 - nb_bits;

    /* In half of the runs use a negative stride, starting at the last row. */
    if (rnd() & 1) {
        const ptrdiff_t last_row = (NB_LINES - 1) * stride;

        dst_ref += last_row;
        dst_new += last_row;
        stride   = -stride;
    }

    randomize_buffers(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref));

    /* Same random coefficients for both implementations. */
    for (size_t i = 0; i < FF_ARRAY_ELEMS(block_new); i++) {
        const int32_t coeff = (int32_t)rnd() >> shift;

        block_ref[i] = coeff;
        block_new[i] = block_ref[i];
    }

    call_ref(dst_ref, stride, block_ref);
    call_new(dst_new, stride, block_new);

    /* Compare whole buffers, so bytes outside the 8x8 area are covered too,
     * as is the state the functions leave behind in the coefficient block. */
    if (!(memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new)) == 0 &&
          memcmp(block_new, block_ref, sizeof(block_new)) == 0))
        fail();

    bench_new(dst_new, stride, block_new);
}
static void vp3_check_loop_filter(const VP3DSPContext *const vp3dsp)
{
@@ -160,6 +199,19 @@ void checkasm_check_vp3dsp(void)
vp3_check_put_no_rnd_pixels_l2(&vp3dsp);
report("put_no_rnd_pixels_l2");
/*
 * Test one IDCT-style member of vp3dsp.
 *
 * func:    name of the VP3DSPContext member; also used (stringified) as the
 *          name passed to check_func() and report().
 * nb_bits: forwarded to vp3_check_idct() to bound the random coefficients.
 *
 * report() is called whether or not check_func() selected the function.
 * Wrapped in do { } while (0) so that the macro expands to one statement.
 */
#define IDCT_TEST(func, nb_bits)                   \
    do {                                           \
        if (check_func(vp3dsp.func, #func))        \
            vp3_check_idct(nb_bits);               \
        report(#func);                             \
    } while (0)
IDCT_TEST(idct_dc_add, 16);
// FIXME: The Theora specification actually requires using unsaturated
// 16-bit arithmetic for its idct. Yet the SSE2 version uses saturated
// arithmetic and even the C version seems to forget truncating
// intermediate values to 16 bit. For the time being, use a range
// that does not trigger overflow.
IDCT_TEST(idct_put, 8);
IDCT_TEST(idct_add, 8);
vp3_check_loop_filter(&vp3dsp);
report("loop_filter");
}