diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index b79477288a..87943cb302 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -34,7 +34,6 @@ vp3_idct_data: times 8 dw 64277 times 8 dw 12785 cextern pb_80 -cextern pb_FE cextern pw_4 cextern pw_8 @@ -155,40 +154,35 @@ cglobal vp3_h_loop_filter, 3, 4, 6 RET %macro PAVGB_NO_RND 0 - mova m4, m0 - mova m5, m2 - pand m4, m1 - pand m5, m3 - pxor m1, m0 - pxor m3, m2 - pand m1, m6 - pand m3, m6 - psrlq m1, 1 - psrlq m3, 1 - paddb m4, m1 - paddb m5, m3 + pxor m0, m4 + pxor m1, m4 + pavgb m0, m1 + pxor m0, m4 %endmacro -INIT_MMX mmx -cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3 - mova m6, [pb_FE] +INIT_XMM sse2 +; void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a, +; const uint8_t *b, ptrdiff_t stride, +; int h) +cglobal vp3_put_no_rnd_pixels8_l2, 5, 6, 5, dst, src1, src2, stride, h, stride3 lea stride3q,[strideq+strideq*2] + pcmpeqb m4, m4 .loop: - mova m0, [src1q] - mova m1, [src2q] - mova m2, [src1q+strideq] - mova m3, [src2q+strideq] + movq m0, [src1q] + movq m1, [src2q] + movhps m0, [src1q+strideq] + movhps m1, [src2q+strideq] PAVGB_NO_RND - mova [dstq], m4 - mova [dstq+strideq], m5 + movq [dstq], m0 + movhps [dstq+strideq], m0 - mova m0, [src1q+strideq*2] - mova m1, [src2q+strideq*2] - mova m2, [src1q+stride3q] - mova m3, [src2q+stride3q] + movq m0, [src1q+strideq*2] + movq m1, [src2q+strideq*2] + movhps m0, [src1q+stride3q] + movhps m1, [src2q+stride3q] PAVGB_NO_RND - mova [dstq+strideq*2], m4 - mova [dstq+stride3q], m5 + movq [dstq+strideq*2], m0 + movhps [dstq+stride3q], m0 lea src1q, [src1q+strideq*4] lea src2q, [src2q+strideq*4] diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index 42daf99981..4a416c6f9a 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -36,23 +36,21 @@ void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride, void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride, int *bounding_values); -void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, - const uint8_t *b, ptrdiff_t stride, - int h); +void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a, + const uint8_t *b, ptrdiff_t stride, + int h); av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx; - } - if (EXTERNAL_MMXEXT(cpu_flags)) { c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { + c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2; + c->idct_put = ff_vp3_idct_put_sse2; c->idct_add = ff_vp3_idct_add_sse2; diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c index 44910b515b..b48a7514de 100644 --- a/tests/checkasm/vp3dsp.c +++ b/tests/checkasm/vp3dsp.c @@ -47,7 +47,7 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp) BUF_SIZE = MAX_STRIDE * (HEIGHT - 1) + WIDTH, SRC_BUF_SIZE = BUF_SIZE + (WIDTH - 1), ///< WIDTH-1 to use misaligned input }; - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, + declare_func(void, uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h);