From ff85a20b7db4d3226ada8533b181989944f30e75 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Mon, 10 Nov 2025 22:06:34 +0100 Subject: [PATCH] avfilter/x86/vf_fspp: Port store_slice to SSE2 Old benchmarks: store_slice_c: 2798.3 ( 1.00x) store_slice_mmx: 950.2 ( 2.94x) store_slice2_c: 3811.7 ( 1.00x) store_slice2_mmx: 682.3 ( 5.59x) New benchmarks: store_slice_c: 2797.2 ( 1.00x) store_slice_sse2: 543.5 ( 5.15x) store_slice2_c: 3817.0 ( 1.00x) store_slice2_sse2: 408.2 ( 9.35x) Signed-off-by: Andreas Rheinhardt --- libavfilter/vf_fsppdsp.h | 4 +- libavfilter/x86/vf_fspp.asm | 70 +++++++++++++--------------------- libavfilter/x86/vf_fspp_init.c | 12 +++--- 3 files changed, 34 insertions(+), 52 deletions(-) diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index e87fa6861c..b440809f02 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -31,11 +31,11 @@ #include "libavutil/attributes_internal.h" typedef struct FSPPDSPContext { - void (*store_slice)(uint8_t *dst, int16_t *src, + void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*store_slice2)(uint8_t *dst, int16_t *src, + void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index c9408978d8..489e69f8ce 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -43,15 +43,15 @@ SECTION .text %define DCTSIZE 8 -INIT_MMX mmx +INIT_XMM sse2 -;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, -; ptrdiff_t dst_stride, ptrdiff_t src_stride, -; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +;void ff_store_slice_sse2(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) %if ARCH_X86_64 -cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 %else -cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2 %define dst_strideq r2m %define src_strideq r3m mov widthq, r4m @@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov tmpq, src_strideq and widthq, ~7 sub dst_strideq, widthq - movd m5, ditherd ; log2_scale + movd m4, ditherd ; log2_scale xor ditherq, -1 ; log2_scale mov tmp2q, tmpq add ditherq, 7 ; log2_scale @@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov src_strideq, tmp2q shl tmpq, 4 lea dither_heightq, [ditherq+dither_heightq*8] - pxor m7, m7 + pxor m1, m1 .loop_height: movq m3, [ditherq] - movq m4, m3 - punpcklbw m3, m7 - punpckhbw m4, m7 + punpcklbw m3, m1 mov tmp2q, widthq - psraw m3, m5 - psraw m4, m5 + psraw m3, m4 .loop_width: - movq [srcq+tmpq], m7 - movq m0, [srcq] - movq m1, [srcq+8] - movq [srcq+tmpq+8], m7 + mova m0, [srcq] + mova [srcq+tmpq], m1 paddw m0, m3 - paddw m1, m4 - movq [srcq], m7 + mova [srcq], m1 psraw m0, m2 - psraw m1, m2 - movq [srcq+8], m7 - packuswb m0, m1 + packuswb m0, m0 add srcq, 16 movq [dstq], m0 add dstq, 8 @@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 jl .loop_height RET -;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, -; ptrdiff_t dst_stride, ptrdiff_t src_stride, -; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) %if ARCH_X86_64 -cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 %else -cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2 %define dst_strideq r2m %define src_strideq r3m mov dstq, dstm @@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov tmpq, src_strideq and widthq, ~7 sub dst_strideq, widthq - movd m5, ditherd ; log2_scale + movd m4, ditherd ; log2_scale xor ditherq, -1 ; log2_scale mov tmp2q, tmpq add ditherq, 7 ; log2_scale @@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov src_strideq, tmp2q shl tmpq, 5 lea dither_heightq, [ditherq+dither_heightq*8] - pxor m7, m7 + pxor m1, m1 .loop_height: movq m3, [ditherq] - movq m4, m3 - punpcklbw m3, m7 - punpckhbw m4, m7 + punpcklbw m3, m1 mov tmp2q,widthq - psraw m3, m5 - psraw m4, m5 + psraw m3, m4 .loop_width: - movq m0, [srcq] - movq m1, [srcq+8] + mova m0, [srcq] paddw m0, m3 paddw m0, [srcq+tmpq] - paddw m1, m4 - movq m6, [srcq+tmpq+8] - movq [srcq+tmpq], m7 + mova [srcq+tmpq], m1 psraw m0, m2 - paddw m1, m6 - movq [srcq+tmpq+8], m7 - psraw m1, m2 - packuswb m0, m1 + packuswb m0, m0 movq [dstq], m0 add srcq, 16 add dstq, 8 @@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 RET ;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -INIT_XMM sse2 cglobal mul_thrmat, 3, 3, 5, thrn, thr, q movd m4, qd mova m0, [thrnq] diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index 9f6095ce24..ee875547d2 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -23,12 +23,12 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/vf_fsppdsp.h" -void ff_store_slice_mmx(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, +void ff_store_slice_sse2(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); @@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - s->store_slice = ff_store_slice_mmx; - s->store_slice2 = ff_store_slice2_mmx; s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } if (EXTERNAL_SSE2(cpu_flags)) { + s->store_slice = ff_store_slice_sse2; + s->store_slice2 = ff_store_slice2_sse2; s->mul_thrmat = ff_mul_thrmat_sse2; } }