avfilter/x86/vf_fspp: Port store_slice to SSE2

Old benchmarks:
store_slice_c:                                        2798.3 ( 1.00x)
store_slice_mmx:                                       950.2 ( 2.94x)
store_slice2_c:                                       3811.7 ( 1.00x)
store_slice2_mmx:                                      682.3 ( 5.59x)

New benchmarks:
store_slice_c:                                        2797.2 ( 1.00x)
store_slice_sse2:                                      543.5 ( 5.15x)
store_slice2_c:                                       3817.0 ( 1.00x)
store_slice2_sse2:                                     408.2 ( 9.35x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-10 22:06:34 +01:00
parent 570f8fc6c9
commit ff85a20b7d
3 changed files with 34 additions and 52 deletions

View File

@@ -31,11 +31,11 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
void (*store_slice)(uint8_t *dst, int16_t *src,
void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void (*store_slice2)(uint8_t *dst, int16_t *src,
void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);

View File

@@ -43,15 +43,15 @@ SECTION .text
%define DCTSIZE 8
INIT_MMX mmx
INIT_XMM sse2
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov widthq, r4m
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
movd m5, ditherd ; log2_scale
movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
pxor m7, m7
pxor m1, m1
.loop_height:
movq m3, [ditherq]
movq m4, m3
punpcklbw m3, m7
punpckhbw m4, m7
punpcklbw m3, m1
mov tmp2q, widthq
psraw m3, m5
psraw m4, m5
psraw m3, m4
.loop_width:
movq [srcq+tmpq], m7
movq m0, [srcq]
movq m1, [srcq+8]
movq [srcq+tmpq+8], m7
mova m0, [srcq]
mova [srcq+tmpq], m1
paddw m0, m3
paddw m1, m4
movq [srcq], m7
mova [srcq], m1
psraw m0, m2
psraw m1, m2
movq [srcq+8], m7
packuswb m0, m1
packuswb m0, m0
add srcq, 16
movq [dstq], m0
add dstq, 8
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
jl .loop_height
RET
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov dstq, dstm
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
movd m5, ditherd ; log2_scale
movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
pxor m7, m7
pxor m1, m1
.loop_height:
movq m3, [ditherq]
movq m4, m3
punpcklbw m3, m7
punpckhbw m4, m7
punpcklbw m3, m1
mov tmp2q,widthq
psraw m3, m5
psraw m4, m5
psraw m3, m4
.loop_width:
movq m0, [srcq]
movq m1, [srcq+8]
mova m0, [srcq]
paddw m0, m3
paddw m0, [srcq+tmpq]
paddw m1, m4
movq m6, [srcq+tmpq+8]
movq [srcq+tmpq], m7
mova [srcq+tmpq], m1
psraw m0, m2
paddw m1, m6
movq [srcq+tmpq+8], m7
psraw m1, m2
packuswb m0, m1
packuswb m0, m0
movq [dstq], m0
add srcq, 16
add dstq, 8
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
RET
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
INIT_XMM sse2
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movd m4, qd
mova m0, [thrnq]

View File

@@ -23,12 +23,12 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_fsppdsp.h"
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->store_slice = ff_store_slice_mmx;
s->store_slice2 = ff_store_slice2_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
}
}