mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avfilter/x86/vf_fspp: Port store_slice to SSE2
Old benchmarks:
store_slice_c:      2798.3 ( 1.00x)
store_slice_mmx:     950.2 ( 2.94x)
store_slice2_c:     3811.7 ( 1.00x)
store_slice2_mmx:    682.3 ( 5.59x)

New benchmarks:
store_slice_c:      2797.2 ( 1.00x)
store_slice_sse2:    543.5 ( 5.15x)
store_slice2_c:     3817.0 ( 1.00x)
store_slice2_sse2:   408.2 ( 9.35x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -31,11 +31,11 @@
|
||||
#include "libavutil/attributes_internal.h"
|
||||
|
||||
typedef struct FSPPDSPContext {
|
||||
void (*store_slice)(uint8_t *dst, int16_t *src,
|
||||
void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
|
||||
void (*store_slice2)(uint8_t *dst, int16_t *src,
|
||||
void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
|
||||
|
||||
@@ -43,15 +43,15 @@ SECTION .text
|
||||
|
||||
%define DCTSIZE 8
|
||||
|
||||
INIT_MMX mmx
|
||||
INIT_XMM sse2
|
||||
|
||||
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
%if ARCH_X86_64
|
||||
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
%else
|
||||
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
%define dst_strideq r2m
|
||||
%define src_strideq r3m
|
||||
mov widthq, r4m
|
||||
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
mov tmpq, src_strideq
|
||||
and widthq, ~7
|
||||
sub dst_strideq, widthq
|
||||
movd m5, ditherd ; log2_scale
|
||||
movd m4, ditherd ; log2_scale
|
||||
xor ditherq, -1 ; log2_scale
|
||||
mov tmp2q, tmpq
|
||||
add ditherq, 7 ; log2_scale
|
||||
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
mov src_strideq, tmp2q
|
||||
shl tmpq, 4
|
||||
lea dither_heightq, [ditherq+dither_heightq*8]
|
||||
pxor m7, m7
|
||||
pxor m1, m1
|
||||
|
||||
.loop_height:
|
||||
movq m3, [ditherq]
|
||||
movq m4, m3
|
||||
punpcklbw m3, m7
|
||||
punpckhbw m4, m7
|
||||
punpcklbw m3, m1
|
||||
mov tmp2q, widthq
|
||||
psraw m3, m5
|
||||
psraw m4, m5
|
||||
psraw m3, m4
|
||||
|
||||
.loop_width:
|
||||
movq [srcq+tmpq], m7
|
||||
movq m0, [srcq]
|
||||
movq m1, [srcq+8]
|
||||
movq [srcq+tmpq+8], m7
|
||||
mova m0, [srcq]
|
||||
mova [srcq+tmpq], m1
|
||||
paddw m0, m3
|
||||
paddw m1, m4
|
||||
movq [srcq], m7
|
||||
mova [srcq], m1
|
||||
psraw m0, m2
|
||||
psraw m1, m2
|
||||
movq [srcq+8], m7
|
||||
packuswb m0, m1
|
||||
packuswb m0, m0
|
||||
add srcq, 16
|
||||
movq [dstq], m0
|
||||
add dstq, 8
|
||||
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
jl .loop_height
|
||||
RET
|
||||
|
||||
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
%if ARCH_X86_64
|
||||
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
%else
|
||||
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
%define dst_strideq r2m
|
||||
%define src_strideq r3m
|
||||
mov dstq, dstm
|
||||
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
mov tmpq, src_strideq
|
||||
and widthq, ~7
|
||||
sub dst_strideq, widthq
|
||||
movd m5, ditherd ; log2_scale
|
||||
movd m4, ditherd ; log2_scale
|
||||
xor ditherq, -1 ; log2_scale
|
||||
mov tmp2q, tmpq
|
||||
add ditherq, 7 ; log2_scale
|
||||
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
mov src_strideq, tmp2q
|
||||
shl tmpq, 5
|
||||
lea dither_heightq, [ditherq+dither_heightq*8]
|
||||
pxor m7, m7
|
||||
pxor m1, m1
|
||||
|
||||
.loop_height:
|
||||
movq m3, [ditherq]
|
||||
movq m4, m3
|
||||
punpcklbw m3, m7
|
||||
punpckhbw m4, m7
|
||||
punpcklbw m3, m1
|
||||
mov tmp2q,widthq
|
||||
psraw m3, m5
|
||||
psraw m4, m5
|
||||
psraw m3, m4
|
||||
|
||||
.loop_width:
|
||||
movq m0, [srcq]
|
||||
movq m1, [srcq+8]
|
||||
mova m0, [srcq]
|
||||
paddw m0, m3
|
||||
paddw m0, [srcq+tmpq]
|
||||
paddw m1, m4
|
||||
movq m6, [srcq+tmpq+8]
|
||||
movq [srcq+tmpq], m7
|
||||
mova [srcq+tmpq], m1
|
||||
psraw m0, m2
|
||||
paddw m1, m6
|
||||
movq [srcq+tmpq+8], m7
|
||||
psraw m1, m2
|
||||
packuswb m0, m1
|
||||
packuswb m0, m0
|
||||
movq [dstq], m0
|
||||
add srcq, 16
|
||||
add dstq, 8
|
||||
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
RET
|
||||
|
||||
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
INIT_XMM sse2
|
||||
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
|
||||
movd m4, qd
|
||||
mova m0, [thrnq]
|
||||
|
||||
@@ -23,12 +23,12 @@
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/vf_fsppdsp.h"
|
||||
|
||||
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
|
||||
void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
|
||||
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
|
||||
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
s->store_slice = ff_store_slice_mmx;
|
||||
s->store_slice2 = ff_store_slice2_mmx;
|
||||
s->column_fidct = ff_column_fidct_mmx;
|
||||
s->row_idct = ff_row_idct_mmx;
|
||||
s->row_fdct = ff_row_fdct_mmx;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
s->store_slice = ff_store_slice_sse2;
|
||||
s->store_slice2 = ff_store_slice2_sse2;
|
||||
s->mul_thrmat = ff_mul_thrmat_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user