mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 12:50:49 +08:00
avcodec/x86/vp3dsp: Port ff_vp3_idct_dc_add_mmxext to SSE2
This change should improve performance on Skylake and later Intel CPUs, which have only half as many ports for saturated adds/subs on MMX registers as on XMM registers: llvm-mca predicts a 25% performance improvement on Skylake. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
;******************************************************************************
|
||||
;* MMX/SSE2-optimized functions for the VP3 decoder
|
||||
;* SSE2-optimized functions for the VP3 decoder
|
||||
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
; MMX-optimized functions cribbed from the original VP3 source code.
|
||||
; ASM-optimized functions cribbed from the original VP3 source code.
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
@@ -421,9 +421,9 @@ cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
|
||||
movq [dstq+s3q ], m5
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
|
||||
cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
|
||||
INIT_XMM sse2
|
||||
; void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
|
||||
cglobal vp3_idct_dc_add, 3, 4, 6, dst, s, block, dc
|
||||
movsx dcd, word [blockq]
|
||||
mov word [blockq], 0
|
||||
%define s3q blockq
|
||||
@@ -431,7 +431,7 @@ cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
|
||||
add dcd, 15
|
||||
sar dcd, 5
|
||||
movd m0, dcd
|
||||
pshufw m0, m0, 0x0
|
||||
SPLATW m0, m0
|
||||
pxor m1, m1
|
||||
psubw m1, m0
|
||||
packuswb m0, m0
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
|
||||
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
|
||||
void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
|
||||
int *bounding_values);
|
||||
@@ -44,15 +44,12 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2;
|
||||
|
||||
c->idct_put = ff_vp3_idct_put_sse2;
|
||||
c->idct_add = ff_vp3_idct_add_sse2;
|
||||
c->idct_dc_add = ff_vp3_idct_dc_add_sse2;
|
||||
|
||||
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
|
||||
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
|
||||
|
||||
@@ -92,7 +92,7 @@ static void vp3_check_idct(int nb_bits)
|
||||
BUF_SIZE = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
|
||||
};
|
||||
|
||||
declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
declare_func(void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
|
||||
DECLARE_ALIGNED(16, int16_t, block_new)[64];
|
||||
DECLARE_ALIGNED(16, int16_t, block_ref)[64];
|
||||
|
||||
Reference in New Issue
Block a user