mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avcodec/x86/sbcdsp: Port ff_sbc_analyze_[48]_mmx to SSE2
Halfs the amount of pmaddwd and improves performance a lot: sbc_analyze_4_c: 55.7 ( 1.00x) sbc_analyze_4_mmx: 7.0 ( 7.94x) sbc_analyze_4_sse2: 4.3 (12.93x) sbc_analyze_8_c: 131.1 ( 1.00x) sbc_analyze_8_mmx: 22.4 ( 5.84x) sbc_analyze_8_sse2: 10.7 (12.25x) It also saves 224B of .text and allows to remove the emms_c() from sbcenc.c (notice that ff_sbc_calc_scalefactors_mmx() issues emms on its own, so it already abides by the ABI). Hint: A pshufd could be avoided per function if the constants were reordered. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -31,7 +31,6 @@
|
||||
*/
|
||||
|
||||
#include "libavutil/channel_layout.h"
|
||||
#include "libavutil/emms.h"
|
||||
#include "libavutil/opt.h"
|
||||
#include "avcodec.h"
|
||||
#include "codec_internal.h"
|
||||
@@ -322,7 +321,6 @@ static int sbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
|
||||
frame->blocks,
|
||||
frame->channels,
|
||||
frame->subbands);
|
||||
emms_c();
|
||||
sbc_pack_frame(avpkt, frame, j, sbc->msbc);
|
||||
|
||||
*got_packet_ptr = 1;
|
||||
|
||||
@@ -38,43 +38,44 @@ SECTION .text
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
|
||||
NIDN movq, %5, %3
|
||||
NIDN movq, %6, %4
|
||||
pmaddwd %5, [constsq+%9]
|
||||
pmaddwd %6, [constsq+%9+8]
|
||||
NIDN paddd, %1, %7
|
||||
NIDN paddd, %2, %8
|
||||
%endmacro
|
||||
|
||||
%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
|
||||
ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
|
||||
%endmacro
|
||||
|
||||
%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
|
||||
%ifidn %7, pack
|
||||
psrad %3, 16 ; SBC_PROTO_FIXED_SCALE
|
||||
packssdw %3, %3
|
||||
%macro ANALYZE_MAC 6 ; out1, out2, tmp1, tmp2, offset, aligned
|
||||
mov%6 %3, [inq+%5]
|
||||
mov%6 %4, [inq+%5+mmsize]
|
||||
%if %5 == 0
|
||||
pcmpeqd m0, m0
|
||||
psrld m0, 31
|
||||
%endif
|
||||
ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6
|
||||
pmaddwd %3, [constsq+%5]
|
||||
pmaddwd %4, [constsq+%5+mmsize]
|
||||
%if %5 == 0
|
||||
pslld m0, 15 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) as dword
|
||||
%endif
|
||||
NIDN paddd, %1, %3
|
||||
NIDN paddd, %2, %4
|
||||
%endmacro
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
;*******************************************************************
|
||||
INIT_MMX mmx
|
||||
cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
|
||||
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
|
||||
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 16
|
||||
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 32
|
||||
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 48
|
||||
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 64
|
||||
INIT_XMM sse2
|
||||
cglobal sbc_analyze_4, 3, 3, 5, in, out, consts
|
||||
ANALYZE_MAC m1, m2, m1, m2, 0, u
|
||||
ANALYZE_MAC m1, m2, m3, m4, 32, u
|
||||
movu m3, [inq+64]
|
||||
paddd m1, m0
|
||||
pmaddwd m3, [constsq+64]
|
||||
paddd m1, m2
|
||||
paddd m1, m3
|
||||
|
||||
ANALYZE_MAC_REG m0, m2, m0, m0, m2, 80, pack
|
||||
ANALYZE_MAC_REG m0, m2, m1, m1, m3, 96, pack
|
||||
psrad m1, 16
|
||||
packssdw m1, m1
|
||||
pshufd m2, m1, q0000
|
||||
pmaddwd m2, [constsq+80]
|
||||
pshufd m1, m1, q1111
|
||||
pmaddwd m1, [constsq+96]
|
||||
paddd m1, m2
|
||||
|
||||
movq [outq ], m0
|
||||
movq [outq+8], m2
|
||||
mova [outq], m1
|
||||
|
||||
RET
|
||||
|
||||
@@ -82,34 +83,41 @@ cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
|
||||
;*******************************************************************
|
||||
;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
;*******************************************************************
|
||||
INIT_MMX mmx
|
||||
cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
|
||||
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
|
||||
ANALYZE_MAC_IN m2, m3, m2, m3, [scale_mask], [scale_mask], 16
|
||||
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 32
|
||||
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 48
|
||||
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 64
|
||||
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 80
|
||||
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 96
|
||||
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 112
|
||||
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 128
|
||||
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 144
|
||||
INIT_XMM sse2
|
||||
cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
|
||||
ANALYZE_MAC m1, m2, m1, m2, 0, a
|
||||
ANALYZE_MAC m1, m2, m3, m4, 32, a
|
||||
paddd m1, m0
|
||||
ANALYZE_MAC m1, m2, m3, m4, 64, a
|
||||
ANALYZE_MAC m1, m2, m3, m4, 96, a
|
||||
paddd m2, m0
|
||||
ANALYZE_MAC m1, m2, m3, m4, 128, a
|
||||
|
||||
ANALYZE_MAC_REG m4, m5, m0, m4, m5, 160, pack
|
||||
ANALYZE_MAC_REG m4, m5, m1, m6, m7, 192, pack
|
||||
ANALYZE_MAC_REG m4, m5, m2, m6, m7, 224, pack
|
||||
ANALYZE_MAC_REG m4, m5, m3, m6, m7, 256, pack
|
||||
psrad m1, 16
|
||||
psrad m2, 16
|
||||
packssdw m1, m2
|
||||
|
||||
movq [outq ], m4
|
||||
movq [outq+8], m5
|
||||
pshufd m2, m1, q0000
|
||||
pmaddwd m0, m2, [constsq+160]
|
||||
pshufd m3, m1, q1111
|
||||
pmaddwd m2, [constsq+176]
|
||||
pmaddwd m4, m3, [constsq+192]
|
||||
pshufd m5, m1, q2222
|
||||
pmaddwd m3, [constsq+208]
|
||||
paddd m0, m4
|
||||
pmaddwd m4, m5, [constsq+224]
|
||||
pshufd m1, m1, q3333
|
||||
pmaddwd m5, [constsq+240]
|
||||
paddd m2, m3
|
||||
pmaddwd m3, m1, [constsq+256]
|
||||
paddd m0, m4
|
||||
pmaddwd m1, [constsq+272]
|
||||
paddd m0, m3
|
||||
paddd m2, m5
|
||||
|
||||
ANALYZE_MAC_REG m0, m5, m0, m0, m5, 176, no
|
||||
ANALYZE_MAC_REG m0, m5, m1, m1, m7, 208, no
|
||||
ANALYZE_MAC_REG m0, m5, m2, m2, m7, 240, no
|
||||
ANALYZE_MAC_REG m0, m5, m3, m3, m7, 272, no
|
||||
|
||||
movq [outq+16], m0
|
||||
movq [outq+24], m5
|
||||
mova [outq], m0
|
||||
paddd m2, m1
|
||||
mova [outq+16], m2
|
||||
|
||||
RET
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
|
||||
/**
|
||||
* @file
|
||||
* SBC MMX optimization for some basic "building bricks"
|
||||
* SBC DSP optimization for some basic "building bricks"
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
@@ -34,8 +34,8 @@
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/sbcdsp.h"
|
||||
|
||||
void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
|
||||
uint32_t scale_factor[2][8],
|
||||
int blocks, int channels, int subbands);
|
||||
@@ -45,8 +45,10 @@ av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
|
||||
s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
|
||||
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
|
||||
s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ static void check_sbc_analyze(SBCDSPContext *sbcdsp)
|
||||
DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_ref)[SBC_MAX_SUBBANDS];
|
||||
DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_new)[SBC_MAX_SUBBANDS];
|
||||
|
||||
declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
declare_func(void, const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
if (check_func(i ? sbcdsp->sbc_analyze_8 : sbcdsp->sbc_analyze_4, "sbc_analyze_%u", i ? 8 : 4)) {
|
||||
|
||||
Reference in New Issue
Block a user