avcodec/x86/sbcdsp: Port ff_sbc_analyze_[48]_mmx to SSE2

Halves the number of pmaddwd instructions and improves performance a lot:
sbc_analyze_4_c:                                        55.7 ( 1.00x)
sbc_analyze_4_mmx:                                       7.0 ( 7.94x)
sbc_analyze_4_sse2:                                      4.3 (12.93x)
sbc_analyze_8_c:                                       131.1 ( 1.00x)
sbc_analyze_8_mmx:                                      22.4 ( 5.84x)
sbc_analyze_8_sse2:                                     10.7 (12.25x)

It also saves 224B of .text and allows removing the emms_c() call
from sbcenc.c (note that ff_sbc_calc_scalefactors_mmx()
issues emms on its own, so it already abides by the ABI).

Note: One pshufd per function could be avoided if the constants
were reordered.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-03-24 15:55:45 +01:00
parent 7cf5e90586
commit cd886bf0a5
4 changed files with 70 additions and 62 deletions

View File

@@ -31,7 +31,6 @@
*/
#include "libavutil/channel_layout.h"
#include "libavutil/emms.h"
#include "libavutil/opt.h"
#include "avcodec.h"
#include "codec_internal.h"
@@ -322,7 +321,6 @@ static int sbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
frame->blocks,
frame->channels,
frame->subbands);
emms_c();
sbc_pack_frame(avpkt, frame, j, sbc->msbc);
*got_packet_ptr = 1;

View File

@@ -38,43 +38,44 @@ SECTION .text
%endif
%endmacro
%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
NIDN movq, %5, %3
NIDN movq, %6, %4
pmaddwd %5, [constsq+%9]
pmaddwd %6, [constsq+%9+8]
NIDN paddd, %1, %7
NIDN paddd, %2, %8
%endmacro
%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
%endmacro
%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
%ifidn %7, pack
psrad %3, 16 ; SBC_PROTO_FIXED_SCALE
packssdw %3, %3
%macro ANALYZE_MAC 6 ; out1, out2, tmp1, tmp2, offset, aligned
mov%6 %3, [inq+%5]
mov%6 %4, [inq+%5+mmsize]
%if %5 == 0
pcmpeqd m0, m0
psrld m0, 31
%endif
ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6
pmaddwd %3, [constsq+%5]
pmaddwd %4, [constsq+%5+mmsize]
%if %5 == 0
pslld m0, 15 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) as dword
%endif
NIDN paddd, %1, %3
NIDN paddd, %2, %4
%endmacro
;*******************************************************************
;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 16
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 32
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 48
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 64
INIT_XMM sse2
cglobal sbc_analyze_4, 3, 3, 5, in, out, consts
ANALYZE_MAC m1, m2, m1, m2, 0, u
ANALYZE_MAC m1, m2, m3, m4, 32, u
movu m3, [inq+64]
paddd m1, m0
pmaddwd m3, [constsq+64]
paddd m1, m2
paddd m1, m3
ANALYZE_MAC_REG m0, m2, m0, m0, m2, 80, pack
ANALYZE_MAC_REG m0, m2, m1, m1, m3, 96, pack
psrad m1, 16
packssdw m1, m1
pshufd m2, m1, q0000
pmaddwd m2, [constsq+80]
pshufd m1, m1, q1111
pmaddwd m1, [constsq+96]
paddd m1, m2
movq [outq ], m0
movq [outq+8], m2
mova [outq], m1
RET
@@ -82,34 +83,41 @@ cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
;*******************************************************************
;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
ANALYZE_MAC_IN m2, m3, m2, m3, [scale_mask], [scale_mask], 16
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 32
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 48
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 64
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 80
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 96
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 112
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 128
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 144
INIT_XMM sse2
cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
ANALYZE_MAC m1, m2, m1, m2, 0, a
ANALYZE_MAC m1, m2, m3, m4, 32, a
paddd m1, m0
ANALYZE_MAC m1, m2, m3, m4, 64, a
ANALYZE_MAC m1, m2, m3, m4, 96, a
paddd m2, m0
ANALYZE_MAC m1, m2, m3, m4, 128, a
ANALYZE_MAC_REG m4, m5, m0, m4, m5, 160, pack
ANALYZE_MAC_REG m4, m5, m1, m6, m7, 192, pack
ANALYZE_MAC_REG m4, m5, m2, m6, m7, 224, pack
ANALYZE_MAC_REG m4, m5, m3, m6, m7, 256, pack
psrad m1, 16
psrad m2, 16
packssdw m1, m2
movq [outq ], m4
movq [outq+8], m5
pshufd m2, m1, q0000
pmaddwd m0, m2, [constsq+160]
pshufd m3, m1, q1111
pmaddwd m2, [constsq+176]
pmaddwd m4, m3, [constsq+192]
pshufd m5, m1, q2222
pmaddwd m3, [constsq+208]
paddd m0, m4
pmaddwd m4, m5, [constsq+224]
pshufd m1, m1, q3333
pmaddwd m5, [constsq+240]
paddd m2, m3
pmaddwd m3, m1, [constsq+256]
paddd m0, m4
pmaddwd m1, [constsq+272]
paddd m0, m3
paddd m2, m5
ANALYZE_MAC_REG m0, m5, m0, m0, m5, 176, no
ANALYZE_MAC_REG m0, m5, m1, m1, m7, 208, no
ANALYZE_MAC_REG m0, m5, m2, m2, m7, 240, no
ANALYZE_MAC_REG m0, m5, m3, m3, m7, 272, no
movq [outq+16], m0
movq [outq+24], m5
mova [outq], m0
paddd m2, m1
mova [outq+16], m2
RET

View File

@@ -26,7 +26,7 @@
/**
* @file
* SBC MMX optimization for some basic "building bricks"
* SBC DSP optimization for some basic "building bricks"
*/
#include "libavutil/attributes.h"
@@ -34,8 +34,8 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbcdsp.h"
void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
@@ -45,8 +45,10 @@ av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
}
}

View File

@@ -41,7 +41,7 @@ static void check_sbc_analyze(SBCDSPContext *sbcdsp)
DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_ref)[SBC_MAX_SUBBANDS];
DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_new)[SBC_MAX_SUBBANDS];
declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *in, int32_t *out, const int16_t *consts);
declare_func(void, const int16_t *in, int32_t *out, const int16_t *consts);
for (int i = 0; i < 2; ++i) {
if (check_func(i ? sbcdsp->sbc_analyze_8 : sbcdsp->sbc_analyze_4, "sbc_analyze_%u", i ? 8 : 4)) {