swscale/aarch64: add NEON YUV420P/YUV422P/YUVA420P to RGB conversion

Add ARM64 NEON-accelerated unscaled YUV-to-RGB conversion for planar
YUV input formats. This extends the existing NV12/NV21 NEON paths with
YUV420P, YUV422P, and YUVA420P support for all packed RGB output
formats (ARGB, RGBA, ABGR, BGRA, RGB24, BGR24) and planar GBRP.

Also register the converters via ff_yuv2rgb_init_aarch64() so that the scaled path is covered.

checkasm: all 42 sw_yuv2rgb tests pass.
Speedup vs C at 1920px width (Apple M3 Max, avg of 20 runs):
  yuv420p->rgb24:   4.3x    yuv420p->argb:   3.1x
  yuv422p->rgb24:   5.5x    yuv422p->argb:   4.1x
  yuva420p->argb:   3.5x    yuva420p->rgba:  3.5x

Signed-off-by: David Christle <dev@christle.is>
Authored by David Christle on 2026-02-06 01:17:38 -05:00
Committed by Martin Storsjö
parent 8e591af32b
commit 7fab0becab
4 changed files with 208 additions and 15 deletions
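
For reference, the unscaled converters are selected through the ordinary libswscale API whenever the source and destination dimensions match; a minimal sketch (the frame size, flag choice, and helper name are illustrative, not part of this commit) looks like this:

    /* Illustrative only: a 1:1 YUV420P -> RGB24 conversion of the kind the
     * new NEON converters accelerate. */
    #include <libswscale/swscale.h>

    static int convert_1080p_yuv420p_to_rgb24(const uint8_t *const src[4],
                                              const int src_stride[4],
                                              uint8_t *const dst[4],
                                              const int dst_stride[4])
    {
        const int w = 1920, h = 1080;
        struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_YUV420P,
                                                w, h, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        if (!sws)
            return -1;
        sws_scale(sws, src, src_stride, 0, h, dst, dst_stride);
        sws_freeContext(sws);
        return 0;
    }

When the dimensions differ, the same request goes through the scaled path instead, which is why the converters are also registered in ff_yuv2rgb_init_aarch64().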

@@ -89,10 +89,45 @@ DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb24) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24) \
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \
int ff_yuva420p_to_##ofmt##_neon(int w, int h, \
uint8_t *dst, int linesize, \
const uint8_t *srcY, int linesizeY, \
const uint8_t *srcU, int linesizeU, \
const uint8_t *srcV, int linesizeV, \
const int16_t *table, \
int y_offset, int y_coeff, \
const uint8_t *srcA, int linesizeA); \
\
static int yuva420p_to_##ofmt##_neon_wrapper(SwsInternal *c, \
const uint8_t *const src[], \
const int srcStride[], int srcSliceY, \
int srcSliceH, uint8_t *const dst[], \
const int dstStride[]) { \
const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
\
return ff_yuva420p_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
src[0], srcStride[0], \
src[1], srcStride[1], \
src[2], srcStride[2], \
yuv2rgb_table, \
c->yuv2rgb_y_offset >> 6, \
c->yuv2rgb_y_coeff, \
src[3], srcStride[3]); \
}
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(argb)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(rgba)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(abgr)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(bgra)
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
uint8_t *dst, int linesize, \
@@ -176,6 +211,8 @@ DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb24) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24) \
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
@@ -199,6 +236,8 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb24, RGB24, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \
} while (0)
static void get_unscaled_swscale_neon(SwsInternal *c) {
@@ -208,6 +247,13 @@ static void get_unscaled_swscale_neon(SwsInternal *c) {
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd);
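/* YUVA420P to outputs without an alpha channel reuses the plain yuv420p
 * converters; the alpha plane is simply ignored. */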
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd);
if (c->opts.dst_format == AV_PIX_FMT_YUV420P &&
(c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) &&
@@ -221,3 +267,47 @@ void ff_get_unscaled_swscale_aarch64(SwsInternal *c)
if (have_neon(cpu_flags))
get_unscaled_swscale_neon(c);
}
av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
if (!have_neon(cpu_flags) ||
(c->opts.src_h & 1) || (c->opts.src_w & 15) ||
(c->opts.flags & SWS_ACCURATE_RND))
return NULL;
if (c->opts.src_format == AV_PIX_FMT_YUV420P) {
switch (c->opts.dst_format) {
case AV_PIX_FMT_ARGB: return yuv420p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuv420p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuv420p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuv420p_to_bgra_neon_wrapper;
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
}
} else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
switch (c->opts.dst_format) {
#if CONFIG_SWSCALE_ALPHA
case AV_PIX_FMT_ARGB: return yuva420p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuva420p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuva420p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuva420p_to_bgra_neon_wrapper;
#endif
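/* no-alpha outputs: drop the alpha plane and reuse the yuv420p wrappers */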
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
}
} else if (c->opts.src_format == AV_PIX_FMT_YUV422P) {
switch (c->opts.dst_format) {
case AV_PIX_FMT_ARGB: return yuv422p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuv422p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuv422p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuv422p_to_bgra_neon_wrapper;
case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv422p_to_gbrp_neon_wrapper;
}
}
return NULL;
}

@@ -55,7 +55,17 @@
load_dst1_dst2 24, 32, 40, 48
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@@ -78,7 +88,17 @@
load_dst1_dst2 40, 48, 56, 64
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -87,6 +107,18 @@
neg w11, w11
.endm
.macro load_args_yuva420p ofmt
load_args_yuv420p \ofmt
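// Darwin's arm64 ABI packs stack arguments to their natural alignment instead
// of 8-byte slots, so srcA/linesizeA sit at different offsets than on AAPCS64.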
#if defined(__APPLE__)
ldr x15, [sp, #32] // srcA
ldr w16, [sp, #40] // linesizeA
#else
ldr x15, [sp, #40] // srcA
ldr w16, [sp, #48] // linesizeA
#endif
sub w16, w16, w0 // w16 = linesizeA - width (paddingA)
.endm
.macro load_args_yuv422p ofmt
ldr x13, [sp] // srcV
ldr w14, [sp, #8] // linesizeV
@@ -99,7 +131,17 @@
load_dst1_dst2 40, 48, 56, 64
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -125,6 +167,10 @@
ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_yuva420p
load_chroma_yuv420p
.endm
.macro load_chroma_yuv422p
load_chroma_yuv420p
.endm
@@ -147,6 +193,11 @@
add x13, x13, w17, sxtw // srcV += incV
.endm
.macro increment_yuva420p
increment_yuv420p
add x15, x15, w16, sxtw // srcA += paddingA (every row)
.endm
.macro increment_yuv422p
add x6, x6, w7, sxtw // srcU += incU
add x13, x13, w14, sxtw // srcV += incV
@@ -169,65 +220,103 @@
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
movi \a1, #255
movi \a2, #255
mov \a1, v30.8b
mov \a2, v30.8b
.endm
.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
mov \a1, v28.8b // real alpha (first 8 pixels)
mov \a2, v29.8b // real alpha (next 8 pixels)
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt \ofmt
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
movi v30.8b, #255 // alpha = 255 (loop-invariant)
mov w9, w1
1:
mov w8, w0 // w8 = width
2:
movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
sub v18.8h, v18.8h, v31.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v31.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
ld1 {v2.16b}, [x4], #16 // load luma (interleaved)
.ifc \ifmt,yuva420p
ld1 {v28.8b, v29.8b}, [x15], #16 // load 16 alpha bytes
.endif
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
ld1 {v2.16b}, [x4], #16 // load luma
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.else
compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.endif
.ifc \ofmt,rgba // 0 1 2 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.else
compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.endif
.ifc \ofmt,abgr // 3 2 1 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.else
compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.endif
.ifc \ofmt,bgra // 2 1 0 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.else
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
.endif
.ifc \ofmt,gbrp
.ifc \ofmt,rgb24
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
.else
.ifc \ofmt,bgr24
compute_rgb v6.8b,v5.8b,v4.8b, v18.8b,v17.8b,v16.8b
st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
.else
.ifc \ofmt,gbrp
compute_rgb v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
st1 { v4.8b, v5.8b }, [x2], #16
st1 { v6.8b, v7.8b }, [x10], #16
st1 { v18.8b, v19.8b }, [x15], #16
.else
.else
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
.endif
.endif
.endif
subs w8, w8, #16 // width -= 16
b.gt 2b
@@ -251,9 +340,20 @@ endfunc
declare_func \ifmt, abgr
declare_func \ifmt, bgra
declare_func \ifmt, gbrp
declare_func \ifmt, rgb24
declare_func \ifmt, bgr24
.endm
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
.macro declare_yuva_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
declare_yuva_funcs yuva420p
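
For readers following the fixed-point comments in the conversion loop above, the per-pixel arithmetic corresponds roughly to the scalar sketch below; the coefficient names are illustrative stand-ins for the signed values passed in via YUV_TO_RGB_TABLE and c->yuv2rgb_y_offset / c->yuv2rgb_y_coeff, and sqdmulh is modelled as a plain (a*b) >> 15.

    /* Scalar model of the steps annotated in the NEON loop (illustrative only;
     * sqdmulh additionally saturates, and compute_rgb later adds the luma to
     * each chroma term and narrows to 8 bits with saturation). */
    static void yuv2rgb_terms(int Y, int U, int V,
                              int y_offset, int y_coeff,
                              int v2r, int u2g, int v2g, int u2b,
                              int *luma, int *r_chroma, int *g_chroma, int *b_chroma)
    {
        int u = (U << 3) - (128 << 3);                     /* U*(1<<3) - 128*(1<<3) */
        int v = (V << 3) - (128 << 3);                     /* V*(1<<3) - 128*(1<<3) */

        *luma     = (((Y << 3) - y_offset) * y_coeff) >> 15;
        *r_chroma = (v * v2r) >> 15;                       /* V * v2r           (R) */
        *g_chroma = ((u * u2g) >> 15) + ((v * v2g) >> 15); /* U * u2g + V * v2g (G) */
        *b_chroma = (u * u2b) >> 15;                       /* U * u2b           (B) */
    }

Each pair of chroma terms is shared by two horizontally adjacent luma samples (the zip1/zip2 duplication), which is what lets the kernel emit 16 RGB pixels per iteration.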

@@ -735,6 +735,7 @@ av_cold int ff_sws_fill_xyztables(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_loongarch(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c);
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{

@@ -568,6 +568,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsInternal *c)
t = ff_yuv2rgb_init_x86(c);
#elif ARCH_LOONGARCH64
t = ff_yuv2rgb_init_loongarch(c);
#elif ARCH_AARCH64
t = ff_yuv2rgb_init_aarch64(c);
#endif
if (t)