swscale/aarch64: add NEON YUV420P/YUV422P/YUVA420P to RGB conversion

Add ARM64 NEON-accelerated unscaled YUV-to-RGB conversion for planar
YUV input formats. This extends the existing NV12/NV21 NEON paths with
YUV420P, YUV422P, and YUVA420P support for all packed RGB output
formats (ARGB, RGBA, ABGR, BGRA, RGB24, BGR24) and planar GBRP.

Also register the converters via ff_yuv2rgb_init_aarch64() so that the scaled path is covered.

checkasm: all 42 sw_yuv2rgb tests pass.
Speedup vs C at 1920px width (Apple M3 Max, avg of 20 runs):
  yuv420p->rgb24:   4.3x    yuv420p->argb:   3.1x
  yuv422p->rgb24:   5.5x    yuv422p->argb:   4.1x
  yuva420p->argb:   3.5x    yuva420p->rgba:  3.5x

Signed-off-by: David Christle <dev@christle.is>
Authored by David Christle on 2026-02-06 01:17:38 -05:00
Committed by Martin Storsjö
parent 8e591af32b
commit 7fab0becab
4 changed files with 208 additions and 15 deletions
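
For reference, the unscaled converters are selected through the ordinary libswscale API whenever the source and destination dimensions match; a minimal sketch (the frame size, flag choice, and helper name are illustrative, not part of this commit) looks like this:

    /* Illustrative only: a 1:1 YUV420P -> RGB24 conversion of the kind the
     * new NEON converters accelerate. */
    #include <libswscale/swscale.h>

    static int convert_1080p_yuv420p_to_rgb24(const uint8_t *const src[4],
                                              const int src_stride[4],
                                              uint8_t *const dst[4],
                                              const int dst_stride[4])
    {
        const int w = 1920, h = 1080;
        struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_YUV420P,
                                                w, h, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        if (!sws)
            return -1;
        sws_scale(sws, src, src_stride, 0, h, dst, dst_stride);
        sws_freeContext(sws);
        return 0;
    }

When the dimensions differ, the same request goes through the scaled path instead, which is why the converters are also registered in ff_yuv2rgb_init_aarch64().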

@@ -89,10 +89,45 @@ DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb24) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24) \
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \
int ff_yuva420p_to_##ofmt##_neon(int w, int h, \
uint8_t *dst, int linesize, \
const uint8_t *srcY, int linesizeY, \
const uint8_t *srcU, int linesizeU, \
const uint8_t *srcV, int linesizeV, \
const int16_t *table, \
int y_offset, int y_coeff, \
const uint8_t *srcA, int linesizeA); \
\
static int yuva420p_to_##ofmt##_neon_wrapper(SwsInternal *c, \
const uint8_t *const src[], \
const int srcStride[], int srcSliceY, \
int srcSliceH, uint8_t *const dst[], \
const int dstStride[]) { \
const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
\
return ff_yuva420p_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
src[0], srcStride[0], \
src[1], srcStride[1], \
src[2], srcStride[2], \
yuv2rgb_table, \
c->yuv2rgb_y_offset >> 6, \
c->yuv2rgb_y_coeff, \
src[3], srcStride[3]); \
}
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(argb)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(rgba)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(abgr)
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(bgra)
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
uint8_t *dst, int linesize, \
@@ -176,6 +211,8 @@ DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb24) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24) \
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
@@ -199,6 +236,8 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb24, RGB24, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \
} while (0)
static void get_unscaled_swscale_neon(SwsInternal *c) {
@@ -208,6 +247,13 @@ static void get_unscaled_swscale_neon(SwsInternal *c) {
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd);
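/* YUVA420P to outputs without an alpha channel reuses the plain yuv420p
 * converters; the alpha plane is simply ignored. */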
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd);
if (c->opts.dst_format == AV_PIX_FMT_YUV420P &&
(c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) &&
@@ -221,3 +267,47 @@ void ff_get_unscaled_swscale_aarch64(SwsInternal *c)
if (have_neon(cpu_flags))
get_unscaled_swscale_neon(c);
}
av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
if (!have_neon(cpu_flags) ||
(c->opts.src_h & 1) || (c->opts.src_w & 15) ||
(c->opts.flags & SWS_ACCURATE_RND))
return NULL;
if (c->opts.src_format == AV_PIX_FMT_YUV420P) {
switch (c->opts.dst_format) {
case AV_PIX_FMT_ARGB: return yuv420p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuv420p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuv420p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuv420p_to_bgra_neon_wrapper;
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
}
} else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
switch (c->opts.dst_format) {
#if CONFIG_SWSCALE_ALPHA
case AV_PIX_FMT_ARGB: return yuva420p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuva420p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuva420p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuva420p_to_bgra_neon_wrapper;
#endif
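/* no-alpha outputs: drop the alpha plane and reuse the yuv420p wrappers */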
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
}
} else if (c->opts.src_format == AV_PIX_FMT_YUV422P) {
switch (c->opts.dst_format) {
case AV_PIX_FMT_ARGB: return yuv422p_to_argb_neon_wrapper;
case AV_PIX_FMT_RGBA: return yuv422p_to_rgba_neon_wrapper;
case AV_PIX_FMT_ABGR: return yuv422p_to_abgr_neon_wrapper;
case AV_PIX_FMT_BGRA: return yuv422p_to_bgra_neon_wrapper;
case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper;
case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper;
case AV_PIX_FMT_GBRP: return yuv422p_to_gbrp_neon_wrapper;
}
}
return NULL;
}

@@ -55,7 +55,17 @@
load_dst1_dst2 24, 32, 40, 48
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@@ -78,7 +88,17 @@
load_dst1_dst2 40, 48, 56, 64
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -87,6 +107,18 @@
neg w11, w11
.endm
.macro load_args_yuva420p ofmt
load_args_yuv420p \ofmt
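// Darwin's arm64 ABI packs stack arguments to their natural alignment instead
// of 8-byte slots, so srcA/linesizeA sit at different offsets than on AAPCS64.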
#if defined(__APPLE__)
ldr x15, [sp, #32] // srcA
ldr w16, [sp, #40] // linesizeA
#else
ldr x15, [sp, #40] // srcA
ldr w16, [sp, #48] // linesizeA
#endif
sub w16, w16, w0 // w16 = linesizeA - width (paddingA)
.endm
.macro load_args_yuv422p ofmt
ldr x13, [sp] // srcV
ldr w14, [sp, #8] // linesizeV
@@ -99,7 +131,17 @@
load_dst1_dst2 40, 48, 56, 64
sub w3, w3, w0 // w3 = linesize - width (padding)
.else
.ifc \ofmt,rgb24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
.ifc \ofmt,bgr24
add w17, w0, w0, lsl #1
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
.else
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
.endif
.endif
.endif
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -125,6 +167,10 @@
ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_yuva420p
load_chroma_yuv420p
.endm
.macro load_chroma_yuv422p
load_chroma_yuv420p
.endm
@@ -147,6 +193,11 @@
add x13, x13, w17, sxtw // srcV += incV
.endm
.macro increment_yuva420p
increment_yuv420p
add x15, x15, w16, sxtw // srcA += paddingA (every row)
.endm
.macro increment_yuv422p
add x6, x6, w7, sxtw // srcU += incU
add x13, x13, w14, sxtw // srcV += incV
@@ -169,65 +220,103 @@
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
movi \a1, #255
movi \a2, #255
mov \a1, v30.8b
mov \a2, v30.8b
.endm
.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
mov \a1, v28.8b // real alpha (first 8 pixels)
mov \a2, v29.8b // real alpha (next 8 pixels)
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt \ofmt
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
movi v30.8b, #255 // alpha = 255 (loop-invariant)
mov w9, w1
1:
mov w8, w0 // w8 = width
2:
movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
sub v18.8h, v18.8h, v31.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v31.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
ld1 {v2.16b}, [x4], #16 // load luma (interleaved)
.ifc \ifmt,yuva420p
ld1 {v28.8b, v29.8b}, [x15], #16 // load 16 alpha bytes
.endif
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
ld1 {v2.16b}, [x4], #16 // load luma
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.else
compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.endif
.ifc \ofmt,rgba // 0 1 2 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.else
compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.endif
.ifc \ofmt,abgr // 3 2 1 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.else
compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.endif
.ifc \ofmt,bgra // 2 1 0 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.else
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
.endif
.ifc \ofmt,gbrp
.ifc \ofmt,rgb24
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
.else
.ifc \ofmt,bgr24
compute_rgb v6.8b,v5.8b,v4.8b, v18.8b,v17.8b,v16.8b
st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
.else
.ifc \ofmt,gbrp
compute_rgb v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
st1 { v4.8b, v5.8b }, [x2], #16
st1 { v6.8b, v7.8b }, [x10], #16
st1 { v18.8b, v19.8b }, [x15], #16
.else
.else
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
.endif
.endif
.endif
subs w8, w8, #16 // width -= 16
b.gt 2b
@@ -251,9 +340,20 @@ endfunc
declare_func \ifmt, abgr
declare_func \ifmt, bgra
declare_func \ifmt, gbrp
declare_func \ifmt, rgb24
declare_func \ifmt, bgr24
.endm
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
.macro declare_yuva_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
declare_yuva_funcs yuva420p
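
For readers following the fixed-point comments in the conversion loop above, the per-pixel arithmetic corresponds roughly to the scalar sketch below; the coefficient names are illustrative stand-ins for the signed values passed in via YUV_TO_RGB_TABLE and c->yuv2rgb_y_offset / c->yuv2rgb_y_coeff, and sqdmulh is modelled as a plain (a*b) >> 15.

    /* Scalar model of the steps annotated in the NEON loop (illustrative only;
     * sqdmulh additionally saturates, and compute_rgb later adds the luma to
     * each chroma term and narrows to 8 bits with saturation). */
    static void yuv2rgb_terms(int Y, int U, int V,
                              int y_offset, int y_coeff,
                              int v2r, int u2g, int v2g, int u2b,
                              int *luma, int *r_chroma, int *g_chroma, int *b_chroma)
    {
        int u = (U << 3) - (128 << 3);                     /* U*(1<<3) - 128*(1<<3) */
        int v = (V << 3) - (128 << 3);                     /* V*(1<<3) - 128*(1<<3) */

        *luma     = (((Y << 3) - y_offset) * y_coeff) >> 15;
        *r_chroma = (v * v2r) >> 15;                       /* V * v2r           (R) */
        *g_chroma = ((u * u2g) >> 15) + ((v * v2g) >> 15); /* U * u2g + V * v2g (G) */
        *b_chroma = (u * u2b) >> 15;                       /* U * u2b           (B) */
    }

Each pair of chroma terms is shared by two horizontally adjacent luma samples (the zip1/zip2 duplication), which is what lets the kernel emit 16 RGB pixels per iteration.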

@@ -735,6 +735,7 @@ av_cold int ff_sws_fill_xyztables(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_loongarch(SwsInternal *c);
SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c);
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{

@@ -568,6 +568,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsInternal *c)
t = ff_yuv2rgb_init_x86(c);
#elif ARCH_LOONGARCH64
t = ff_yuv2rgb_init_loongarch(c);
#elif ARCH_AARCH64
t = ff_yuv2rgb_init_aarch64(c);
#endif
if (t)