swscale/ops_memcpy: add 'memcpy' backend for plane->plane copies

Provides a generic fast path for any operation list that can be decomposed into a series of memcpy and memset operations. It is 25% faster than the x86 backend for yuv444p -> yuva444p, and 33% faster than the x86 backend for gray -> yuvj444p.
2026-04-20 21:00:41 +08:00 · 2025-05-08 19:04:49 +02:00
parent 5aef513fb4
commit a151b426f9
3 changed files with 136 additions and 1 deletions
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -18,6 +18,7 @@ OBJS = alphablend.o                                     \
       ops.o                                            \
       ops_backend.o                                    \
       ops_chain.o                                      \
+       ops_memcpy.o                                     \
       ops_optimizer.o                                  \
       options.o                                        \
       output.o                                         \
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -27,9 +27,11 @@
 #include "ops.h"
 #include "ops_internal.h"

-const extern SwsOpBackend backend_c;
+extern const SwsOpBackend backend_c;
+extern const SwsOpBackend backend_murder;

 const SwsOpBackend * const ff_sws_op_backends[] = {
+    &backend_murder, /* memcpy backend, listed ahead of backend_c; presumably tried first -- confirm against the backend lookup code */
    &backend_c,
    NULL
 };
--- a/libswscale/ops_memcpy.c
+++ b/libswscale/ops_memcpy.c
@@ -0,0 +1,132 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "ops_backend.h"
+
+typedef struct MemcpyPriv { /* per-op plan: filled in by compile(), read by process() */
+    int num_planes; /* number of output planes to fill; taken from the WRITE op's rw.elems */
+    int index[4]; /* input plane copied into each output plane, or -1 to clear plane */
+    uint8_t clear_value[4]; /* byte passed to memset() for planes whose index is -1 */
+} MemcpyPriv;
+
+/* Memcpy backend for trivial cases: process() fills every output plane either with a constant byte or with a verbatim copy of one input plane */
+
+static void process(const SwsOpExec *exec, const void *priv,
+                    int x_start, int y_start, int x_end, int y_end)
+{
+    const MemcpyPriv *p = priv; /* plan produced by compile() */
+    const int lines = y_end - y_start; /* number of rows in this call */
+    av_assert1(x_start == 0 && x_end == exec->width); /* only full-width rows are handled */
+
+    for (int i = 0; i < p->num_planes; i++) {
+        uint8_t *out = exec->out[i];
+        const int idx = p->index[i];
+        if (idx < 0) { /* cleared plane: a single memset spanning all rows, stride padding included */
+            memset(out, p->clear_value[i], exec->out_stride[i] * lines);
+        } else if (exec->out_stride[i] == exec->in_stride[idx]) { /* equal strides: all rows copied with one memcpy */
+            memcpy(out, exec->in[idx], exec->out_stride[i] * lines);
+        } else { /* strides differ: copy one row at a time */
+            const int bytes = x_end * exec->block_size_out; /* bytes copied per row; NOTE(review): assumes block_size_out is bytes per pixel here -- confirm */
+            const uint8_t *in = exec->in[idx];
+            for (int y = y_start; y < y_end; y++) {
+                memcpy(out, in, bytes);
+                out += exec->out_stride[i]; /* advance each pointer by its own stride */
+                in  += exec->in_stride[idx];
+            }
+        }
+    }
+}
+
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{ /* Translate ops into a MemcpyPriv plan stored in *out; returns 0, AVERROR(ENOTSUP) for any op list this backend cannot express, or AVERROR(ENOMEM). ctx is unused. */
+    MemcpyPriv p = {0};
+
+    for (int n = 0; n < ops->num_ops; n++) {
+        const SwsOp *op = &ops->ops[n];
+        switch (op->op) {
+        case SWS_OP_READ:
+            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac) /* reject packed reads of more than one element and any nonzero rw.frac */
+                return AVERROR(ENOTSUP);
+            for (int i = 0; i < op->rw.elems; i++)
+                p.index[i] = i; /* identity mapping: output plane i starts as input plane i */
+            break;
+
+        case SWS_OP_SWIZZLE: {
+            const MemcpyPriv orig = p; /* snapshot, so the permutation reads pre-swizzle entries */
+            for (int i = 0; i < 4; i++) {
+                /* Explicitly exclude swizzle masks that contain duplicates,
+                 * because these are wasteful to implement as a memcpy */
+                for (int j = 0; j < i; j++) {
+                    if (op->swizzle.in[i] == op->swizzle.in[j])
+                        return AVERROR(ENOTSUP);
+                }
+                p.index[i] = orig.index[op->swizzle.in[i]]; /* entries that were -1 (cleared) are carried along too */
+            }
+            break;
+        }
+
+        case SWS_OP_CLEAR:
+            for (int i = 0; i < 4; i++) {
+                if (!op->c.q4[i].den) /* zero denominator: component i is skipped, not cleared */
+                    continue;
+                if (op->c.q4[i].den != 1) /* only values with denominator 1 are accepted */
+                    return AVERROR(ENOTSUP);
+
+                /* Ensure all bytes to be cleared are the same, because we
+                 * can't memset on multi-byte sequences */
+                uint8_t val = op->c.q4[i].num & 0xFF; /* low byte of the clear value */
+                uint32_t ref = val;
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 2: ref *= 0x101; break; /* replicate the byte into 2 bytes */
+                case 4: ref *= 0x1010101; break; /* replicate the byte into 4 bytes */
+                }
+                if (ref != op->c.q4[i].num) /* value is not one repeated byte */
+                    return AVERROR(ENOTSUP);
+                p.clear_value[i] = val;
+                p.index[i] = -1; /* process() memsets planes whose index is negative */
+            }
+            break;
+
+        case SWS_OP_WRITE:
+            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac) /* same restriction as for SWS_OP_READ */
+                return AVERROR(ENOTSUP);
+            p.num_planes = op->rw.elems; /* number of output planes process() will fill */
+            break;
+
+        default: /* every other op type is unsupported by this backend */
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    *out = (SwsCompiledOp) {
+        .block_size = 1,
+        .func = process,
+        .priv = av_memdup(&p, sizeof(p)), /* heap copy of the plan; released through .free below */
+        .free = av_free,
+    };
+    return out->priv ? 0 : AVERROR(ENOMEM); /* a NULL priv is reported as an allocation failure */
+}
+
+const SwsOpBackend backend_murder = { /* backend descriptor for the memcpy/memset fast path */
+    .name    = "memcpy",
+    .compile = compile, /* returns AVERROR(ENOTSUP) for op lists this backend cannot express */
+};