swscale/ops_backend: allocate block storage up-front

Allocate the block storage once in process() instead of in each read() function. Not only is this slightly faster, because it promotes more tail calls, but it also allows us to have operation chains that don't start with a read. It also simplifies the implementations.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2026-04-20 21:00:41 +08:00 · 2026-01-13 17:20:55 +01:00
parent 7961e3a48f
commit e729f49645
3 changed files with 16 additions and 31 deletions
--- a/libswscale/ops_backend.c
+++ b/libswscale/ops_backend.c
@@ -53,18 +53,20 @@ static void process(const SwsOpExec *exec, const void *priv,
 {
    const SwsOpChain *chain = priv;
    const SwsOpImpl *impl = chain->impl;
-    SwsOpIter iter;
+    u32block_t x, y, z, w; /* allocate enough space for any intermediate */

-    for (iter.y = y_start; iter.y < y_end; iter.y++) {
+    SwsOpIter iterdata;
+    SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
+
+    for (iter->y = y_start; iter->y < y_end; iter->y++) {
        for (int i = 0; i < 4; i++) {
-            iter.in[i]  = exec->in[i]  + (iter.y - y_start) * exec->in_stride[i];
-            iter.out[i] = exec->out[i] + (iter.y - y_start) * exec->out_stride[i];
+            iter->in[i]  = exec->in[i]  + (iter->y - y_start) * exec->in_stride[i];
+            iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
        }

        for (int block = bx_start; block < bx_end; block++) {
-            iter.x = block * SWS_BLOCK_SIZE;
-            ((void (*)(SwsOpIter *, const SwsOpImpl *)) impl->cont)
-                (&iter, &impl[1]);
+            iter->x = block * SWS_BLOCK_SIZE;
+            CONTINUE(u32block_t, x, y, z, w);
        }
    }
 }
--- a/libswscale/ops_backend.h
+++ b/libswscale/ops_backend.h
@@ -78,13 +78,9 @@ typedef struct SwsOpIter {
                                          __VA_ARGS__)

 #define DECL_READ(NAME, ...)                                                    \
-    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,             \
-                                          const SwsOpImpl *restrict impl,       \
-                                          const pixel_t *restrict in0,          \
-                                          const pixel_t *restrict in1,          \
-                                          const pixel_t *restrict in2,          \
-                                          const pixel_t *restrict in3,          \
-                                          __VA_ARGS__)
+    DECL_FUNC(NAME, const pixel_t *restrict in0, const pixel_t *restrict in1,   \
+                    const pixel_t *restrict in2, const pixel_t *restrict in3,   \
+                    __VA_ARGS__)

 #define DECL_WRITE(NAME, ...)                                                   \
    DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1,             \
@@ -96,10 +92,9 @@ typedef struct SwsOpIter {
    fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)

 #define CALL_READ(FUNC, ...)                                                    \
-    fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0],                         \
-                         (const pixel_t *) iter->in[1],                         \
-                         (const pixel_t *) iter->in[2],                         \
-                         (const pixel_t *) iter->in[3], __VA_ARGS__)
+    CALL(FUNC, (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1],    \
+               (const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3],    \
+               __VA_ARGS__)

 #define CALL_WRITE(FUNC, ...)                                                   \
    CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1],              \
@@ -112,10 +107,6 @@ typedef struct SwsOpIter {
                                  block_t x, block_t y,                         \
                                  block_t z, block_t w)

-#define DECL_IMPL_READ(NAME)                                                    \
-    static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter,                     \
-                                  const SwsOpImpl *restrict impl)
-
 /* Helper macro to call into the next continuation with a given type */
 #define CONTINUE(TYPE, ...)                                                     \
    ((void (*)(SwsOpIter *, const SwsOpImpl *,                                  \
--- a/libswscale/ops_tmpl_int.c
+++ b/libswscale/ops_tmpl_int.c
@@ -58,8 +58,6 @@

 DECL_READ(read_planar, const int elems)
 {
-    block_t x, y, z, w;
-
    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        x[i] = in0[i];
@@ -76,8 +74,6 @@ DECL_READ(read_planar, const int elems)

 DECL_READ(read_packed, const int elems)
 {
-    block_t x, y, z, w;
-
    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        x[i] = in0[elems * i + 0];
@@ -121,7 +117,7 @@ DECL_WRITE(write_packed, const int elems)
 }

 #define WRAP_READ(FUNC, ELEMS, FRAC, PACKED)                                    \
-DECL_IMPL_READ(FUNC##ELEMS)                                                     \
+DECL_IMPL(FUNC##ELEMS)                                                          \
 {                                                                               \
    CALL_READ(FUNC, ELEMS);                                                     \
    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++)                              \
@@ -173,8 +169,6 @@ WRAP_WRITE(write_packed, 4, 0, true)
 #if BIT_DEPTH == 8
 DECL_READ(read_nibbles, const int elems)
 {
-    block_t x, y, z, w;
-
    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
        const pixel_t val = ((const pixel_t *) in0)[i >> 1];
@@ -187,8 +181,6 @@ DECL_READ(read_nibbles, const int elems)

 DECL_READ(read_bits, const int elems)
 {
-    block_t x, y, z, w;
-
    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
        const pixel_t val = ((const pixel_t *) in0)[i >> 3];