diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
index 7aadc9d32e..f97b4303aa 100644
--- a/libswscale/ops_backend.h
+++ b/libswscale/ops_backend.h
@@ -102,17 +102,21 @@ typedef struct SwsOpIter {
                (pixel_t *) iter->out[2], (pixel_t *) iter->out[3], __VA_ARGS__)
 
 /* Helper macros to declare continuation functions */
-#define DECL_IMPL(NAME)                                                         \
-    static void fn(NAME)(SwsOpIter *restrict iter,                              \
-                         const SwsOpImpl *restrict impl,                        \
-                         block_t x, block_t y,                                  \
-                         block_t z, block_t w)
+#define DECL_IMPL(FUNC, NAME, ...)                                              \
+    static av_flatten void fn(NAME)(SwsOpIter *restrict iter,                   \
+                                    const SwsOpImpl *restrict impl,             \
+                                    void *restrict x, void *restrict y,         \
+                                    void *restrict z, void *restrict w)         \
+    {   /* the entire body is a single CALL() into FUNC */                      \
+        CALL(FUNC, __VA_ARGS__);                                                \
+    }
 
-/* Helper macro to call into the next continuation with a given type */
-#define CONTINUE(TYPE, ...)                                                     \
+/* Call impl->cont(iter, &impl[1], X, Y, Z, W); X..W convert to void * */
+#define CONTINUE(X, Y, Z, W)                                                    \
     ((void (*)(SwsOpIter *, const SwsOpImpl *,                                  \
-               TYPE x, TYPE y, TYPE z, TYPE w)) impl->cont)                     \
-        (iter, &impl[1], __VA_ARGS__)
+               void *restrict, void *restrict,                                  \
+               void *restrict, void *restrict)) impl->cont)                     \
+        (iter, &impl[1], (X), (Y), (Z), (W))
 
 /* Helper macros for common op setup code */
 #define DECL_SETUP(NAME, PARAMS, OUT)                                           \
@@ -139,10 +143,7 @@ static inline int ff_setup_memdup(const void *c, size_t size, SwsImplResult *out
     DECL_FUNC(NAME, const bool X, const bool Y, const bool Z, const bool W)
 
 #define WRAP_PATTERN(FUNC, X, Y, Z, W, ...)                                     \
-    DECL_IMPL(FUNC##_##X##Y##Z##W)                                              \
-    {                                                                           \
-        CALL(FUNC, X, Y, Z, W);                                                 \
-    }                                                                           \
+    DECL_IMPL(FUNC, FUNC##_##X##Y##Z##W, X, Y, Z, W)                            \
                                                                                 \
     DECL_ENTRY(FUNC##_##X##Y##Z##W,                                             \
         .unused = { !X, !Y, !Z, !W },                                           \
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
index 3817a437e5..0763a08b1c 100644
--- a/libswscale/ops_tmpl_common.c
+++ b/libswscale/ops_tmpl_common.c
@@ -41,7 +41,7 @@ DECL_PATTERN(convert_uint##N)
             wu[i] = w[i];                                                       \
     }                                                                           \
                                                                                 \
-    CONTINUE(u##N##block_t, xu, yu, zu, wu);                                    \
+    CONTINUE(xu, yu, zu, wu);                                                   \
 }                                                                               \
                                                                                 \
 WRAP_COMMON_PATTERNS(convert_uint##N,                                           \
@@ -75,14 +75,11 @@ DECL_PATTERN(clear)
             w[i] = impl->priv.px[3];
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 #define WRAP_CLEAR(X, Y, Z, W)                                                  \
-DECL_IMPL(clear##_##X##Y##Z##W)                                                 \
-{                                                                               \
-    CALL(clear, X, Y, Z, W);                                                    \
-}                                                                               \
+DECL_IMPL(clear, clear##_##X##Y##Z##W, X, Y, Z, W)                              \
                                                                                 \
 DECL_ENTRY(clear##_##X##Y##Z##W,                                                \
     .setup = ff_sws_setup_clear,                                                \
@@ -119,7 +116,7 @@ DECL_PATTERN(min)
             w[i] = FFMIN(w[i], impl->priv.px[3]);
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 DECL_PATTERN(max)
@@ -136,7 +133,7 @@ DECL_PATTERN(max)
             w[i] = FFMAX(w[i], impl->priv.px[3]);
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 WRAP_COMMON_PATTERNS(min,
@@ -167,7 +164,7 @@ DECL_PATTERN(scale)
             w[i] *= scale;
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 WRAP_COMMON_PATTERNS(scale,
@@ -239,7 +236,7 @@ DECL_READ(filter_v, const int elems)
     for (int i = 0; i < elems; i++)
         iter->in[i] += sizeof(block_t);
 
-    CONTINUE(f32block_t, xs, ys, zs, ws);
+    CONTINUE(xs, ys, zs, ws);
 }
 
 DECL_SETUP(setup_filter_h, params, out)
@@ -292,11 +289,14 @@ DECL_READ(filter_h, const int elems)
         weights += filter_size;
     }
 
-    CONTINUE(f32block_t, xs, ys, zs, ws);
+    CONTINUE(xs, ys, zs, ws);
 }
 
 #define WRAP_FILTER(FUNC, DIR, ELEMS, SUFFIX)                                   \
-DECL_IMPL(FUNC##ELEMS##SUFFIX)                                                  \
+static av_flatten void fn(FUNC##ELEMS##SUFFIX)(SwsOpIter *restrict iter,        \
+                                             const SwsOpImpl *restrict impl,    \
+                                             void *restrict x, void *restrict y,\
+                                             void *restrict z, void *restrict w)\
 {                                                                               \
     CALL_READ(FUNC##SUFFIX, ELEMS);                                             \
 }                                                                               \
@@ -337,7 +337,7 @@ static void fn(process)(const SwsOpExec *exec, const void *priv,
     for (iter->y = y_start; iter->y < y_end; iter->y++) {
         for (int block = bx_start; block < bx_end; block++) {
             iter->x = block * SWS_BLOCK_SIZE;
-            CONTINUE(block_t, (void *) x, (void *) y, (void *) z, (void *) w);
+            CONTINUE(x, y, z, w);
         }
 
         const int y_bump = exec->in_bump_y ? exec->in_bump_y[iter->y] : 0;
diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
index 0d00714ff4..687b08b99b 100644
--- a/libswscale/ops_tmpl_float.c
+++ b/libswscale/ops_tmpl_float.c
@@ -98,14 +98,11 @@ DECL_FUNC(dither, const int size_log2)
     DITHER_COMP(z, 2)
     DITHER_COMP(w, 3)
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 #define WRAP_DITHER(N)                                                          \
-DECL_IMPL(dither##N)                                                            \
-{                                                                               \
-    CALL(dither, N);                                                            \
-}                                                                               \
+DECL_IMPL(dither, dither##N, N)                                                 \
                                                                                 \
 DECL_ENTRY(dither##N,                                                           \
     .op = SWS_OP_DITHER,                                                        \
@@ -185,14 +182,11 @@ DECL_FUNC(linear_mask, const uint32_t mask)
         w[i] += (mask & SWS_MASK(3, 3))  ? c.m[3][3] * ww : ww;
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 #define WRAP_LINEAR(NAME, MASK)                                                 \
-DECL_IMPL(linear_##NAME)                                                        \
-{                                                                               \
-    CALL(linear_mask, MASK);                                                    \
-}                                                                               \
+DECL_IMPL(linear_mask, linear_##NAME, MASK)                                     \
                                                                                 \
 DECL_ENTRY(linear_##NAME,                                                       \
     .op    = SWS_OP_LINEAR,                                                     \
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
index fea0430799..baa3526029 100644
--- a/libswscale/ops_tmpl_int.c
+++ b/libswscale/ops_tmpl_int.c
@@ -71,7 +71,7 @@ DECL_READ(read_planar, const int elems)
             w[i] = in3[i];
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 DECL_READ(read_packed, const int elems)
@@ -87,7 +87,7 @@ DECL_READ(read_packed, const int elems)
             w[i] = in0[elems * i + 3];
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 DECL_WRITE(write_planar, const int elems)
@@ -119,7 +119,10 @@ DECL_WRITE(write_packed, const int elems)
 }
 
 #define WRAP_READ(FUNC, ELEMS, FRAC, PACKED)                                    \
-DECL_IMPL(FUNC##ELEMS)                                                          \
+static av_flatten void fn(FUNC##ELEMS)(SwsOpIter *restrict iter,                \
+                                       const SwsOpImpl *restrict impl,          \
+                                       void *restrict x, void *restrict y,      \
+                                       void *restrict z, void *restrict w)      \
 {                                                                               \
     CALL_READ(FUNC, ELEMS);                                                     \
     for (int i = 0; i < (PACKED ? 1 : ELEMS); i++)                              \
@@ -144,7 +147,10 @@ WRAP_READ(read_packed, 3, 0, true)
 WRAP_READ(read_packed, 4, 0, true)
 
 #define WRAP_WRITE(FUNC, ELEMS, FRAC, PACKED)                                   \
-DECL_IMPL(FUNC##ELEMS)                                                          \
+static av_flatten void fn(FUNC##ELEMS)(SwsOpIter *restrict iter,                \
+                                       const SwsOpImpl *restrict impl,          \
+                                       void *restrict x, void *restrict y,      \
+                                       void *restrict z, void *restrict w)      \
 {                                                                               \
     CALL_WRITE(FUNC, ELEMS);                                                    \
     for (int i = 0; i < (PACKED ? 1 : ELEMS); i++)                              \
@@ -178,7 +184,7 @@ DECL_READ(read_nibbles, const int elems)
         x[i + 1] = val & 0xF; /* low nibble */
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 DECL_READ(read_bits, const int elems)
@@ -196,7 +202,7 @@ DECL_READ(read_bits, const int elems)
         x[i + 7] = (val >> 0) & 1;
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 WRAP_READ(read_nibbles, 1, 1, false)
@@ -243,7 +249,7 @@ DECL_PATTERN(swap_bytes)
             w[i] = SWAP_BYTES(w[i]);
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 WRAP_COMMON_PATTERNS(swap_bytes, .op = SWS_OP_SWAP_BYTES);
@@ -266,7 +272,7 @@ DECL_PATTERN(expand16)
             w16[i] = w[i] << 8 | w[i];
     }
 
-    CONTINUE(u16block_t, x16, y16, z16, w16);
+    CONTINUE(x16, y16, z16, w16);
 }
 
 WRAP_COMMON_PATTERNS(expand16,
@@ -287,7 +293,7 @@ DECL_PATTERN(expand32)
         w32[i] = (uint32_t)w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
     }
 
-    CONTINUE(u32block_t, x32, y32, z32, w32);
+    CONTINUE(x32, y32, z32, w32);
 }
 
 WRAP_COMMON_PATTERNS(expand32,
@@ -297,44 +303,48 @@ WRAP_COMMON_PATTERNS(expand32,
 );
 #endif
 
+DECL_FUNC(pack, const int bits0, const int bits1, const int bits2, const int bits3)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] = x[i] << (bits1 + bits2 + bits3); /* bits0 is never read */
+        if (bits1) /* a zero width skips that component */
+            x[i] |= y[i] << (bits2 + bits3);
+        if (bits2)
+            x[i] |= z[i] << bits3;
+        if (bits3)
+            x[i] |= w[i];
+    }
+    /* x holds the combined value; y, z and w are forwarded unmodified */
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(unpack, const int bits0, const int bits1, const int bits2, const int bits3)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t val = x[i];
+        x[i] = val >> (bits1 + bits2 + bits3); /* not masked by bits0 */
+        if (bits1) /* a zero width leaves that output untouched */
+            y[i] = (val >> (bits2 + bits3)) & ((1 << bits1) - 1);
+        if (bits2)
+            z[i] = (val >> bits3) & ((1 << bits2) - 1);
+        if (bits3)
+            w[i] = val & ((1 << bits3) - 1);
+    }
+    /* fields are split out of the old x, highest bits first (x, y, z, w) */
+    CONTINUE(x, y, z, w);
+}
+
 #define WRAP_PACK_UNPACK(X, Y, Z, W)                                            \
-inline DECL_IMPL(pack_##X##Y##Z##W)                                             \
-{                                                                               \
-    SWS_LOOP                                                                    \
-    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                  \
-        x[i] = x[i] << (Y+Z+W);                                                 \
-        if (Y)                                                                  \
-            x[i] |= y[i] << (Z+W);                                              \
-        if (Z)                                                                  \
-            x[i] |= z[i] << W;                                                  \
-        if (W)                                                                  \
-            x[i] |= w[i];                                                       \
-    }                                                                           \
-                                                                                \
-    CONTINUE(block_t, x, y, z, w);                                              \
-}                                                                               \
+DECL_IMPL(pack, pack_##X##Y##Z##W, X, Y, Z, W)                                  \
                                                                                 \
 DECL_ENTRY(pack_##X##Y##Z##W,                                                   \
     .op = SWS_OP_PACK,                                                          \
     .pack.pattern = { X, Y, Z, W },                                             \
 );                                                                              \
                                                                                 \
-inline DECL_IMPL(unpack_##X##Y##Z##W)                                           \
-{                                                                               \
-    SWS_LOOP                                                                    \
-    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                  \
-        const pixel_t val = x[i];                                               \
-        x[i] = val >> (Y+Z+W);                                                  \
-        if (Y)                                                                  \
-            y[i] = (val >> (Z+W)) & ((1 << Y) - 1);                             \
-        if (Z)                                                                  \
-            z[i] = (val >> W) & ((1 << Z) - 1);                                 \
-        if (W)                                                                  \
-            w[i] = val & ((1 << W) - 1);                                        \
-    }                                                                           \
-                                                                                \
-    CONTINUE(block_t, x, y, z, w);                                              \
-}                                                                               \
+DECL_IMPL(unpack, unpack_##X##Y##Z##W, X, Y, Z, W)                              \
                                                                                 \
 DECL_ENTRY(unpack_##X##Y##Z##W,                                                 \
     .op = SWS_OP_UNPACK,                                                        \
@@ -363,7 +373,7 @@ DECL_PATTERN(lshift)
         w[i] <<= amount;
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 DECL_PATTERN(rshift)
@@ -378,7 +388,7 @@ DECL_PATTERN(rshift)
         w[i] >>= amount;
     }
 
-    CONTINUE(block_t, x, y, z, w);
+    CONTINUE(x, y, z, w);
 }
 
 WRAP_COMMON_PATTERNS(lshift,
@@ -406,7 +416,7 @@ DECL_PATTERN(convert_float)
         wf[i] = w[i];
     }
 
-    CONTINUE(f32block_t, xf, yf, zf, wf);
+    CONTINUE(xf, yf, zf, wf);
 }
 
 WRAP_COMMON_PATTERNS(convert_float,
@@ -422,9 +432,10 @@ WRAP_COMMON_PATTERNS(convert_float,
 static void                                                                     \
 fn(swizzle_##X##Y##Z##W)(SwsOpIter *restrict iter,                              \
                          const SwsOpImpl *restrict impl,                        \
-                         block_t c0, block_t c1, block_t c2, block_t c3)        \
+                         void *restrict c0, void *restrict c1,                  \
+                         void *restrict c2, void *restrict c3)                  \
 {                                                                               \
-    CONTINUE(block_t, c##X, c##Y, c##Z, c##W);                                  \
+    CONTINUE(c##X, c##Y, c##Z, c##W);                                           \
 }                                                                               \
                                                                                 \
 DECL_ENTRY(swizzle_##X##Y##Z##W,                                                \
@@ -453,18 +464,18 @@ DECL_SWIZZLE(0, 3, 2, 1)
 
 /* Broadcast luma -> rgb (only used for y(a) -> rgb(a)) */
 #define DECL_EXPAND_LUMA(X, W, T0, T1)                                          \
-static void                                                                     \
-fn(expand_luma_##X##W)(SwsOpIter *restrict iter,                                \
-                       const SwsOpImpl *restrict impl,                          \
-                       block_t c0, block_t c1,  block_t c2, block_t c3)         \
+DECL_FUNC(expand_luma_##X##W##_impl,                                            \
+          block_t c0, block_t c1, block_t c2, block_t c3)                       \
 {                                                                               \
     SWS_LOOP                                                                    \
     for (int i = 0; i < SWS_BLOCK_SIZE; i++)                                    \
         T0[i] = T1[i] = c0[i];                                                  \
                                                                                 \
-    CONTINUE(block_t, c##X, T0, T1, c##W);                                      \
+    CONTINUE(c##X, T0, T1, c##W);                                               \
 }                                                                               \
                                                                                 \
+DECL_IMPL(expand_luma_##X##W##_impl, expand_luma_##X##W, x, y, z, w)            \
+                                                                                \
 DECL_ENTRY(expand_luma_##X##W,                                                  \
     .op = SWS_OP_SWIZZLE,                                                       \
     .swizzle.in = { X, 0, 0, W },                                               \