From e729f49645b6f60e1f45efffee14ff8645a050f7 Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Tue, 13 Jan 2026 17:20:55 +0100 Subject: [PATCH] swscale/ops_backend: allocate block storage up-front Instead of in each read() function. Not only is this slightly faster, due to promoting more tail calls, but it also allows us to have operation chains that don't start with a read. Also simplifies the implementations. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas --- libswscale/ops_backend.c | 16 +++++++++------- libswscale/ops_backend.h | 21 ++++++--------------- libswscale/ops_tmpl_int.c | 10 +--------- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c index 248a591fd2..a503139016 100644 --- a/libswscale/ops_backend.c +++ b/libswscale/ops_backend.c @@ -53,18 +53,20 @@ static void process(const SwsOpExec *exec, const void *priv, { const SwsOpChain *chain = priv; const SwsOpImpl *impl = chain->impl; - SwsOpIter iter; + u32block_t x, y, z, w; /* allocate enough space for any intermediate */ - for (iter.y = y_start; iter.y < y_end; iter.y++) { + SwsOpIter iterdata; + SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */ + + for (iter->y = y_start; iter->y < y_end; iter->y++) { for (int i = 0; i < 4; i++) { - iter.in[i] = exec->in[i] + (iter.y - y_start) * exec->in_stride[i]; - iter.out[i] = exec->out[i] + (iter.y - y_start) * exec->out_stride[i]; + iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i]; + iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i]; } for (int block = bx_start; block < bx_end; block++) { - iter.x = block * SWS_BLOCK_SIZE; - ((void (*)(SwsOpIter *, const SwsOpImpl *)) impl->cont) - (&iter, &impl[1]); + iter->x = block * SWS_BLOCK_SIZE; + CONTINUE(u32block_t, x, y, z, w); } } } diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h index 4a1794af8a..b1616f6b02 100644 --- a/libswscale/ops_backend.h +++ b/libswscale/ops_backend.h @@ -78,13 +78,9 @@ typedef struct SwsOpIter { __VA_ARGS__) #define DECL_READ(NAME, ...) \ - static av_always_inline void fn(NAME)(SwsOpIter *restrict iter, \ - const SwsOpImpl *restrict impl, \ - const pixel_t *restrict in0, \ - const pixel_t *restrict in1, \ - const pixel_t *restrict in2, \ - const pixel_t *restrict in3, \ - __VA_ARGS__) + DECL_FUNC(NAME, const pixel_t *restrict in0, const pixel_t *restrict in1, \ + const pixel_t *restrict in2, const pixel_t *restrict in3, \ + __VA_ARGS__) #define DECL_WRITE(NAME, ...) \ DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1, \ @@ -96,10 +92,9 @@ typedef struct SwsOpIter { fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__) #define CALL_READ(FUNC, ...) \ - fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0], \ - (const pixel_t *) iter->in[1], \ - (const pixel_t *) iter->in[2], \ - (const pixel_t *) iter->in[3], __VA_ARGS__) + CALL(FUNC, (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1], \ + (const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3], \ + __VA_ARGS__) #define CALL_WRITE(FUNC, ...) \ CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1], \ @@ -112,10 +107,6 @@ typedef struct SwsOpIter { block_t x, block_t y, \ block_t z, block_t w) -#define DECL_IMPL_READ(NAME) \ - static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter, \ - const SwsOpImpl *restrict impl) - /* Helper macro to call into the next continuation with a given type */ #define CONTINUE(TYPE, ...) \ ((void (*)(SwsOpIter *, const SwsOpImpl *, \ diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c index 84596e2763..d9870faf34 100644 --- a/libswscale/ops_tmpl_int.c +++ b/libswscale/ops_tmpl_int.c @@ -58,8 +58,6 @@ DECL_READ(read_planar, const int elems) { - block_t x, y, z, w; - SWS_LOOP for (int i = 0; i < SWS_BLOCK_SIZE; i++) { x[i] = in0[i]; @@ -76,8 +74,6 @@ DECL_READ(read_planar, const int elems) DECL_READ(read_packed, const int elems) { - block_t x, y, z, w; - SWS_LOOP for (int i = 0; i < SWS_BLOCK_SIZE; i++) { x[i] = in0[elems * i + 0]; @@ -121,7 +117,7 @@ DECL_WRITE(write_packed, const int elems) } #define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) \ -DECL_IMPL_READ(FUNC##ELEMS) \ +DECL_IMPL(FUNC##ELEMS) \ { \ CALL_READ(FUNC, ELEMS); \ for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \ @@ -173,8 +169,6 @@ WRAP_WRITE(write_packed, 4, 0, true) #if BIT_DEPTH == 8 DECL_READ(read_nibbles, const int elems) { - block_t x, y, z, w; - SWS_LOOP for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) { const pixel_t val = ((const pixel_t *) in0)[i >> 1]; @@ -187,8 +181,6 @@ DECL_READ(read_nibbles, const int elems) DECL_READ(read_bits, const int elems) { - block_t x, y, z, w; - SWS_LOOP for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) { const pixel_t val = ((const pixel_t *) in0)[i >> 3];