swscale/ops_backend: allocate block storage up-front

Allocate the block storage once in process() instead of in each read()
function. Not only is this slightly faster, due to promoting more tail calls,
but it also allows us to have operation chains that don't start with a read.

Also simplifies the implementations.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
This commit is contained in:
Niklas Haas
2026-01-13 17:20:55 +01:00
committed by Niklas Haas
parent 7961e3a48f
commit e729f49645
3 changed files with 16 additions and 31 deletions

View File

@@ -53,18 +53,20 @@ static void process(const SwsOpExec *exec, const void *priv,
{
const SwsOpChain *chain = priv;
const SwsOpImpl *impl = chain->impl;
SwsOpIter iter;
u32block_t x, y, z, w; /* allocate enough space for any intermediate */
for (iter.y = y_start; iter.y < y_end; iter.y++) {
SwsOpIter iterdata;
SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
for (iter->y = y_start; iter->y < y_end; iter->y++) {
for (int i = 0; i < 4; i++) {
iter.in[i] = exec->in[i] + (iter.y - y_start) * exec->in_stride[i];
iter.out[i] = exec->out[i] + (iter.y - y_start) * exec->out_stride[i];
iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i];
iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
}
for (int block = bx_start; block < bx_end; block++) {
iter.x = block * SWS_BLOCK_SIZE;
((void (*)(SwsOpIter *, const SwsOpImpl *)) impl->cont)
(&iter, &impl[1]);
iter->x = block * SWS_BLOCK_SIZE;
CONTINUE(u32block_t, x, y, z, w);
}
}
}

View File

@@ -78,13 +78,9 @@ typedef struct SwsOpIter {
__VA_ARGS__)
#define DECL_READ(NAME, ...) \
static av_always_inline void fn(NAME)(SwsOpIter *restrict iter, \
const SwsOpImpl *restrict impl, \
const pixel_t *restrict in0, \
const pixel_t *restrict in1, \
const pixel_t *restrict in2, \
const pixel_t *restrict in3, \
__VA_ARGS__)
DECL_FUNC(NAME, const pixel_t *restrict in0, const pixel_t *restrict in1, \
const pixel_t *restrict in2, const pixel_t *restrict in3, \
__VA_ARGS__)
#define DECL_WRITE(NAME, ...) \
DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1, \
@@ -96,10 +92,9 @@ typedef struct SwsOpIter {
fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)
#define CALL_READ(FUNC, ...) \
fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0], \
(const pixel_t *) iter->in[1], \
(const pixel_t *) iter->in[2], \
(const pixel_t *) iter->in[3], __VA_ARGS__)
CALL(FUNC, (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1], \
(const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3], \
__VA_ARGS__)
#define CALL_WRITE(FUNC, ...) \
CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1], \
@@ -112,10 +107,6 @@ typedef struct SwsOpIter {
block_t x, block_t y, \
block_t z, block_t w)
#define DECL_IMPL_READ(NAME) \
static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter, \
const SwsOpImpl *restrict impl)
/* Helper macro to call into the next continuation with a given type */
#define CONTINUE(TYPE, ...) \
((void (*)(SwsOpIter *, const SwsOpImpl *, \

View File

@@ -58,8 +58,6 @@
DECL_READ(read_planar, const int elems)
{
block_t x, y, z, w;
SWS_LOOP
for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
x[i] = in0[i];
@@ -76,8 +74,6 @@ DECL_READ(read_planar, const int elems)
DECL_READ(read_packed, const int elems)
{
block_t x, y, z, w;
SWS_LOOP
for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
x[i] = in0[elems * i + 0];
@@ -121,7 +117,7 @@ DECL_WRITE(write_packed, const int elems)
}
#define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) \
DECL_IMPL_READ(FUNC##ELEMS) \
DECL_IMPL(FUNC##ELEMS) \
{ \
CALL_READ(FUNC, ELEMS); \
for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \
@@ -173,8 +169,6 @@ WRAP_WRITE(write_packed, 4, 0, true)
#if BIT_DEPTH == 8
DECL_READ(read_nibbles, const int elems)
{
block_t x, y, z, w;
SWS_LOOP
for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
const pixel_t val = ((const pixel_t *) in0)[i >> 1];
@@ -187,8 +181,6 @@ DECL_READ(read_nibbles, const int elems)
DECL_READ(read_bits, const int elems)
{
block_t x, y, z, w;
SWS_LOOP
for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
const pixel_t val = ((const pixel_t *) in0)[i >> 3];