swscale/ops_dispatch: compute input x offset map for SwsOpExec

This is cheap to precompute and can be used as-is for gather-style horizontal
filter implementations.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
This commit is contained in:
Niklas Haas
2026-03-26 23:53:19 +01:00
parent dc88946d7b
commit e3daeff965
3 changed files with 62 additions and 6 deletions

View File

@@ -44,6 +44,7 @@ typedef struct SwsOpPass {
int idx_in[4];
int idx_out[4];
int *offsets_y;
int filter_size;
bool memcpy_first;
bool memcpy_last;
bool memcpy_out;
@@ -117,6 +118,7 @@ static void op_pass_free(void *ptr)
ff_sws_compiled_op_unref(&p->comp);
av_refstruct_unref(&p->offsets_y);
av_free(p->exec_base.in_bump_y);
av_free(p->exec_base.in_offset_x);
av_free(p->tail_buf);
av_free(p);
}
@@ -202,11 +204,18 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
const int safe_width = (num_blocks - 1) * block_size;
const int tail_size = pass->width - safe_width;
p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
p->tail_off_out = safe_width * p->pixel_bits_out >> 3;
p->tail_size_in = (tail_size * p->pixel_bits_in + 7) >> 3;
p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;
if (exec->in_offset_x) {
p->tail_off_in = exec->in_offset_x[safe_width];
p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
p->tail_size_in += (p->filter_size * p->pixel_bits_in + 7) >> 3;
} else {
p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
p->tail_size_in = (tail_size * p->pixel_bits_in + 7) >> 3;
}
for (int i = 0; memcpy_in && i < p->planes_in; i++) {
size_t block_size = (comp->block_size * p->pixel_bits_in + 7) >> 3;
block_size += comp->over_read;
@@ -225,6 +234,14 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
alloc_size += tail->out_stride[i] * out->height;
}
if (memcpy_in && exec->in_offset_x) {
/* `in_offset_x` is indexed relative to the line start, not the start
* of the section being processed; so we need to over-allocate this
* array to the full width of the image, even though we will only
* partially fill in the offsets relevant to the tail region */
alloc_size += aligned_w * sizeof(*exec->in_offset_x);
}
uint8_t *tail_buf = av_fast_realloc(p->tail_buf, &p->tail_buf_size, alloc_size);
if (!tail_buf)
return AVERROR(ENOMEM);
@@ -240,6 +257,12 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
tail_buf += tail->out_stride[i] * out->height;
}
if (memcpy_in && exec->in_offset_x) {
tail->in_offset_x = (int32_t *) tail_buf;
for (int i = safe_width; i < aligned_w; i++)
tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
}
return 0;
}
@@ -308,7 +331,9 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
tail.slice_h = h;
for (int i = 0; i < p->planes_in; i++) {
exec.in[i] += p->tail_off_in;
/* Input offsets are relative to the base pointer */
if (!exec.in_offset_x || memcpy_in)
exec.in[i] += p->tail_off_in;
tail.in[i] += y * tail.in_stride[i];
}
for (int i = 0; i < p->planes_out; i++) {
@@ -397,8 +422,8 @@ static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
}
const SwsFilterWeights *filter = read->rw.kernel;
if (read->rw.filter == SWS_OP_FILTER_V) {
const SwsFilterWeights *filter = read->rw.kernel;
p->offsets_y = av_refstruct_ref(filter->offsets);
/* Compute relative pointer bumps for each output line */
@@ -416,6 +441,22 @@ static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
}
bump[filter->dst_size - 1] = 0;
p->exec_base.in_bump_y = bump;
} else if (read->rw.filter == SWS_OP_FILTER_H) {
/* Compute the input byte offset for each output pixel (x coordinate) */
const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
if (!offset) {
ret = AVERROR(ENOMEM);
goto fail;
}
for (int x = 0; x < filter->dst_size; x++)
offset[x] = filter->offsets[x] * p->pixel_bits_in >> 3;
for (int x = filter->dst_size; x < pixels; x++)
offset[x] = offset[filter->dst_size - 1];
p->exec_base.in_offset_x = offset;
p->exec_base.block_size_in = 0; /* ptr does not advance */
p->filter_size = filter->filter_size;
}
return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,

View File

@@ -41,7 +41,13 @@ typedef struct SwsOpExec {
ptrdiff_t in_stride[4];
ptrdiff_t out_stride[4];
/* Pointer bump, difference between stride and processed line size */
/**
* Pointer bump, difference between stride and processed line size.
*
* Assumes that each read kernel increments pointers by the processed
* block size, except when using horizontal filtering, in which case
* this is always equal to the input stride.
*/
ptrdiff_t in_bump[4];
ptrdiff_t out_bump[4];
@@ -64,12 +70,20 @@ typedef struct SwsOpExec {
* multiplied by the corresponding line stride.
*/
int32_t *in_bump_y;
/**
* Pixel offset map for horizontal scaling: the input offset, in bytes, for
* each output pixel. Indexed by the x coordinate of the output pixel. The
* array length is always padded up to a multiple of the block size, so
* implementations may safely over-read entries up to the next block boundary.
*/
int32_t *in_offset_x;
} SwsOpExec;
static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) +
6 * sizeof(int32_t) +
16 * sizeof(uint8_t) +
1 * sizeof(void *),
2 * sizeof(void *),
"SwsOpExec layout mismatch");
/**

View File

@@ -142,6 +142,7 @@ struc SwsOpExec
.in_sub_x4 resb 4
.out_sub_x4 resb 4
.in_bump_y resq 1
.in_offset_x resq 1
endstruc
struc SwsOpImpl